wgit 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,30 +2,46 @@ require_relative '../utils'
2
2
 
3
3
  module Wgit
4
4
 
5
- # @author Michael Telford
6
- # Module containing the DB data model structure.
5
+ # Module containing the database (DB) data model structure.
7
6
  module Model
8
- def self.url(url)
9
- raise "url must respond to to_h" unless url.respond_to?(:to_h)
10
- url.to_h
11
- end
12
-
13
- def self.document(doc)
14
- raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
15
- doc.to_h(false)
16
- end
17
-
18
- def self.common_insert_data
19
- {
20
- :date_added => Wgit::Utils.time_stamp,
21
- :date_modified => Wgit::Utils.time_stamp,
22
- }
23
- end
24
-
25
- def self.common_update_data
26
- {
27
- :date_modified => Wgit::Utils.time_stamp,
28
- }
29
- end
7
+
8
+ # The data model for a Wgit::Url.
9
+ #
10
+ # @param url [Wgit::Url] The URL DB record.
11
+ # @return [Hash] The URL model ready for DB insertion.
12
+ def self.url(url)
13
+ raise "url must respond_to? to_h" unless url.respond_to?(:to_h)
14
+ model = url.to_h
15
+ Wgit::Utils.remove_non_bson_types(model)
16
+ end
17
+
18
+ # The data model for a Wgit::Document.
19
+ #
20
+ # @param doc [Wgit::Document] The Document DB record.
21
+ # @return [Hash] The Document model ready for DB insertion.
22
+ def self.document(doc)
23
+ raise "doc must respond_to? to_h" unless doc.respond_to?(:to_h)
24
+ model = doc.to_h(false)
25
+ Wgit::Utils.remove_non_bson_types(model)
26
+ end
27
+
28
+ # Default fields when inserting a record into the DB.
29
+ #
30
+ # @return [Hash] Containing common insertion fields for all models.
31
+ def self.common_insert_data
32
+ {
33
+ date_added: Wgit::Utils.time_stamp,
34
+ date_modified: Wgit::Utils.time_stamp,
35
+ }
36
+ end
37
+
38
+ # Default fields when updating a record in the DB.
39
+ #
40
+ # @return [Hash] Containing common update fields for all models.
41
+ def self.common_update_data
42
+ {
43
+ date_modified: Wgit::Utils.time_stamp,
44
+ }
45
+ end
30
46
  end
31
47
  end
@@ -1,27 +1,48 @@
1
-
2
- # @author Michael Telford
3
1
  module Wgit
4
- DB_PROVIDER = :MongoLabs.freeze
2
+ # The connection details for the database. This must be set if you want to
3
+ # store and access webpages in a database. Don't set the constant directly,
4
+ # instead use the funcs contained within the Wgit module.
5
+ CONNECTION_DETAILS = {}
6
+
7
+ # Set the database's connection details from the given hash and freeze them.
8
+ # It is your responsibility to ensure the correct hash vars are present and
9
+ # set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
10
+ # to be called only once.
11
+ #
12
+ # @param hash [Hash] Containing the database connection details to use.
13
+ # The hash should contain the following keys (of type String):
14
+ # host, port, uname, pword, db
15
+ # @raise [KeyError, FrozenError] If any of the required connection
16
+ # details are missing or if the connection details have already been set.
17
+ # @return [Hash] Containing the database connection details from hash.
18
+ def self.set_connection_details(hash)
19
+ CONNECTION_DETAILS[:host] = hash.fetch('host')
20
+ CONNECTION_DETAILS[:port] = hash.fetch('port')
21
+ CONNECTION_DETAILS[:uname] = hash.fetch('uname')
22
+ CONNECTION_DETAILS[:pword] = hash.fetch('pword')
23
+ CONNECTION_DETAILS[:db] = hash.fetch('db')
24
+
25
+ CONNECTION_DETAILS.freeze
26
+ end
27
+
28
+ # Set the database's connection details from the ENV and freeze them. It is
29
+ # your responsibility to ensure the correct ENV vars are present and set.
30
+ # Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
31
+ # called only once.
32
+ #
33
+ # The ENV should contain the following keys (of type String):
34
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
35
+ #
36
+ # @raise [KeyError, FrozenError] If any of the required connection
37
+ # details are missing or if the connection details have already been set.
38
+ # @return [Hash] Containing the database connection details from the ENV.
39
+ def self.set_connection_details_from_env
40
+ CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
41
+ CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
42
+ CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
43
+ CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
44
+ CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
5
45
 
6
- # OpenShift (MongoDB 2.4)
7
- if DB_PROVIDER == :OpenShift
8
- CONNECTION_DETAILS = {
9
- :host => "127.0.0.1",
10
- :port => "27017",
11
- :db => "admin",
12
- :uname => "admin",
13
- :pword => "R5jUKv1fessb"
14
- }.freeze
15
- # MongoLabs (MongoDB 3.0)
16
- elsif DB_PROVIDER == :MongoLabs
17
- CONNECTION_DETAILS = {
18
- :host => "ds037205.mongolab.com",
19
- :port => "37205",
20
- :db => "crawler",
21
- :uname => "rubyapp",
22
- :pword => "R5jUKv1fessb",
23
- }.freeze
24
- else
25
- raise "Database provider '#{DB_PROVIDER}' is not recognized"
46
+ CONNECTION_DETAILS.freeze
26
47
  end
27
48
  end
@@ -2,288 +2,589 @@ require_relative 'url'
2
2
  require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
+ require 'json'
5
6
 
6
7
  module Wgit
7
8
 
8
- # @author Michael Telford
9
- # Class modeling a HTML web document. Also doubles as a search result.
9
+ # Class modeling a HTML web document. Also doubles as a search result when
10
+ # loading Documents from the database.
11
+ #
12
+ # The initialize method dynamically initializes certain variables from the
13
+ # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
+ # Document class can be easily extended allowing you to pull out the bits of
15
+ # a webpage that are important to you. See Wgit::Document.define_extension.
10
16
  class Document
11
17
  include Assertable
12
-
13
- TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
14
- :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
15
18
 
16
- attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
17
-
18
- def initialize(url_or_doc, html = nil)
19
- if (url_or_doc.is_a?(String))
20
- assert_type(url_or_doc, Url)
21
- html ||= ""
19
+ # The HTML elements that make up the visible text on a page.
20
+ # These elements are used to initialize the @text of the Document.
21
+ # See the README.md for how to add to this Array dynamically.
22
+ @@text_elements = [
23
+ :dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
24
+ :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
+ ]
26
+
27
+ # The URL of the webpage, an instance of Wgit::Url.
28
+ attr_reader :url
29
+
30
+ # The HTML of the webpage, an instance of String.
31
+ attr_reader :html
32
+
33
+ # The Nokogiri document object initialized from @html.
34
+ attr_reader :doc
35
+
36
+ # The score is only used following a Database#search and records matches.
37
+ attr_reader :score
38
+
39
+ # Initialize takes either two strings (representing the URL and HTML) or an
40
+ # object representing a database record (of a HTTP crawled web page). This
41
+ # allows for initialisation from both crawled web pages and (afterwards)
42
+ # documents/web pages retrieved from the database.
43
+ #
44
+ # During initialisation, the Document will call any
45
+ # 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
46
+ # default init_* methods exist while others can be defined by the user.
47
+ # See the README and Wgit::Document.define_extension for more info.
48
+ #
49
+ # @param url_or_obj [String, Object#fetch] Either a String representing a
50
+ # URL or a Hash-like object responding to :fetch. e.g. a MongoDB
51
+ # collection object. The Object's :fetch method should support Strings as
52
+ # keys.
53
+ # @param html [String] The crawled web page's HTML. This param is only
54
+ # required if url_or_obj is a String representing the web page's URL.
55
+ def initialize(url_or_obj, html = "")
56
+ # Init from URL String and HTML String.
57
+ if url_or_obj.is_a?(String)
58
+ url = url_or_obj
59
+ assert_type(url, Url)
60
+
61
+ @url = url
62
+ @html = html ||= ""
63
+ @doc = init_nokogiri
64
+ @score = 0.0
22
65
 
23
- @url = url_or_doc
24
- @html = html
25
-
26
- @doc = Nokogiri::HTML(html) do |config|
27
- # TODO: Remove #'s below when crawling in production.
28
- #config.options = Nokogiri::XML::ParseOptions::STRICT |
29
- # Nokogiri::XML::ParseOptions::NONET
30
- end
31
-
32
- init_title
33
- init_author
34
- init_keywords
35
- init_links
36
- init_text
37
- @score = 0.0
38
- else
39
- # Init from a mongo collection document.
40
- @url = Wgit::Url.new(url_or_doc[:url])
41
- @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
42
- @title = url_or_doc[:title]
43
- @author = url_or_doc[:author]
44
- @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
45
- @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
46
- @links.map! { |link| Wgit::Url.new(link) }
47
- @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
48
- @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
66
+ # Dynamically run the init_*_from_html methods.
67
+ Document.private_instance_methods(false).each do |method|
68
+ if method.to_s.start_with?("init_") &&
69
+ method.to_s.end_with?("_from_html")
70
+ self.send(method)
49
71
  end
50
- end
51
-
52
- def internal_links
53
- return [] if @links.empty?
54
- @links.reject do |link|
55
- begin
56
- not link.relative_link?
57
- rescue
58
- true
59
- end
72
+ end
73
+ # Init from a Hash like object containing Strings as keys e.g. Mongo
74
+ # collection obj.
75
+ else
76
+ obj = url_or_obj
77
+ assert_respond_to(obj, :fetch)
78
+
79
+ @url = obj.fetch("url") # Should always be present.
80
+ @html = obj.fetch("html", "")
81
+ @doc = init_nokogiri
82
+ @score = obj.fetch("score", 0.0)
83
+
84
+ # Dynamically run the init_*_from_object methods.
85
+ Document.private_instance_methods(false).each do |method|
86
+ if method.to_s.start_with?("init_") &&
87
+ method.to_s.end_with?("_from_object")
88
+ self.send(method, obj)
60
89
  end
61
- end
90
+ end
91
+ end
92
+ end
62
93
 
63
- def internal_full_links
64
- return [] if internal_links.empty?
65
- internal_links.map do |link|
66
- link.replace("/" + link) unless link.start_with?("/")
67
- Wgit::Url.new(@url.to_base + link)
68
- end
94
+ # Determines if both the url and html match. Use
95
+ # doc.object_id == other_doc.object_id for exact object comparison.
96
+ #
97
+ # @param other_doc [Wgit::Document] To compare self against.
98
+ # @return [Boolean] True if @url and @html are equal, false if not.
99
+ def ==(other_doc)
100
+ return false unless other_doc.is_a? Wgit::Document
101
+ @url == other_doc.url and @html == other_doc.html
102
+ end
103
+
104
+ # Is a shortcut for calling Document#html[range].
105
+ #
106
+ # @param range [Range] The range of @html to return.
107
+ # @return [String] The given range of @html.
108
+ def [](range)
109
+ @html[range]
110
+ end
111
+
112
+ def date_crawled
113
+ @url.date_crawled
114
+ end
115
+
116
+ # Returns a Hash containing this Document's instance vars.
117
+ # Used when storing the Document in a Database e.g. MongoDB etc.
118
+ # By default the @html var is excluded from the returned Hash.
119
+ #
120
+ # @param include_html [Boolean] Whether or not to include @html in the
121
+ # returned Hash.
122
+ # @return [Hash] Containing self's instance vars.
123
+ def to_h(include_html = false)
124
+ ignore = include_html ? [] : ["@html"]
125
+ ignore << "@doc" # Always ignore "@doc"
126
+ Wgit::Utils.to_h(self, ignore)
127
+ end
128
+
129
+ # Converts this Document's to_h return value to a JSON String.
130
+ #
131
+ # @param include_html [Boolean] Whether or not to include @html in the
132
+ # returned JSON String.
133
+ # @return [String] This Document represented as a JSON String.
134
+ def to_json(include_html = false)
135
+ h = to_h(include_html)
136
+ JSON.generate(h)
137
+ end
138
+
139
+ # Returns a Hash containing this Document's instance variables and
140
+ # their :length (if they respond to it). Works dynamically so that any
141
+ # user defined extensions (and their created instance vars) will appear in
142
+ # the returned Hash as well. The number of text snippets as well as total
143
+ # number of textual bytes are always included in the returned Hash.
144
+ #
145
+ # @return [Hash] Containing self's HTML statistics.
146
+ def stats
147
+ hash = {}
148
+ instance_variables.each do |var|
149
+ # Add up the total bytes of text as well as the length.
150
+ if var == :@text
151
+ count = 0
152
+ @text.each { |t| count += t.length }
153
+ hash[:text_length] = @text.length
154
+ hash[:text_bytes] = count
155
+ # Else take the var's #length method return value.
156
+ else
157
+ next unless instance_variable_get(var).respond_to?(:length)
158
+ hash[var[1..-1].to_sym] =
159
+ instance_variable_get(var).send(:length)
160
+ end
69
161
  end
70
-
71
- def external_links
162
+ hash
163
+ end
164
+
165
+ # Determine the size of this Document's HTML.
166
+ #
167
+ # @return [Integer] The length of @html as reported by its #length method.
168
+ def size
169
+ stats[:html]
170
+ end
171
+
172
+ # Determine if this Document's HTML is empty or not.
173
+ #
174
+ # @return [Boolean] True if @html is nil/empty, false otherwise.
175
+ def empty?
176
+ return true if @html.nil?
177
+ @html.strip.empty?
178
+ end
179
+
180
+ # Uses Nokogiri's xpath method to search the doc's html and return the
181
+ # results.
182
+ #
183
+ # @param xpath [String] The xpath to search the @html with.
184
+ # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
185
+ def xpath(xpath)
186
+ @doc.xpath(xpath)
187
+ end
188
+
189
+ # Uses Nokogiri's css method to search the doc's html and return the
190
+ # results.
191
+ #
192
+ # @param selector [String] The CSS selector to search the @html with.
193
+ # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
194
+ def css(selector)
195
+ @doc.css(selector)
196
+ end
197
+
198
+ # Get all internal links of this Document.
199
+ #
200
+ # @return [Array<Wgit::Url>] self's internal/relative URL's.
201
+ def internal_links
72
202
  return [] if @links.empty?
73
- @links.reject do |link|
203
+ @links.reject do |link|
74
204
  begin
75
- link.relative_link?
205
+ not link.relative_link?
76
206
  rescue
77
- true
207
+ true
78
208
  end
79
209
  end
80
- end
210
+ end
81
211
 
82
- def stats
83
- hash = {}
84
- instance_variables.each do |var|
85
- # Add up the total bytes of text as well as the length.
86
- if var == :@text
87
- count = 0
88
- @text.each { |t| count += t.length }
89
- hash[:text_length] = @text.length
90
- hash[:text_bytes] = count
91
- # Else take the #length method return value.
92
- else
93
- next unless instance_variable_get(var).respond_to?(:length)
94
- hash[var[1..-1].to_sym] =
95
- instance_variable_get(var).send(:length)
96
- end
97
- end
98
- hash
212
+ # Get all internal links of this Document and append them to this
213
+ # Document's base URL.
214
+ #
215
+ # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
216
+ # form.
217
+ def internal_full_links
218
+ in_links = internal_links
219
+ return [] if in_links.empty?
220
+ in_links.map do |link|
221
+ link.replace("/" + link) unless link.start_with?("/")
222
+ Wgit::Url.new(@url.to_base + link)
99
223
  end
100
-
101
- def size
102
- stats[:html]
224
+ end
225
+
226
+ # Get all external links of this Document.
227
+ #
228
+ # @return [Array<Wgit::Url>] self's external/absolute URL's.
229
+ def external_links
230
+ return [] if @links.empty?
231
+ @links.reject do |link|
232
+ begin
233
+ link.relative_link?
234
+ rescue
235
+ true
236
+ end
103
237
  end
238
+ end
239
+
240
+ # Searches against the @text for the given search query.
241
+ # The number of search hits for each sentence is recorded internally
242
+ # and used to rank/sort the search results before being returned. Where
243
+ # the Wgit::Database#search method searches all documents for the most hits,
244
+ # this method searches each document's @text for the most hits.
245
+ #
246
+ # Each search result consists of a sentence of a given length. The length
247
+ # will be based on the sentence_limit parameter or the full length of the
248
+ # original sentence, whichever is less. The algorithm obviously ensures
249
+ # that the search query is visible somewhere in the sentence.
250
+ #
251
+ # @param query [String] The value to search the document's text against.
252
+ # @param sentence_limit [Integer] The max length of each search result
253
+ # sentence.
254
+ # @return [Array<String>] Representing the search results.
255
+ def search(query, sentence_limit = 80)
256
+ raise "A search value must be provided" if query.empty?
257
+ raise "The sentence length value must be even" if sentence_limit.odd?
258
+
259
+ results = {}
260
+ regex = Regexp.new(query, Regexp::IGNORECASE)
104
261
 
105
- def to_h(include_html = false)
106
- ignore = include_html ? [] : [:@html]
107
- ignore << :@doc # Always ignore :@doc
108
- Wgit::Utils.to_h(self, ignore)
262
+ @text.each do |sentence|
263
+ hits = sentence.scan(regex).count
264
+ if hits > 0
265
+ sentence.strip!
266
+ index = sentence.index(regex)
267
+ Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
268
+ results[sentence] = hits
269
+ end
109
270
  end
271
+
272
+ return [] if results.empty?
273
+ results = Hash[results.sort_by { |k, v| v }]
274
+ results.keys.reverse
275
+ end
276
+
277
+ # Performs a text search (see Document#search for details) but assigns the
278
+ # results to the @text instance variable. This can be used for sub search
279
+ # functionality. The original text is returned; no other reference to it
280
+ # is kept thereafter.
281
+ #
282
+ # @param query [String] The value to search the document's text against.
283
+ # @return [Array<String>] This Document's original @text value.
284
+ def search!(query)
285
+ orig_text = @text
286
+ @text = search(query)
287
+ orig_text
288
+ end
289
+
290
+ ### Document (Class) methods ###
291
+
292
+ # Returns Document.text_elements used to obtain the text in a webpage.
293
+ #
294
+ # @return [Array<Symbol>] The page elements containing visual text on a
295
+ # webpage.
296
+ def self.text_elements
297
+ @@text_elements
298
+ end
299
+
300
+ # Initialises a private instance variable with the xpath or database object
301
+ # result(s). When initialising from HTML, a true singleton value will only
302
+ # ever return one result otherwise all xpath results are returned in an
303
+ # Array. When initialising from a database object, the value is taken as
304
+ # is and singleton is only used to define the default empty value.
305
+ # If a value cannot be found (in either the HTML or database object), then
306
+ # a default will be used. The default is: singleton ? nil : [].
307
+ #
308
+ # Note that defined extensions work for both documents being crawled from
309
+ # the WWW and for documents being retrieved from the database. This
310
+ # effectively implements ORM like behavior using this class.
311
+ #
312
+ # @param var [Symbol] The name of the variable to be initialised.
313
+ # @param xpath [String] Used to find the element(s) of the webpage.
314
+ # @option options [Boolean] :singleton The singleton option determines
315
+ # whether or not the result(s) should be in an Array. If multiple
316
+ # results are found and singleton is true then the first result will be
317
+ # used. Defaults to true.
318
+ # @option options [Boolean] :text_content_only The text_content_only option
319
+ # if true will use the text content of the Nokogiri result object,
320
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
321
+ # @yield [var_value] Gives the value about to be assigned to the new var.
322
+ # The return value of the block becomes the new var value, unless nil.
323
+ # Return nil if you want to inspect but not change the var value.
324
+ # @return [Symbol] The first half of the newly created method names e.g.
325
+ # if var == "title" then :init_title is returned.
326
+ def self.define_extension(var, xpath, options = {}, &block)
327
+ default_options = { singleton: true, text_content_only: true }
328
+ options = default_options.merge(options)
110
329
 
111
- # Override of the default == method, is equal if url and html both match.
112
- # Use doc.object_id == other_doc.object_id for exact object comparison.
113
- def ==(other_doc)
114
- return false unless other_doc.is_a? Wgit::Document
115
- url == other_doc.url and html == other_doc.html
330
+ # Define the private init_*_from_html method for HTML.
331
+ # Gets the HTML's xpath value and creates a var for it.
332
+ func_name = Document.send(:define_method, "init_#{var}_from_html") do
333
+ result = find_in_html(xpath, options, &block)
334
+ init_var(var, result)
335
+ end
336
+ Document.send :private, func_name
337
+
338
+ # Define the private init_*_from_object method for a Database object.
339
+ # Gets the Object's "key" value and creates a var for it.
340
+ func_name = Document.send(
341
+ :define_method, "init_#{var}_from_object") do |obj|
342
+ result = find_in_object(
343
+ obj, var.to_s, singleton: options[:singleton], &block)
344
+ init_var(var, result)
345
+ end
346
+ Document.send :private, func_name
347
+
348
+ "init_#{var}".to_sym
349
+ end
350
+
351
+ # Removes the init_* methods created when an extension is defined.
352
+ # Therefore, this is the opposing method to Document.define_extension.
353
+ # Returns true if successful or false if the method(s) cannot be found.
354
+ #
355
+ # @param var [Symbol] The extension variable already defined.
356
+ # @return [Boolean] True if the extension var was found and removed;
357
+ # otherwise false.
358
+ def self.remove_extension(var)
359
+ Document.send(:remove_method, "init_#{var}_from_html")
360
+ Document.send(:remove_method, "init_#{var}_from_object")
361
+ true
362
+ rescue NameError
363
+ false
364
+ end
365
+
366
+ private
367
+
368
+ # Initializes the nokogiri object using @html, which must be already set.
369
+ def init_nokogiri
370
+ raise "@html must be set" unless @html
371
+ Nokogiri::HTML(@html) do |config|
372
+ # TODO: Remove #'s below when crawling in production.
373
+ #config.options = Nokogiri::XML::ParseOptions::STRICT |
374
+ # Nokogiri::XML::ParseOptions::NONET
116
375
  end
376
+ end
377
+
378
+ # Returns an object/value from this Document's @html using the provided
379
+ # xpath param.
380
+ # singleton ? results.first (single Object) : results (Array)
381
+ # text_content_only ? result.content (String) : result (nokogiri Object)
382
+ # A block can be used to set the final value before it is returned.
383
+ # Return nil from the block if you don't want to override the value.
384
+ def find_in_html(xpath, singleton: true, text_content_only: true)
385
+ results = @doc.xpath(xpath)
117
386
 
118
- # Shortcut for calling Document#html[range].
119
- def [](range)
120
- html[range]
387
+ if results and not results.empty?
388
+ result = if singleton
389
+ text_content_only ? results.first.content : results.first
390
+ else
391
+ text_content_only ? results.map(&:content) : results
392
+ end
393
+ else
394
+ result = singleton ? nil : []
121
395
  end
122
-
123
- def empty?
124
- html.strip.empty?
396
+
397
+ singleton ? process_str(result) : process_arr(result)
398
+
399
+ if block_given?
400
+ new_result = yield(result)
401
+ result = new_result if new_result
125
402
  end
126
-
127
- # Searches against the Document#text for the given search text.
128
- # The number of search hits for each sentenence are recorded internally
129
- # and used to rank/sort the search results before being returned. Where
130
- # the Database#search method search all documents for the most hits this
131
- # method searches each documents text for the most hits.
132
- #
133
- # Each search result comprises of a sentence of a given length. The length
134
- # will be based on the sentence_limit parameter or the full length of the
135
- # original sentence, which ever is less. The algorithm obviously ensures
136
- # that the search value is visible somewhere in the sentence.
137
- #
138
- # @param text [String] the value to search the document text against.
139
- # @param sentence_limit [Fixnum] the length of each search result
140
- # sentence.
141
- #
142
- # @return [Array] of String objects representing the search results.
143
- def search(text, sentence_limit = 80)
144
- raise "A search value must be provided" if text.empty?
145
- raise "The sentence length value must be even" if sentence_limit.odd?
146
-
147
- results = {}
148
- regex = Regexp.new(text, Regexp::IGNORECASE)
149
-
150
- @text.each do |sentence|
151
- hits = sentence.scan(regex).count
152
- if hits > 0
153
- sentence.strip!
154
- index = sentence.index(regex)
155
- Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
156
- results[sentence] = hits
157
- end
158
- end
159
-
160
- return [] if results.empty?
161
- results = Hash[results.sort_by { |k, v| v }]
162
- results.keys.reverse
403
+
404
+ result
405
+ end
406
+
407
+ # Finds a value in the obj using the key.
408
+ # singleton is used to set the value if not found in obj.
409
+ # A block can be used to set the final value before it is returned.
410
+ # Return nil from the block if you don't want to override the value.
411
+ def find_in_object(obj, key, singleton: true)
412
+ assert_respond_to(obj, :fetch)
413
+
414
+ default = singleton ? nil : []
415
+ result = obj.fetch(key.to_s, default)
416
+ singleton ? process_str(result) : process_arr(result)
417
+
418
+ if block_given?
419
+ new_result = yield(result)
420
+ result = new_result if new_result
163
421
  end
422
+
423
+ result
424
+ end
164
425
 
165
- # Performs a text search (see search for details) but assigns the results
166
- # to the @text instance variable. This can be used for sub search
167
- # functionality. Note that there is no way of getting the original text
168
- # back however.
169
- def search!(text)
170
- @text = search(text)
426
+ # Initialises an instance variable and defines a getter method for it.
427
+ # @param var [Symbol] The name of the variable to be initialized.
428
+ # @param value [Object] The newly initialized variable's value.
429
+ # @return [Symbol] The name of the newly created getter method.
430
+ def init_var(var, value)
431
+ # instance_var_name starts with @, var_name doesn't.
432
+ var = var.to_s
433
+ var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
434
+ instance_var_name = "@#{var_name}".to_sym
435
+
436
+ instance_variable_set(instance_var_name, value)
437
+
438
+ Document.send(:define_method, var_name) do
439
+ instance_variable_get(instance_var_name)
171
440
  end
441
+ end
172
442
 
173
- # Uses Nokogiri's xpath method to search the doc's html and return the
174
- # results.
175
- def xpath(xpath)
176
- @doc.xpath(xpath)
443
+ # Takes Document.text_elements and returns an xpath String used to obtain
444
+ # all of the combined text.
445
+ def text_elements_xpath
446
+ xpath = ""
447
+ return xpath if @@text_elements.empty?
448
+ el_xpath = "//%s/text()"
449
+ @@text_elements.each_with_index do |el, i|
450
+ xpath += " | " unless i == 0
451
+ xpath += el_xpath % [el]
177
452
  end
178
-
179
- private
453
+ xpath
454
+ end
180
455
 
181
- def process_str(str)
182
- str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
183
- str.strip!
184
- str # This is required to return the str, do not remove.
456
+ # Processes a String to make it uniform.
457
+ def process_str(str)
458
+ if str.is_a?(String)
459
+ str.encode!('UTF-8', 'UTF-8', invalid: :replace)
460
+ str.strip!
185
461
  end
462
+ str
463
+ end
186
464
 
187
- def process_arr(array)
188
- assert_arr_types(array, String)
189
- array.map! { |str| process_str(str) }
190
- array.reject! { |str| str.empty? }
191
- array.uniq!
465
+ # Processes an Array to make it uniform.
466
+ def process_arr(array)
467
+ if array.is_a?(Array)
468
+ array.map! { |str| process_str(str) }
469
+ array.reject! { |str| str.is_a?(String) ? str.empty? : false }
470
+ array.uniq!
192
471
  end
193
-
194
- # Modifies internal links by removing this doc's base or host url if
195
- # present. http://www.google.co.uk/about.html (with or without the
196
- # protocol prefix) will become about.html meaning it'll appear within
197
- # internal_links.
198
- def process_internal_links(links)
199
- links.map! do |link|
200
- host_or_base = if link.start_with?("http")
201
- url.base
202
- else
203
- url.host
204
- end
205
- if link.start_with?(host_or_base)
206
- link.sub!(host_or_base, "")
207
- link.replace(link[1..-1]) if link.start_with?("/")
208
- link.strip!
209
- end
210
- link
211
- end
472
+ array
473
+ end
474
+
475
+ # Modifies internal links by removing this doc's base or host URL, if
476
+ # present. http://www.google.co.uk/about.html (with or without the
477
+ # protocol prefix) will become about.html meaning it'll appear within
478
+ # Document#internal_links.
479
+ def process_internal_links(links)
480
+ links.map! do |link|
481
+ host_or_base = if link.start_with?("http")
482
+ @url.base
483
+ else
484
+ @url.host
485
+ end
486
+ if link.start_with?(host_or_base)
487
+ link.sub!(host_or_base, "")
488
+ link.replace(link[1..-1]) if link.start_with?("/")
489
+ link.strip!
490
+ end
491
+ link
212
492
  end
493
+ end
494
+
495
+ ### Default init_* (Document extension) methods. ###
213
496
 
214
- def text_elements_xpath
215
- xpath = ""
216
- return xpath if TEXT_ELEMENTS.empty?
217
- el_xpath = "//%s/text()"
218
- TEXT_ELEMENTS.each_with_index do |el, i|
219
- xpath += " | " unless i == 0
220
- xpath += el_xpath % [el]
221
- end
222
- xpath
223
- end
497
+ # Init methods for title.
224
498
 
225
- def init_var(xpath, var, first_result = true)
226
- results = @doc.xpath(xpath)
227
- unless results.nil? || results.empty?
228
- result = if first_result
229
- results.first.content
230
- else
231
- results.map { |res| res.content }
232
- end
233
- instance_variable_set(var, result)
234
- end
235
- end
236
-
237
- def init_title
238
- @title = nil
499
+ def init_title_from_html
239
500
  xpath = "//title"
240
- init_var(xpath, :@title)
241
- process_str(@title) unless @title.nil?
242
- end
243
-
244
- def init_author
245
- @author = nil
501
+ result = find_in_html(xpath)
502
+ init_var(:@title, result)
503
+ end
504
+
505
+ def init_title_from_object(obj)
506
+ result = find_in_object(obj, "title")
507
+ init_var(:@title, result)
508
+ end
509
+
510
+ # Init methods for author.
511
+
512
+ def init_author_from_html
246
513
  xpath = "//meta[@name='author']/@content"
247
- init_var(xpath, :@author)
248
- process_str(@author) unless @author.nil?
249
- end
250
-
251
- def init_keywords
252
- @keywords = nil
514
+ result = find_in_html(xpath)
515
+ init_var(:@author, result)
516
+ end
517
+
518
+ def init_author_from_object(obj)
519
+ result = find_in_object(obj, "author")
520
+ init_var(:@author, result)
521
+ end
522
+
523
+ # Init methods for keywords.
524
+
525
+ def init_keywords_from_html
253
526
  xpath = "//meta[@name='keywords']/@content"
254
- init_var(xpath, :@keywords)
255
- return @keywords = [] unless @keywords
256
- @keywords = @keywords.split(",")
257
- process_arr(@keywords)
258
- end
527
+ result = find_in_html(xpath) do |keywords|
528
+ if keywords
529
+ keywords = keywords.split(",")
530
+ process_arr(keywords)
531
+ end
532
+ keywords
533
+ end
534
+ init_var(:@keywords, result)
535
+ end
536
+
537
+ def init_keywords_from_object(obj)
538
+ result = find_in_object(obj, "keywords", singleton: false)
539
+ init_var(:@keywords, result)
540
+ end
259
541
 
260
- def init_links
261
- @links = nil
542
+ # Init methods for links.
543
+
544
+ def init_links_from_html
262
545
  xpath = "//a/@href"
263
- init_var(xpath, :@links, false)
264
- return @links = [] unless @links
265
- process_arr(@links)
266
- @links.reject! { |link| link == "/" }
267
- @links.map! do |link|
268
- begin
269
- Wgit::Url.new(link)
270
- rescue
271
- nil
546
+ result = find_in_html(xpath, singleton: false) do |links|
547
+ if links
548
+ links.reject! { |link| link == "/" }
549
+ links.map! do |link|
550
+ begin
551
+ Wgit::Url.new(link)
552
+ rescue
553
+ nil
554
+ end
555
+ end
556
+ links.reject! { |link| link.nil? }
557
+ process_internal_links(links)
272
558
  end
559
+ links
273
560
  end
274
- @links.reject! { |link| link.nil? }
275
- process_internal_links(@links)
561
+ init_var(:@links, result)
276
562
  end
277
-
278
- def init_text
279
- @text = nil
563
+
564
+ def init_links_from_object(obj)
565
+ result = find_in_object(obj, "links", singleton: false) do |links|
566
+ if links
567
+ links.map! { |link| Wgit::Url.new(link) }
568
+ end
569
+ links
570
+ end
571
+ init_var(:@links, result)
572
+ end
573
+
574
+ # Init methods for text.
575
+
576
+ def init_text_from_html
280
577
  xpath = text_elements_xpath
281
- init_var(xpath, :@text, false)
282
- return @text = [] unless @text
283
- process_arr(@text)
578
+ result = find_in_html(xpath, singleton: false)
579
+ init_var(:@text, result)
580
+ end
581
+
582
+ def init_text_from_object(obj)
583
+ result = find_in_object(obj, "text", singleton: false)
584
+ init_var(:@text, result)
284
585
  end
285
586
 
286
- alias :to_hash :to_h
587
+ alias :to_hash :to_h
287
588
  alias :relative_links :internal_links
288
589
  alias :relative_urls :internal_links
289
590
  alias :relative_full_links :internal_full_links