wgit 0.0.18 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,14 +3,14 @@
3
3
  require_relative '../utils'
4
4
 
5
5
  module Wgit
6
- # Module containing the database (DB) data model structure.
6
+ # Module used to build the database collection objects.
7
7
  module Model
8
8
  # The data model for a Wgit::Url.
9
9
  #
10
- # @param url [Wgit::Url] The URL DB record.
10
+ # @param url [Wgit::Url] The Url DB record.
11
11
  # @return [Hash] The URL model ready for DB insertion.
12
12
  def self.url(url)
13
- raise 'url must respond_to? to_h' unless url.respond_to?(:to_h)
13
+ raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
14
14
 
15
15
  model = url.to_h
16
16
  Wgit::Utils.remove_non_bson_types(model)
@@ -21,13 +21,13 @@ module Wgit
21
21
  # @param doc [Wgit::Document] The Document DB record.
22
22
  # @return [Hash] The Document model ready for DB insertion.
23
23
  def self.document(doc)
24
- raise 'doc must respond_to? to_h' unless doc.respond_to?(:to_h)
24
+ raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
25
25
 
26
- model = doc.to_h(false)
26
+ model = doc.to_h(include_html: false)
27
27
  Wgit::Utils.remove_non_bson_types(model)
28
28
  end
29
29
 
30
- # Default fields when inserting a record into the DB.
30
+ # Common fields when inserting a record into the DB.
31
31
  #
32
32
  # @return [Hash] Containing common insertion fields for all models.
33
33
  def self.common_insert_data
@@ -37,7 +37,7 @@ module Wgit
37
37
  }
38
38
  end
39
39
 
40
- # Default fields when updating a record in the DB.
40
+ # Common fields when updating a record in the DB.
41
41
  #
42
42
  # @return [Hash] Containing common update fields for all models.
43
43
  def self.common_update_data
@@ -6,15 +6,18 @@ require 'json'
6
6
 
7
7
  module Wgit
8
8
  # Class modeling a HTML web document. Also doubles as a search result when
9
- # loading Documents from the database.
9
+ # loading Documents from the database via Wgit::Database#search.
10
10
  #
11
- # The initialize method dynamically initializes certain variables from the
11
+ # The initialize method dynamically initializes instance variables from the
12
12
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
13
13
  # Document class can be easily extended allowing you to pull out the bits of
14
14
  # a webpage that are important to you. See Wgit::Document.define_extension.
15
15
  class Document
16
16
  include Assertable
17
17
 
18
+ # Regex for the allowed var names when defining an extension.
19
+ REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+
18
21
  # The HTML elements that make up the visible text on a page.
19
22
  # These elements are used to initialize the @text of the Document.
20
23
  # See the README.md for how to add to this Array dynamically.
@@ -25,7 +28,6 @@ module Wgit
25
28
 
26
29
  class << self
27
30
  # Class level instance reader method for @text_elements.
28
- # Call using Wgit::Document.text_elements.
29
31
  attr_reader :text_elements
30
32
  end
31
33
 
@@ -35,7 +37,7 @@ module Wgit
35
37
  # The HTML of the webpage, an instance of String.
36
38
  attr_reader :html
37
39
 
38
- # The Nokogiri document object initialized from @html.
40
+ # The Nokogiri::HTML document object initialized from @html.
39
41
  attr_reader :doc
40
42
 
41
43
  # The score is only used following a Database#search and records matches.
@@ -43,72 +45,140 @@ module Wgit
43
45
 
44
46
  # Initialize takes either two strings (representing the URL and HTML) or an
45
47
  # object representing a database record (of a HTTP crawled web page). This
46
- # allows for initialisation from both crawled web pages and (afterwards)
47
- # documents/web pages retrieved from the database.
48
- #
49
- # During initialisation, the Document will call any
50
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
51
- # default init_* methods exist while others can be defined by the user.
52
- # See the README and Wgit::Document.define_extension for more info.
53
- #
54
- # @param url_or_obj [String, Object#fetch] Either a String representing a
55
- # URL or a Hash-like object responding to :fetch. e.g. a MongoDB
56
- # collection object. The Object's :fetch method should support Strings as
57
- # keys.
58
- # @param html [String] The crawled web page's HTML. This param is only
59
- # required if url_or_obj is a String representing the web page's URL.
48
+ # allows for initialisation from both crawled web pages and documents/web
49
+ # pages retrieved from the database.
50
+ #
51
+ # During initialisation, the Document will call any private
52
+ # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
53
+ # README.md and Wgit::Document.define_extension method for more details.
54
+ #
55
+ # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
56
+ # representing a URL or a Hash-like object responding to :fetch. e.g. a
57
+ # MongoDB collection object. The Object's :fetch method should support
58
+ # Strings as keys.
59
+ # @param html [String, NilClass] The crawled web page's HTML. This param is
60
+ # only used if url_or_obj is a String representing the web page's URL.
61
+ # Otherwise, the HTML comes from the database object. A html of nil will
62
+ # be defaulted to an empty String.
60
63
  def initialize(url_or_obj, html = '')
61
- # Init from URL String and HTML String.
62
64
  if url_or_obj.is_a?(String)
63
- url = url_or_obj
64
- assert_type(url, Wgit::Url)
65
-
66
- @url = url
67
- @html = html || ''
68
- @doc = init_nokogiri
69
- @score = 0.0
70
-
71
- process_url_and_html
72
-
73
- # Dynamically run the init_*_from_html methods.
74
- Document.private_instance_methods(false).each do |method|
75
- if method.to_s.start_with?('init_') &&
76
- method.to_s.end_with?('_from_html')
77
- send(method)
78
- end
79
- end
80
- # Init from a Hash like object containing Strings as keys e.g. Mongo
81
- # collection obj.
65
+ init_from_strings(url_or_obj, html)
82
66
  else
83
- obj = url_or_obj
84
- assert_respond_to(obj, :fetch)
85
-
86
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
87
- @html = obj.fetch('html', '')
88
- @doc = init_nokogiri
89
- @score = obj.fetch('score', 0.0)
90
-
91
- process_url_and_html
92
-
93
- # Dynamically run the init_*_from_object methods.
94
- Document.private_instance_methods(false).each do |method|
95
- if method.to_s.start_with?('init_') &&
96
- method.to_s.end_with?('_from_object')
97
- send(method, obj)
98
- end
99
- end
67
+ init_from_object(url_or_obj)
68
+ end
69
+ end
70
+
71
+ ### Document Class Methods ###
72
+
73
+ # Uses Document.text_elements to build an xpath String, used to obtain
74
+ # all of the combined text on a webpage.
75
+ #
76
+ # @return [String] An xpath String to obtain a webpage's text elements.
77
+ def self.text_elements_xpath
78
+ xpath = ''
79
+ return xpath if Wgit::Document.text_elements.empty?
80
+
81
+ el_xpath = '//%s/text()'
82
+ Wgit::Document.text_elements.each_with_index do |el, i|
83
+ xpath += ' | ' unless i.zero?
84
+ xpath += format(el_xpath, el)
100
85
  end
86
+
87
+ xpath
101
88
  end
102
89
 
90
+ # Defines an extension, which is a way to extract HTML elements into
91
+ # instance variables upon Document initialization. See the default
92
+ # extensions defined in 'document_extensions.rb' as examples.
93
+ #
94
+ # Initialises a private instance variable with the xpath or database object
95
+ # result(s). When initialising from HTML, a true singleton value will only
96
+ # ever return one result otherwise all xpath results are returned in an
97
+ # Array. When initialising from a database object, the value is taken as
98
+ # is and singleton is only used to define the default empty value.
99
+ # If a value cannot be found (in either the HTML or database object), then
100
+ # a default will be used. The default value is: singleton ? nil : [].
101
+ #
102
+ # Note that defined extensions work for both documents initialized from
103
+ # the WWW (via Wgit::Crawler methods) and from database objects. This
104
+ # effectively implements ORM like behavior using this class.
105
+ #
106
+ # @param var [Symbol] The name of the variable to be initialised.
107
+ # @param xpath [String, Object#call] The xpath used to find the element(s)
108
+ # of the webpage. Pass a callable object (proc etc.) if you want the
109
+ # xpath value to be derived on Document initialisation (instead of when
110
+ # the extension is defined). The call method must return a valid xpath
111
+ # String.
112
+ # @param options [Hash] The options to define an extension with.
113
+ # @option options [Boolean] :singleton The singleton option determines
114
+ # whether or not the result(s) should be in an Array. If multiple
115
+ # results are found and singleton is true then the first result will be
116
+ # used. Defaults to true.
117
+ # @option options [Boolean] :text_content_only The text_content_only option
118
+ # if true will use the text content of the Nokogiri result object,
119
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
120
+ # @yield [value, source] Yields the value (Object) about to be assigned to
121
+ # the new var and the source (Symbol) of the value (either :html or
122
+ # :object). The return value of the block becomes the new var value,
123
+ # unless nil. Return nil if you want to inspect but not change the var
124
+ # value. The block gets executed when a Document is initialized from html
125
+ # or an object e.g. database.
126
+ # @raise [StandardError] If the var param isn't valid.
127
+ # @return [Symbol] The first half of the newly defined method names e.g.
128
+ # if var == "title" then :init_title is returned.
129
+ def self.define_extension(var, xpath, options = {}, &block)
130
+ default_options = { singleton: true, text_content_only: true }
131
+ options = default_options.merge(options)
132
+
133
+ raise "var must match #{REGEX_EXTENSION_NAME}" unless \
134
+ var =~ REGEX_EXTENSION_NAME
135
+
136
+ # Define the private init_*_from_html method for HTML.
137
+ # Gets the HTML's xpath value and creates a var for it.
138
+ func_name = Document.send(:define_method, "init_#{var}_from_html") do
139
+ result = find_in_html(xpath, options, &block)
140
+ init_var(var, result)
141
+ end
142
+ Document.send :private, func_name
143
+
144
+ # Define the private init_*_from_object method for a Database object.
145
+ # Gets the Object's 'key' value and creates a var for it.
146
+ func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
147
+ result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
148
+ init_var(var, result)
149
+ end
150
+ Document.send :private, func_name
151
+
152
+ "init_#{var}".to_sym
153
+ end
154
+
155
+ # Removes the init_* methods created when an extension is defined.
156
+ # Therefore, this is the opposing method to Document.define_extension.
157
+ # Returns true if successful or false if the method(s) cannot be found.
158
+ #
159
+ # @param var [Symbol] The extension variable already defined.
160
+ # @return [Boolean] True if the extension var was found and removed;
161
+ # otherwise false.
162
+ def self.remove_extension(var)
163
+ Document.send(:remove_method, "init_#{var}_from_html")
164
+ Document.send(:remove_method, "init_#{var}_from_object")
165
+
166
+ true
167
+ rescue NameError
168
+ false
169
+ end
170
+
171
+ ### Document Instance Methods ###
172
+
103
173
  # Determines if both the url and html match. Use
104
- # doc.object_id == other_doc.object_id for exact object comparison.
174
+ # doc.object_id == other.object_id for exact object comparison.
105
175
  #
106
- # @param other_doc [Wgit::Document] To compare self against.
176
+ # @param other [Wgit::Document] To compare self against.
107
177
  # @return [Boolean] True if @url and @html are equal, false if not.
108
- def ==(other_doc)
109
- return false unless other_doc.is_a? Wgit::Document
178
+ def ==(other)
179
+ return false unless other.is_a?(Wgit::Document)
110
180
 
111
- (@url == other_doc.url) && (@html == other_doc.html)
181
+ (@url == other.url) && (@html == other.html)
112
182
  end
113
183
 
114
184
  # Is a shortcut for calling Document#html[range].
@@ -129,33 +199,38 @@ module Wgit
129
199
  # Returns the base URL of this Wgit::Document. The base URL is either the
130
200
  # <base> element's href value or @url (if @base is nil). If @base is
131
201
  # present and relative, then @url.to_base + @base is returned. This method
132
- # should be used instead of `doc.url.to_base` etc. if manually building
133
- # absolute links.
202
+ # should be used instead of `doc.url.to_base` etc. when manually building
203
+ # absolute links from relative links.
134
204
  #
135
205
  # Provide the `link:` parameter to get the correct base URL for that type
136
206
  # of link. For example, a link of `#top` would always return @url because
137
207
  # it applies to that page, not a different one. Query strings work in the
138
- # same way. Use this parameter if manually concatting links e.g.
139
- # `absolute_link = doc.base_url(link: link).concat(link)` etc.
208
+ # same way. Use this parameter if manually concatenating Urls e.g.
140
209
  #
141
- # @param link [Wgit::Url] The link to obtain the correct base URL for.
210
+ # relative_link = Wgit::Url.new '?q=hello'
211
+ # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
212
+ #
213
+ # This is similar to how Wgit::Document#internal_absolute_links works.
214
+ #
215
+ # @param link [Wgit::Url, String] The link to obtain the correct base URL
216
+ # for.
142
217
  # @return [Wgit::Url] The base URL of this Document e.g.
143
218
  # 'http://example.com/public'.
144
219
  def base_url(link: nil)
145
220
  get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
146
221
 
147
222
  if link
148
- assert_type(link, Wgit::Url)
223
+ link = Wgit::Url.new(link)
149
224
  raise "link must be relative: #{link}" unless link.is_relative?
150
225
 
151
- if link.is_anchor? || link.is_query_string?
226
+ if link.is_anchor? || link.is_query?
152
227
  base_url = @base ? get_base.call : @url
153
- return base_url.without_anchor.without_query_string
228
+ return base_url.without_anchor.without_query
154
229
  end
155
230
  end
156
231
 
157
232
  base_url = @base ? get_base.call : @url.base
158
- base_url.without_anchor.without_query_string
233
+ base_url.without_anchor.without_query
159
234
  end
160
235
 
161
236
  # Returns a Hash containing this Document's instance vars.
@@ -165,52 +240,51 @@ module Wgit
165
240
  # @param include_html [Boolean] Whether or not to include @html in the
166
241
  # returned Hash.
167
242
  # @return [Hash] Containing self's instance vars.
168
- def to_h(include_html = false)
243
+ def to_h(include_html: false)
169
244
  ignore = include_html ? [] : ['@html']
170
- ignore << '@doc' # Always ignore "@doc"
171
- Wgit::Utils.to_h(self, ignore)
245
+ ignore << '@doc' # Always ignore Nokogiri @doc.
246
+
247
+ Wgit::Utils.to_h(self, ignore: ignore)
172
248
  end
173
249
 
174
- # Converts this Document's to_h return value to a JSON String.
250
+ # Converts this Document's #to_h return value to a JSON String.
175
251
  #
176
252
  # @param include_html [Boolean] Whether or not to include @html in the
177
253
  # returned JSON String.
178
254
  # @return [String] This Document represented as a JSON String.
179
- def to_json(include_html = false)
180
- h = to_h(include_html)
255
+ def to_json(include_html: false)
256
+ h = to_h(include_html: include_html)
181
257
  JSON.generate(h)
182
258
  end
183
259
 
184
260
  # Returns a Hash containing this Document's instance variables and
185
- # their :length (if they respond to it). Works dynamically so that any
261
+ # their #length (if they respond to it). Works dynamically so that any
186
262
  # user defined extensions (and their created instance vars) will appear in
187
263
  # the returned Hash as well. The number of text snippets as well as total
188
264
  # number of textual bytes are always included in the returned Hash.
189
265
  #
190
- # @return [Hash] Containing self's HTML statistics.
266
+ # @return [Hash] Containing self's HTML page statistics.
191
267
  def stats
192
268
  hash = {}
193
269
  instance_variables.each do |var|
194
270
  # Add up the total bytes of text as well as the length.
195
271
  if var == :@text
196
- count = 0
197
- @text.each { |t| count += t.length }
198
272
  hash[:text_snippets] = @text.length
199
- hash[:text_bytes] = count
273
+ hash[:text_bytes] = @text.sum(&:length)
200
274
  # Else take the var's #length method return value.
201
275
  else
202
276
  next unless instance_variable_get(var).respond_to?(:length)
203
277
 
204
- hash[var[1..-1].to_sym] =
205
- instance_variable_get(var).send(:length)
278
+ hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
206
279
  end
207
280
  end
281
+
208
282
  hash
209
283
  end
210
284
 
211
285
  # Determine the size of this Document's HTML.
212
286
  #
213
- # @return [Integer] The total number of bytes in @html.
287
+ # @return [Integer] The total number of @html bytes.
214
288
  def size
215
289
  stats[:html]
216
290
  end
@@ -242,56 +316,55 @@ module Wgit
242
316
  @doc.css(selector)
243
317
  end
244
318
 
245
- # Get all the internal links of this Document in relative form. Internal
246
- # meaning a link to another document on the same host. This Document's host
247
- # is used to determine if an absolute URL is actually a relative link e.g.
248
- # For a Document representing http://www.server.com/about, an absolute link
249
- # of <a href='http://www.server.com/search'> will be recognized and
250
- # returned as an internal link because both Documents live on the same
251
- # host. Also see Wgit::Document#internal_full_links.
319
+ # Returns all internal links from this Document in relative form. Internal
320
+ # meaning a link to another document on the same host.
321
+ #
322
+ # This Document's host is used to determine if an absolute URL is actually
323
+ # a relative link e.g. For a Document representing
324
+ # http://www.server.com/about, an absolute link of
325
+ # <a href='http://www.server.com/search'> will be recognized and returned
326
+ # as an internal link because both Documents live on the same host. Also
327
+ # see Wgit::Document#internal_absolute_links.
252
328
  #
253
- # @return [Array<Wgit::Url>] self's internal/relative URL's.
329
+ # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
254
330
  def internal_links
255
331
  return [] if @links.empty?
256
332
 
257
333
  links = @links
258
334
  .select { |link| link.is_relative?(host: @url.to_base) }
259
335
  .map(&:without_base)
260
- .map do |link| # We map @url.to_host into / because it's a duplicate.
336
+ .map do |link| # Map @url.to_host into / as it's a duplicate.
261
337
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
262
338
  end
263
339
 
264
340
  Wgit::Utils.process_arr(links)
265
341
  end
266
342
 
267
- # Get all the internal links of this Document and append them to this
268
- # Document's base URL making them absolute. Also see
343
+ # Returns all internal links from this Document in absolute form by
344
+ # appending them to self's #base_url. Also see
269
345
  # Wgit::Document#internal_links.
270
346
  #
271
- # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
272
- # form.
273
- def internal_full_links
274
- links = internal_links
275
- return [] if links.empty?
276
-
277
- links.map { |link| base_url(link: link).concat(link) }
347
+ # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
348
+ def internal_absolute_links
349
+ internal_links.map { |link| base_url(link: link).concat(link) }
278
350
  end
279
351
 
280
- # Get all the external links of this Document. External meaning a link to
281
- # a different host.
352
+ # Returns all external links from this Document in absolute form. External
353
+ # meaning a link to a different host.
282
354
  #
283
- # @return [Array<Wgit::Url>] self's external/absolute URL's.
355
+ # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
284
356
  def external_links
285
357
  return [] if @links.empty?
286
358
 
287
359
  links = @links
288
- .reject { |link| link.relative_link?(host: @url.to_base) }
360
+ .reject { |link| link.is_relative?(host: @url.to_base) }
289
361
  .map(&:without_trailing_slash)
290
362
 
291
363
  Wgit::Utils.process_arr(links)
292
364
  end
293
365
 
294
- # Searches against the @text for the given search query.
366
+ # Searches the @text for the given query and returns the results.
367
+ #
295
368
  # The number of search hits for each sentence are recorded internally
296
369
  # and used to rank/sort the search results before being returned. Where
297
370
  # the Wgit::Database#search method searches all documents for the most hits,
@@ -302,24 +375,33 @@ module Wgit
302
375
  # original sentence, which ever is less. The algorithm obviously ensures
303
376
  # that the search query is visible somewhere in the sentence.
304
377
  #
305
- # @param query [String] The value to search the document's text against.
378
+ # @param query [String, Object#to_s] The value to search the document's
379
+ # @text for.
380
+ # @param case_sensitive [Boolean] Whether character case must match.
381
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
382
+ # for as one phrase (true) or separately (false).
306
383
  # @param sentence_limit [Integer] The max length of each search result
307
384
  # sentence.
308
- # @return [Array<String>] Representing the search results.
309
- def search(query, sentence_limit = 80)
385
+ # @return [Array<String>] A subset of @text, matching the query.
386
+ def search(
387
+ query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
388
+ )
389
+ query = query.to_s
310
390
  raise 'A search query must be provided' if query.empty?
311
391
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
312
392
 
393
+ query = query.gsub(' ', '|') unless whole_sentence
394
+ regex = Regexp.new(query, !case_sensitive)
313
395
  results = {}
314
- regex = Regexp.new(query, Regexp::IGNORECASE)
315
396
 
316
397
  @text.each do |sentence|
317
398
  hits = sentence.scan(regex).count
318
- next unless hits > 0
399
+ next unless hits.positive?
319
400
 
320
401
  sentence.strip!
321
- index = sentence.index(regex)
402
+ index = sentence.index(regex) # Index of first match.
322
403
  Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
404
+
323
405
  results[sentence] = hits
324
406
  end
325
407
 
@@ -334,112 +416,33 @@ module Wgit
334
416
  # functionality. The original text is returned; no other reference to it
335
417
  # is kept thereafter.
336
418
  #
337
- # @param query [String] The value to search the document's text against.
419
+ # @param query [String, Object#to_s] The value to search the document's
420
+ # @text for.
421
+ # @param case_sensitive [Boolean] Whether character case must match.
422
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
423
+ # for as one phrase (true) or separately (false).
338
424
  # @param sentence_limit [Integer] The max length of each search result
339
425
  # sentence.
340
426
  # @return [String] This Document's original @text value.
341
- def search!(query, sentence_limit = 80)
427
+ def search!(
428
+ query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
429
+ )
342
430
  orig_text = @text
343
- @text = search(query, sentence_limit)
344
- orig_text
345
- end
346
-
347
- ### Document (Class) methods ###
431
+ @text = search(
432
+ query, case_sensitive: case_sensitive,
433
+ whole_sentence: whole_sentence, sentence_limit: sentence_limit
434
+ )
348
435
 
349
- # Uses Document.text_elements to build an xpath String, used to obtain
350
- # all of the combined text on a webpage.
351
- #
352
- # @return [String] An xpath String to obtain a webpage's text elements.
353
- def self.text_elements_xpath
354
- xpath = ''
355
- return xpath if Wgit::Document.text_elements.empty?
356
-
357
- el_xpath = '//%s/text()'
358
- Wgit::Document.text_elements.each_with_index do |el, i|
359
- xpath += ' | ' unless i == 0
360
- xpath += format(el_xpath, el)
361
- end
362
- xpath
363
- end
364
-
365
- # Initialises a private instance variable with the xpath or database object
366
- # result(s). When initialising from HTML, a true singleton value will only
367
- # ever return one result otherwise all xpath results are returned in an
368
- # Array. When initialising from a database object, the value is taken as
369
- # is and singleton is only used to define the default empty value.
370
- # If a value cannot be found (in either the HTML or database object), then
371
- # a default will be used. The default is: singleton ? nil : [].
372
- #
373
- # Note that defined extensions work for both documents being crawled from
374
- # the WWW and for documents being retrieved from the database. This
375
- # effectively implements ORM like behavior using this class.
376
- #
377
- # @param var [Symbol] The name of the variable to be initialised.
378
- # @param xpath [String, Object#call] The xpath used to find the element(s)
379
- # of the webpage. Pass a callable object (proc etc.) if you want the
380
- # xpath value to be derived on Document initialisation (instead of when
381
- # the extension is defined). The call method must return a valid xpath
382
- # String.
383
- # @param options [Hash] The options to define an extension with.
384
- # @option options [Boolean] :singleton The singleton option determines
385
- # whether or not the result(s) should be in an Array. If multiple
386
- # results are found and singleton is true then the first result will be
387
- # used. Defaults to true.
388
- # @option options [Boolean] :text_content_only The text_content_only option
389
- # if true will use the text content of the Nokogiri result object,
390
- # otherwise the Nokogiri object itself is returned. Defaults to true.
391
- # @yield [Object, Symbol] Yields the value about to be assigned to the new
392
- # var and the source of the value (either :html or :object aka database).
393
- # The return value of the block becomes the new var value, unless nil.
394
- # Return nil if you want to inspect but not change the var value. The
395
- # block gets executed when a Document is initialized from html or an
396
- # object.
397
- # @return [Symbol] The first half of the newly defined method names e.g.
398
- # if var == "title" then :init_title is returned.
399
- def self.define_extension(var, xpath, options = {}, &block)
400
- default_options = { singleton: true, text_content_only: true }
401
- options = default_options.merge(options)
402
-
403
- # Define the private init_*_from_html method for HTML.
404
- # Gets the HTML's xpath value and creates a var for it.
405
- func_name = Document.send(:define_method, "init_#{var}_from_html") do
406
- result = find_in_html(xpath, options, &block)
407
- init_var(var, result)
408
- end
409
- Document.send :private, func_name
410
-
411
- # Define the private init_*_from_object method for a Database object.
412
- # Gets the Object's "key" value and creates a var for it.
413
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
414
- result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
415
- init_var(var, result)
416
- end
417
- Document.send :private, func_name
418
-
419
- "init_#{var}".to_sym
420
- end
421
-
422
- # Removes the init_* methods created when an extension is defined.
423
- # Therefore, this is the opposing method to Document.define_extension.
424
- # Returns true if successful or false if the method(s) cannot be found.
425
- #
426
- # @param var [Symbol] The extension variable already defined.
427
- # @return [Boolean] True if the extension var was found and removed;
428
- # otherwise false.
429
- def self.remove_extension(var)
430
- Document.send(:remove_method, "init_#{var}_from_html")
431
- Document.send(:remove_method, "init_#{var}_from_object")
432
- true
433
- rescue NameError
434
- false
436
+ orig_text
435
437
  end
436
438
 
437
439
  protected
438
440
 
439
441
  # Initializes the nokogiri object using @html, which cannot be nil.
440
442
  # Override this method to custom configure the Nokogiri object returned.
441
- # Gets called from Wgit::Document.new.
443
+ # Gets called from Wgit::Document.new upon initialization.
442
444
  #
445
+ # @raise [StandardError] If @html isn't set.
443
446
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
444
447
  def init_nokogiri
445
448
  raise '@html must be set' unless @html
@@ -459,31 +462,30 @@ module Wgit
459
462
  # Object) : results (Array).
460
463
  # @param text_content_only [Boolean] text_content_only ? result.content
461
464
  # (String) : result (Nokogiri Object).
462
- # @yield [String/Object, Symbol] Given the value before it's set as an
463
- # instance variable so that you can inspect/alter the value if desired.
464
- # Return nil from the block if you don't want to override the value. Also
465
- # given the source which is always :html.
465
+ # @yield [value, source] Given the value (String/Object) before it's set as
466
+ # an instance variable so that you can inspect/alter the value if
467
+ # desired. Return nil from the block if you don't want to override the
468
+ # value. Also given the source (Symbol) which is always :html.
466
469
  # @return [String, Object] The value found in the html or the default value
467
470
  # (singleton ? nil : []).
468
471
  def find_in_html(xpath, singleton: true, text_content_only: true)
469
- xpath = xpath.call if xpath.respond_to?(:call)
472
+ default = singleton ? nil : []
473
+ xpath = xpath.call if xpath.respond_to?(:call)
470
474
  results = @doc.xpath(xpath)
471
475
 
472
- if results && !results.empty?
473
- result = if singleton
474
- text_content_only ? results.first.content : results.first
475
- else
476
- text_content_only ? results.map(&:content) : results
477
- end
478
- else
479
- result = singleton ? nil : []
480
- end
476
+ return default if results.nil? || results.empty?
477
+
478
+ result = if singleton
479
+ text_content_only ? results.first.content : results.first
480
+ else
481
+ text_content_only ? results.map(&:content) : results
482
+ end
481
483
 
482
484
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
483
485
 
484
486
  if block_given?
485
487
  new_result = yield(result, :html)
486
- result = new_result if new_result
488
+ result = new_result unless new_result.nil?
487
489
  end
488
490
 
489
491
  result
@@ -494,22 +496,23 @@ module Wgit
494
496
  # @param obj [Object#fetch] The object containing the key/value.
495
497
  # @param key [String] Used to find the value in the obj.
496
498
  # @param singleton [Boolean] True if a single value, false otherwise.
497
- # @yield [String/Object, Symbol] Given the value before it's set as an
498
- # instance variable so that you can inspect/alter the value if desired.
499
- # Return nil from the block if you don't want to override the value. Also
500
- # given the source which is always :object.
499
+ # @yield [value, source] Given the value (String/Object) before it's set as
500
+ # an instance variable so that you can inspect/alter the value if
501
+ # desired. Return nil from the block if you don't want to override the
502
+ # value. Also given the source (Symbol) which is always :object.
501
503
  # @return [String, Object] The value found in the obj or the default value
502
504
  # (singleton ? nil : []).
503
505
  def find_in_object(obj, key, singleton: true)
504
506
  assert_respond_to(obj, :fetch)
505
507
 
506
508
  default = singleton ? nil : []
507
- result = obj.fetch(key.to_s, default)
509
+ result = obj.fetch(key.to_s, default)
510
+
508
511
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
509
512
 
510
513
  if block_given?
511
514
  new_result = yield(result, :object)
512
- result = new_result if new_result
515
+ result = new_result unless new_result.nil?
513
516
  end
514
517
 
515
518
  result
@@ -517,6 +520,54 @@ module Wgit
517
520
 
518
521
  private
519
522
 
523
+ # Initialise the Document from URL and HTML Strings.
524
+ def init_from_strings(url, html)
525
+ assert_types(html, [String, NilClass])
526
+
527
+ # We already know url.is_a?(String) so parse into Url unless already so.
528
+ @url = Wgit::Url.parse(url)
529
+ @html = html || ''
530
+ @doc = init_nokogiri
531
+ @score = 0.0
532
+
533
+ process_url_and_html
534
+
535
+ # Dynamically run the init_*_from_html methods.
536
+ Document.private_instance_methods(false).each do |method|
537
+ if method.to_s.start_with?('init_') &&
538
+ method.to_s.end_with?('_from_html')
539
+ send(method) unless method == __method__
540
+ end
541
+ end
542
+ end
543
+
544
+ # Initialise the Document from a Hash like Object containing Strings as
545
+ # keys e.g. database collection object or Hash.
546
+ def init_from_object(obj)
547
+ assert_respond_to(obj, :fetch)
548
+
549
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
550
+ @html = obj.fetch('html', '')
551
+ @doc = init_nokogiri
552
+ @score = obj.fetch('score', 0.0)
553
+
554
+ process_url_and_html
555
+
556
+ # Dynamically run the init_*_from_object methods.
557
+ Document.private_instance_methods(false).each do |method|
558
+ if method.to_s.start_with?('init_') &&
559
+ method.to_s.end_with?('_from_object')
560
+ send(method, obj) unless method == __method__
561
+ end
562
+ end
563
+ end
564
+
565
+ # Ensure the @url and @html Strings are correctly encoded etc.
566
+ def process_url_and_html
567
+ @url = Wgit::Utils.process_str(@url)
568
+ @html = Wgit::Utils.process_str(@html)
569
+ end
570
+
520
571
  # Initialises an instance variable and defines a getter method for it.
521
572
  #
522
573
  # @param var [Symbol] The name of the variable to be initialized.
@@ -535,19 +586,8 @@ module Wgit
535
586
  end
536
587
  end
537
588
 
538
- # Ensure the @url and @html Strings are correctly encoded etc.
539
- def process_url_and_html
540
- @url = Wgit::Utils.process_str(@url)
541
- @html = Wgit::Utils.process_str(@html)
542
- end
543
-
544
- alias relative_links internal_links
545
- alias relative_urls internal_links
546
- alias relative_full_links internal_full_links
547
- alias relative_full_urls internal_full_links
548
- alias internal_absolute_links internal_full_links
549
- alias relative_absolute_links internal_full_links
550
- alias relative_absolute_urls internal_full_links
551
- alias external_urls external_links
589
+ alias internal_urls internal_links
590
+ alias internal_absolute_urls internal_absolute_links
591
+ alias external_urls external_links
552
592
  end
553
593
  end