wgit 0.7.0 → 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/document.rb CHANGED
@@ -6,29 +6,41 @@ require 'json'
6
6
  require 'set'
7
7
 
8
8
  module Wgit
9
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
10
10
  # will work e.g. images etc. Also doubles as a search result when
11
11
  # loading Documents from the database via `Wgit::Database#search`.
12
12
  #
13
13
  # The initialize method dynamically initializes instance variables from the
14
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
15
- # Document class can be easily extended allowing you to pull out the bits of
16
- # a webpage that are important to you. See `Wgit::Document.define_extension`.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
17
17
  class Document
18
18
  include Assertable
19
19
 
20
- # Regex for the allowed var names when defining an extension.
21
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
22
22
 
23
- # The xpath used to extract the visible text on a page.
24
- TEXT_ELEMENTS_XPATH = '//*/text()'.freeze
23
+ # Set of text elements used to build Document#text.
24
+ @text_elements = Set.new(%i[
25
+ a abbr address article aside b bdi bdo blockquote button caption cite
26
+ code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
+ h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
+ option output p pre q rb rt ruby s samp section small span strong sub
29
+ summary sup td textarea th time u ul var wbr
30
+ ])
25
31
 
26
- # Set of Symbols representing the defined Document extensions.
27
- @extensions = Set.new
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
28
34
 
29
35
  class << self
30
- # Class level attr_reader for the Document defined extensions.
31
- attr_reader :extensions
36
+ # Set of HTML elements that make up the visible text on a page. These
37
+ # elements are used to initialize the Wgit::Document#text. See the
38
+ # README.md for how to add to this Set dynamically.
39
+ attr_reader :text_elements
40
+
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
32
44
  end
33
45
 
34
46
  # The URL of the webpage, an instance of Wgit::Url.
@@ -38,7 +50,7 @@ module Wgit
38
50
  attr_reader :html
39
51
 
40
52
  # The Nokogiri::HTML document object initialized from @html.
41
- attr_reader :doc
53
+ attr_reader :parser
42
54
 
43
55
  # The score is only used following a `Database#search` and records matches.
44
56
  attr_reader :score
@@ -50,7 +62,7 @@ module Wgit
50
62
  #
51
63
  # During initialisation, the Document will call any private
52
64
  # `init_*_from_html` and `init_*_from_object` methods it can find. See the
53
- # README.md and Wgit::Document.define_extension method for more details.
65
+ # Wgit::Document.define_extractor method for more details.
54
66
  #
55
67
  # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
56
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
@@ -72,31 +84,54 @@ module Wgit
72
84
 
73
85
  ### Document Class Methods ###
74
86
 
75
- # Defines an extension, which is a way to serialise HTML elements into
76
- # instance variables upon Document initialization. See the default
77
- # extensions defined in 'document_extensions.rb' as examples.
87
+ # Uses Document.text_elements to build an xpath String, used to obtain
88
+ # all of the combined visual text on a webpage.
78
89
  #
79
- # Note that defined extensions work for both Documents initialized from
90
+ # @return [String] An xpath String to obtain a webpage's text elements.
91
+ def self.text_elements_xpath
92
+ xpath = ''
93
+ return xpath if Wgit::Document.text_elements.empty?
94
+
95
+ el_xpath = '//%s/text()'
96
+ Wgit::Document.text_elements.each_with_index do |el, i|
97
+ xpath += ' | ' unless i.zero?
98
+ xpath += format(el_xpath, el)
99
+ end
100
+
101
+ xpath
102
+ end
103
+
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
110
+ #
111
+ # Note that defined extractors work for both Documents initialized from
80
112
  # HTML (via Wgit::Crawler methods) and from database objects.
81
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
82
114
  # the xpath or database object result(s).
83
115
  #
84
116
  # When initialising from HTML, a singleton value of true will only
85
- # ever return one result; otherwise all xpath results are returned in an
86
- # Array. When initialising from a database object, the value is taken as
87
- # is and singleton is only used to define the default empty value.
88
- # If a value cannot be found (in either the HTML or database object), then
89
- # a default will be used. The default value is: `singleton ? nil : []`.
90
- #
91
- # @param var [Symbol] The name of the variable to be initialised.
117
+ # ever return the first result found; otherwise all the results are
118
+ # returned in an Array. When initialising from a database object, the value
119
+ # is taken as is and singleton is only used to define the default empty
120
+ # value. If a value cannot be found (in either the HTML or database
121
+ # object), then a default will be used. The default value is:
122
+ # `singleton ? nil : []`.
123
+ #
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and setter method is defined
126
+ # for the initialised variable.
92
127
  # @param xpath [String, #call] The xpath used to find the element(s)
93
128
  # of the webpage. Only used when initializing from HTML.
94
129
  #
95
130
  # Pass a callable object (proc etc.) if you want the
96
131
  # xpath value to be derived on Document initialisation (instead of when
97
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
98
133
  # String.
99
- # @param opts [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
100
135
  # options are only used when initializing from HTML, not the database.
101
136
  # @option opts [Boolean] :singleton The singleton option determines
102
137
  # whether or not the result(s) should be in an Array. If multiple
@@ -105,56 +140,62 @@ module Wgit
105
140
  # @option opts [Boolean] :text_content_only The text_content_only option
106
141
  # if true will use the text content of the Nokogiri result object,
107
142
  # otherwise the Nokogiri object itself is returned. Defaults to true.
108
- # @yieldparam value [Object] The value to be assigned to the new var.
109
- # @yieldparam source [Wgit::Document, Object] The source of the value.
110
- # @yieldparam type [Symbol] The source type, either :document or (DB)
111
- # :object.
112
- # @yieldreturn [Object] The return value of the block becomes the new var
113
- # value, unless nil. Return nil if you want to inspect but not change the
114
- # var value. The block is executed when a Wgit::Document is initialized,
115
- # regardless of the source.
143
+ # @yield The block is executed when a Wgit::Document is initialized,
144
+ # regardless of the source. Use it (optionally) to process the result
145
+ # value.
146
+ # @yieldparam value [Object] The result value to be assigned to the new
147
+ # `var`.
148
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
149
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
150
+ # `:object`.
151
+ # @yieldreturn [Object] The return value of the block becomes the new var's
152
+ # value. Return the block's value param unchanged if you want to inspect it.
116
153
  # @raise [StandardError] If the var param isn't valid.
117
154
  # @return [Symbol] The given var Symbol if successful.
118
- def self.define_extension(var, xpath, opts = {}, &block)
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
119
156
  var = var.to_sym
120
157
  defaults = { singleton: true, text_content_only: true }
121
158
  opts = defaults.merge(opts)
122
159
 
123
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
124
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
125
162
 
126
163
  # Define the private init_*_from_html method for HTML.
127
164
  # Gets the HTML's xpath value and creates a var for it.
128
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
129
- result = find_in_html(xpath, opts, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
130
167
  init_var(var, result)
131
168
  end
132
169
  Document.send(:private, func_name)
133
170
 
134
171
  # Define the private init_*_from_object method for a Database object.
135
172
  # Gets the Object's 'key' value and creates a var for it.
136
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
137
- result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
138
179
  init_var(var, result)
139
180
  end
140
181
  Document.send(:private, func_name)
141
182
 
142
- @extensions << var
183
+ @extractors << var
143
184
  var
144
185
  end
145
186
 
146
- # Removes the init_* methods created when an extension is defined.
147
- # Therefore, this is the opposing method to Document.define_extension.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
148
189
  # Returns true if successful or false if the method(s) cannot be found.
149
190
  #
150
- # @param var [Symbol] The extension variable already defined.
151
- # @return [Boolean] True if the extension var was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
152
193
  # otherwise false.
153
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
154
195
  Document.send(:remove_method, "init_#{var}_from_html")
155
196
  Document.send(:remove_method, "init_#{var}_from_object")
156
197
 
157
- @extensions.delete(var.to_sym)
198
+ @extractors.delete(var.to_sym)
158
199
  true
159
200
  rescue NameError
160
201
  false
@@ -173,7 +214,7 @@ module Wgit
173
214
  (@url == other.url) && (@html == other.html)
174
215
  end
175
216
 
176
- # Is a shortcut for calling Document#html[range].
217
+ # Shortcut for calling Document#html[range].
177
218
  #
178
219
  # @param range [Range] The range of @html to return.
179
220
  # @return [String] The given range of @html.
@@ -183,9 +224,9 @@ module Wgit
183
224
 
184
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
185
226
  # <base> element's href value or @url (if @base is nil). If @base is
186
- # present and relative, then @url.to_base + @base is returned. This method
187
- # should be used instead of `doc.url.to_base` etc. when manually building
188
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
189
230
  #
190
231
  # Provide the `link:` parameter to get the correct base URL for that type
191
232
  # of link. For example, a link of `#top` would always return @url because
@@ -204,12 +245,16 @@ module Wgit
204
245
  # @return [Wgit::Url] The base URL of this Document e.g.
205
246
  # 'http://example.com/public'.
206
247
  def base_url(link: nil)
207
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
208
248
  if @url.relative? && @base.nil?
209
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
210
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
211
256
 
212
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
213
258
 
214
259
  if link
215
260
  link = Wgit::Url.new(link)
@@ -221,7 +266,7 @@ module Wgit
221
266
  end
222
267
  end
223
268
 
224
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
225
270
  base_url.omit_fragment.omit_query
226
271
  end
227
272
 
@@ -235,7 +280,7 @@ module Wgit
235
280
  def to_h(include_html: false, include_score: true)
236
281
  ignore = include_html ? [] : ['@html']
237
282
  ignore << '@score' unless include_score
238
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
239
284
 
240
285
  Wgit::Utils.to_h(self, ignore: ignore)
241
286
  end
@@ -252,7 +297,7 @@ module Wgit
252
297
 
253
298
  # Returns a Hash containing this Document's instance variables and
254
299
  # their #length (if they respond to it). Works dynamically so that any
255
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
256
301
  # the returned Hash as well. The number of text snippets as well as total
257
302
  # number of textual bytes are always included in the returned Hash.
258
303
  #
@@ -262,8 +307,8 @@ module Wgit
262
307
  instance_variables.each do |var|
263
308
  # Add up the total bytes of text as well as the length.
264
309
  if var == :@text
265
- hash[:text_snippets] = @text.length
266
- hash[:text_bytes] = @text.sum(&:length)
310
+ hash[:text] = @text.length
311
+ hash[:text_bytes] = @text.sum(&:length)
267
312
  # Else take the var's #length method return value.
268
313
  else
269
314
  next unless instance_variable_get(var).respond_to?(:length)
@@ -292,25 +337,43 @@ module Wgit
292
337
  end
293
338
 
294
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
295
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
296
341
  #
297
342
  # @param xpath [String] The xpath to search the @html with.
298
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
299
344
  def xpath(xpath)
300
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
301
346
  end
302
347
 
303
- # Uses Nokogiri's css method to search the doc's html and return the
304
- # results.
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
355
+ end
356
+
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
305
359
  #
306
360
  # @param selector [String] The CSS selector to search the @html with.
307
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
308
362
  def css(selector)
309
- @doc.css(selector)
363
+ @parser.css(selector)
364
+ end
365
+
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
310
373
  end
311
374
 
312
- # Returns all internal links from this Document in relative form. Internal
313
- # meaning a link to another document on the same host.
375
+ # Returns all unique internal links from this Document in relative form.
376
+ # Internal meaning a link to another document on the same host.
314
377
  #
315
378
  # This Document's host is used to determine if an absolute URL is actually
316
379
  # a relative link e.g. For a Document representing
@@ -319,41 +382,48 @@ module Wgit
319
382
  # as an internal link because both Documents live on the same host. Also
320
383
  # see Wgit::Document#internal_absolute_links.
321
384
  #
322
- # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
385
+ # @return [Array<Wgit::Url>] Self's unique internal Urls in relative form.
323
386
  def internal_links
324
387
  return [] if @links.empty?
325
388
 
326
389
  links = @links
327
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
328
391
  .map(&:omit_base)
329
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
330
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
331
394
  end
332
395
 
333
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
334
397
  end
335
398
 
336
- # Returns all internal links from this Document in absolute form by
399
+ # Returns all unique internal links from this Document in absolute form by
337
400
  # appending them to self's #base_url. Also see
338
401
  # Wgit::Document#internal_links.
339
402
  #
340
- # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
403
+ # @return [Array<Wgit::Url>] Self's unique internal Urls in absolute form.
341
404
  def internal_absolute_links
342
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
343
406
  end
344
407
 
345
- # Returns all external links from this Document in absolute form. External
346
- # meaning a link to a different host.
408
+ # Returns all unique external links from this Document in absolute form.
409
+ # External meaning a link to a different host.
347
410
  #
348
- # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
411
+ # @return [Array<Wgit::Url>] Self's unique external Urls in absolute form.
349
412
  def external_links
350
413
  return [] if @links.empty?
351
414
 
352
415
  links = @links
353
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .map do |link|
417
+ if link.scheme_relative?
418
+ link.prefix_scheme(@url.to_scheme.to_sym)
419
+ else
420
+ link
421
+ end
422
+ end
423
+ .reject { |link| link.relative?(host: @url.to_origin) }
354
424
  .map(&:omit_trailing_slash)
355
425
 
356
- Wgit::Utils.process_arr(links)
426
+ Wgit::Utils.sanitize(links)
357
427
  end
358
428
 
359
429
  # Searches the @text for the given query and returns the results.
@@ -368,8 +438,8 @@ module Wgit
368
438
  # original sentence, whichever is less. The algorithm obviously ensures
369
439
  # that the search query is visible somewhere in the sentence.
370
440
  #
371
- # @param query [String, #to_s] The value to search the document's
372
- # @text for.
441
+ # @param query [Regexp, #to_s] The regex or text value to search the
442
+ # document's @text for.
373
443
  # @param case_sensitive [Boolean] Whether character case must match.
374
444
  # @param whole_sentence [Boolean] Whether the query is matched as one whole
375
445
  # phrase; if false, its space separated words are searched for separately.
@@ -379,12 +449,16 @@ module Wgit
379
449
  def search(
380
450
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
381
451
  )
382
- query = query.to_s
383
- raise 'A search query must be provided' if query.empty?
384
452
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
385
453
 
386
- query = query.gsub(' ', '|') unless whole_sentence
387
- regex = Regexp.new(query, !case_sensitive)
454
+ if query.is_a?(Regexp)
455
+ regex = query
456
+ else # respond_to? #to_s == true
457
+ query = query.to_s
458
+ query = query.gsub(' ', '|') unless whole_sentence
459
+ regex = Regexp.new(query, !case_sensitive)
460
+ end
461
+
388
462
  results = {}
389
463
 
390
464
  @text.each do |sentence|
@@ -411,8 +485,8 @@ module Wgit
411
485
  # functionality. The original text is returned; no other reference to it
412
486
  # is kept thereafter.
413
487
  #
414
- # @param query [String, #to_s] The value to search the document's
415
- # @text for.
488
+ # @param query [Regexp, #to_s] The regex or text value to search the
489
+ # document's @text for.
416
490
  # @param case_sensitive [Boolean] Whether character case must match.
417
491
  # @param whole_sentence [Boolean] Whether multiple words should be searched
418
492
  # for separately.
@@ -431,85 +505,95 @@ module Wgit
431
505
  orig_text
432
506
  end
433
507
 
508
+ # Extracts a value/object from this Document's @html using the given xpath
509
+ # parameter.
510
+ #
511
+ # @param xpath [String, #call] Used to find the value/object in @html.
512
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
+ # Object) : results (Array).
514
+ # @param text_content_only [Boolean] text_content_only ? result.content
515
+ # (String) : result (Nokogiri Object).
516
+ # @return [String, Object] The value found in the html or the default value
517
+ # (singleton ? nil : []).
518
+ def extract(xpath, singleton: true, text_content_only: true)
519
+ send(
520
+ :extract_from_html, xpath,
521
+ singleton: singleton, text_content_only: text_content_only
522
+ )
523
+ end
524
+
434
525
  protected
435
526
 
436
527
  # Initializes the nokogiri object using @html, which cannot be nil.
437
528
  # Override this method to custom configure the Nokogiri object returned.
438
529
  # Gets called from Wgit::Document.new upon initialization.
439
530
  #
531
+ # @yield [config] The given block is passed to Nokogiri::HTML for
532
+ # initialisation.
440
533
  # @raise [StandardError] If @html isn't set.
441
534
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
442
- def init_nokogiri
535
+ def init_nokogiri(&block)
443
536
  raise '@html must be set' unless @html
444
537
 
445
- Nokogiri::HTML(@html) do |config|
446
- # TODO: Remove #'s below when crawling in production.
447
- # config.options = Nokogiri::XML::ParseOptions::STRICT |
448
- # Nokogiri::XML::ParseOptions::NONET
449
- end
538
+ Nokogiri::HTML(@html, &block)
450
539
  end
451
540
 
452
- # Returns a value/object from this Document's @html using the given xpath
541
+ # Extracts a value/object from this Document's @html using the given xpath
453
542
  # parameter.
454
543
  #
455
- # @param xpath [String] Used to find the value/object in @html.
544
+ # @param xpath [String, #call] Used to find the value/object in @html.
456
545
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
457
546
  # Object) : results (Array).
458
547
  # @param text_content_only [Boolean] text_content_only ? result.content
459
548
  # (String) : result (Nokogiri Object).
460
- # @yield [value, source] Given the value (String/Object) before it's set as
461
- # an instance variable so that you can inspect/alter the value if
462
- # desired. Return nil from the block if you don't want to override the
463
- # value. Also given the source (Symbol) which is always :document.
549
+ # @yield The block is executed when a Wgit::Document is initialized,
550
+ # regardless of the source. Use it (optionally) to process the result
551
+ # value.
552
+ # @yieldparam value [Object] The result value to be returned.
553
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
+ # `:object`.
556
+ # @yieldreturn [Object] The return value of the block gets returned. Return
557
+ # the block's `value` param unchanged if you simply want to inspect it.
464
558
  # @return [String, Object] The value found in the html or the default value
465
559
  # (singleton ? nil : []).
466
- def find_in_html(xpath, singleton: true, text_content_only: true)
467
- default = singleton ? nil : []
468
- xpath = xpath.call if xpath.respond_to?(:call)
469
- results = @doc.xpath(xpath)
470
-
471
- return default if results.nil? || results.empty?
472
-
473
- result = if singleton
474
- text_content_only ? results.first.content : results.first
475
- else
476
- text_content_only ? results.map(&:content) : results
477
- end
560
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
561
+ xpath = xpath.call if xpath.respond_to?(:call)
562
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
478
563
 
479
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
480
-
481
- if block_given?
482
- new_result = yield(result, self, :document)
483
- result = new_result unless new_result.nil?
564
+ if text_content_only
565
+ result = singleton ? result&.content : result.map(&:content)
484
566
  end
485
567
 
568
+ Wgit::Utils.sanitize(result)
569
+ result = yield(result, self, :document) if block_given?
486
570
  result
487
571
  end
488
572
 
489
- # Returns a value from the obj using the given key via obj#fetch.
573
+ # Returns a value from the obj using the given key via `obj#fetch`.
490
574
  #
491
575
  # @param obj [#fetch] The object containing the key/value.
492
576
  # @param key [String] Used to find the value in the obj.
493
577
  # @param singleton [Boolean] True if a single value, false otherwise.
494
- # @yield [value, source] Given the value (String/Object) before it's set as
495
- # an instance variable so that you can inspect/alter the value if
496
- # desired. Return nil from the block if you don't want to override the
497
- # value. Also given the source (Symbol) which is always :object.
578
+ # @yield The block is executed when a Wgit::Document is initialized,
579
+ # regardless of the source. Use it (optionally) to process the result
580
+ # value.
581
+ # @yieldparam value [Object] The result value to be returned.
582
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
583
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
584
+ # `:object`.
585
+ # @yieldreturn [Object] The return value of the block gets returned. Return
586
+ # the block's `value` param unchanged if you simply want to inspect it.
498
587
  # @return [String, Object] The value found in the obj or the default value
499
588
  # (singleton ? nil : []).
500
- def find_in_object(obj, key, singleton: true)
589
+ def extract_from_object(obj, key, singleton: true)
501
590
  assert_respond_to(obj, :fetch)
502
591
 
503
592
  default = singleton ? nil : []
504
593
  result = obj.fetch(key.to_s, default)
505
594
 
506
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
507
-
508
- if block_given?
509
- new_result = yield(result, obj, :object)
510
- result = new_result unless new_result.nil?
511
- end
512
-
595
+ Wgit::Utils.sanitize(result)
596
+ result = yield(result, obj, :object) if block_given?
513
597
  result
514
598
  end
515
599
 
@@ -523,12 +607,12 @@ module Wgit
523
607
  url = Wgit::Url.parse(url)
524
608
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
525
609
 
526
- @url = url
527
- @html = html || ''
528
- @doc = init_nokogiri
529
- @score = 0.0
610
+ @url = url
611
+ @html = html || ''
612
+ @parser = init_nokogiri
613
+ @score = 0.0
530
614
 
531
- Wgit::Utils.process_str(@html, encode: encode)
615
+ Wgit::Utils.sanitize(@html, encode: encode)
532
616
 
533
617
  # Dynamically run the init_*_from_html methods.
534
618
  Document.private_instance_methods(false).each do |method|
@@ -544,12 +628,12 @@ module Wgit
544
628
  def init_from_object(obj, encode: true)
545
629
  assert_respond_to(obj, :fetch)
546
630
 
547
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
548
- @html = obj.fetch('html', '')
549
- @doc = init_nokogiri
550
- @score = obj.fetch('score', 0.0)
631
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
632
+ @html = obj.fetch('html', '')
633
+ @parser = init_nokogiri
634
+ @score = obj.fetch('score', 0.0)
551
635
 
552
- Wgit::Utils.process_str(@html, encode: encode)
636
+ Wgit::Utils.sanitize(@html, encode: encode)
553
637
 
554
638
  # Dynamically run the init_*_from_object methods.
555
639
  Document.private_instance_methods(false).each do |method|
@@ -560,11 +644,11 @@ module Wgit
560
644
  end
561
645
  end
562
646
 
563
- # Initialises an instance variable and defines a getter method for it.
647
+ # Initialises an instance variable and defines an accessor method for it.
564
648
  #
565
649
  # @param var [Symbol] The name of the variable to be initialized.
566
650
  # @param value [Object] The newly initialized variable's value.
567
- # @return [Symbol] The name of the newly created getter method.
651
+ # @return [Symbol] The name of the defined getter method.
568
652
  def init_var(var, value)
569
653
  # instance_var_name starts with @, var_name doesn't.
570
654
  var = var.to_s
@@ -572,10 +656,9 @@ module Wgit
572
656
  instance_var_name = "@#{var_name}".to_sym
573
657
 
574
658
  instance_variable_set(instance_var_name, value)
659
+ Wgit::Document.attr_accessor(var_name)
575
660
 
576
- Document.send(:define_method, var_name) do
577
- instance_variable_get(instance_var_name)
578
- end
661
+ var_name
579
662
  end
580
663
 
581
664
  alias content html