wgit 0.7.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/document.rb CHANGED
@@ -6,29 +6,41 @@ require 'json'
6
6
  require 'set'
7
7
 
8
8
  module Wgit
9
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
10
10
  # will work e.g. images etc. Also doubles as a search result when
11
11
  # loading Documents from the database via `Wgit::Database#search`.
12
12
  #
13
13
  # The initialize method dynamically initializes instance variables from the
14
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
15
- # Document class can be easily extended allowing you to pull out the bits of
16
- # a webpage that are important to you. See `Wgit::Document.define_extension`.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
17
17
  class Document
18
18
  include Assertable
19
19
 
20
- # Regex for the allowed var names when defining an extension.
21
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
22
22
 
23
- # The xpath used to extract the visible text on a page.
24
- TEXT_ELEMENTS_XPATH = '//*/text()'.freeze
23
+ # Set of text elements used to build Document#text.
24
+ @text_elements = Set.new(%i[
25
+ a abbr address article aside b bdi bdo blockquote button caption cite
26
+ code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
+ h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
+ option output p pre q rb rt ruby s samp section small span strong sub
29
+ summary sup td textarea th time u ul var wbr
30
+ ])
25
31
 
26
- # Set of Symbols representing the defined Document extensions.
27
- @extensions = Set.new
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
28
34
 
29
35
  class << self
30
- # Class level attr_reader for the Document defined extensions.
31
- attr_reader :extensions
36
+ # Set of HTML elements that make up the visible text on a page. These
37
+ # elements are used to initialize the Wgit::Document#text. See the
38
+ # README.md for how to add to this Set dynamically.
39
+ attr_reader :text_elements
40
+
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
32
44
  end
33
45
 
34
46
  # The URL of the webpage, an instance of Wgit::Url.
@@ -38,7 +50,7 @@ module Wgit
38
50
  attr_reader :html
39
51
 
40
52
  # The Nokogiri::HTML document object initialized from @html.
41
- attr_reader :doc
53
+ attr_reader :parser
42
54
 
43
55
  # The score is only used following a `Database#search` and records matches.
44
56
  attr_reader :score
@@ -50,7 +62,7 @@ module Wgit
50
62
  #
51
63
  # During initialisation, the Document will call any private
52
64
  # `init_*_from_html` and `init_*_from_object` methods it can find. See the
53
- # README.md and Wgit::Document.define_extension method for more details.
65
+ # Wgit::Document.define_extractor method for more details.
54
66
  #
55
67
  # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
56
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
@@ -72,31 +84,54 @@ module Wgit
72
84
 
73
85
  ### Document Class Methods ###
74
86
 
75
- # Defines an extension, which is a way to serialise HTML elements into
76
- # instance variables upon Document initialization. See the default
77
- # extensions defined in 'document_extensions.rb' as examples.
87
+ # Uses Document.text_elements to build an xpath String, used to obtain
88
+ # all of the combined visual text on a webpage.
78
89
  #
79
- # Note that defined extensions work for both Documents initialized from
90
+ # @return [String] An xpath String to obtain a webpage's text elements.
91
+ def self.text_elements_xpath
92
+ xpath = ''
93
+ return xpath if Wgit::Document.text_elements.empty?
94
+
95
+ el_xpath = '//%s/text()'
96
+ Wgit::Document.text_elements.each_with_index do |el, i|
97
+ xpath += ' | ' unless i.zero?
98
+ xpath += format(el_xpath, el)
99
+ end
100
+
101
+ xpath
102
+ end
103
+
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
110
+ #
111
+ # Note that defined extractors work for both Documents initialized from
80
112
  # HTML (via Wgit::Crawler methods) and from database objects.
81
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
82
114
  # the xpath or database object result(s).
83
115
  #
84
116
  # When initialising from HTML, a singleton value of true will only
85
- # ever return one result; otherwise all xpath results are returned in an
86
- # Array. When initialising from a database object, the value is taken as
87
- # is and singleton is only used to define the default empty value.
88
- # If a value cannot be found (in either the HTML or database object), then
89
- # a default will be used. The default value is: `singleton ? nil : []`.
90
- #
91
- # @param var [Symbol] The name of the variable to be initialised.
117
+ # ever return the first result found; otherwise all the results are
118
+ # returned in an Array. When initialising from a database object, the value
119
+ # is taken as is and singleton is only used to define the default empty
120
+ # value. If a value cannot be found (in either the HTML or database
121
+ # object), then a default will be used. The default value is:
122
+ # `singleton ? nil : []`.
123
+ #
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and a setter method are defined
126
+ # for the initialised variable.
92
127
  # @param xpath [String, #call] The xpath used to find the element(s)
93
128
  # of the webpage. Only used when initializing from HTML.
94
129
  #
95
130
  # Pass a callable object (proc etc.) if you want the
96
131
  # xpath value to be derived on Document initialisation (instead of when
97
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
98
133
  # String.
99
- # @param opts [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
100
135
  # options are only used when initializing from HTML, not the database.
101
136
  # @option opts [Boolean] :singleton The singleton option determines
102
137
  # whether or not the result(s) should be in an Array. If multiple
@@ -105,56 +140,62 @@ module Wgit
105
140
  # @option opts [Boolean] :text_content_only The text_content_only option
106
141
  # if true will use the text content of the Nokogiri result object,
107
142
  # otherwise the Nokogiri object itself is returned. Defaults to true.
108
- # @yieldparam value [Object] The value to be assigned to the new var.
109
- # @yieldparam source [Wgit::Document, Object] The source of the value.
110
- # @yieldparam type [Symbol] The source type, either :document or (DB)
111
- # :object.
112
- # @yieldreturn [Object] The return value of the block becomes the new var
113
- # value, unless nil. Return nil if you want to inspect but not change the
114
- # var value. The block is executed when a Wgit::Document is initialized,
115
- # regardless of the source.
143
+ # @yield The block is executed when a Wgit::Document is initialized,
144
+ # regardless of the source. Use it (optionally) to process the result
145
+ # value.
146
+ # @yieldparam value [Object] The result value to be assigned to the new
147
+ # `var`.
148
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
149
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
150
+ # `:object`.
151
+ # @yieldreturn [Object] The return value of the block becomes the new var's
152
+ # value. Return the block's value param unchanged if you only want to inspect it.
116
153
  # @raise [StandardError] If the var param isn't valid.
117
154
  # @return [Symbol] The given var Symbol if successful.
118
- def self.define_extension(var, xpath, opts = {}, &block)
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
119
156
  var = var.to_sym
120
157
  defaults = { singleton: true, text_content_only: true }
121
158
  opts = defaults.merge(opts)
122
159
 
123
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
124
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
125
162
 
126
163
  # Define the private init_*_from_html method for HTML.
127
164
  # Gets the HTML's xpath value and creates a var for it.
128
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
129
- result = find_in_html(xpath, opts, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
130
167
  init_var(var, result)
131
168
  end
132
169
  Document.send(:private, func_name)
133
170
 
134
171
  # Define the private init_*_from_object method for a Database object.
135
172
  # Gets the Object's 'key' value and creates a var for it.
136
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
137
- result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
138
179
  init_var(var, result)
139
180
  end
140
181
  Document.send(:private, func_name)
141
182
 
142
- @extensions << var
183
+ @extractors << var
143
184
  var
144
185
  end
145
186
 
146
- # Removes the init_* methods created when an extension is defined.
147
- # Therefore, this is the opposing method to Document.define_extension.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
148
189
  # Returns true if successful or false if the method(s) cannot be found.
149
190
  #
150
- # @param var [Symbol] The extension variable already defined.
151
- # @return [Boolean] True if the extension var was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
152
193
  # otherwise false.
153
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
154
195
  Document.send(:remove_method, "init_#{var}_from_html")
155
196
  Document.send(:remove_method, "init_#{var}_from_object")
156
197
 
157
- @extensions.delete(var.to_sym)
198
+ @extractors.delete(var.to_sym)
158
199
  true
159
200
  rescue NameError
160
201
  false
@@ -173,7 +214,7 @@ module Wgit
173
214
  (@url == other.url) && (@html == other.html)
174
215
  end
175
216
 
176
- # Is a shortcut for calling Document#html[range].
217
+ # Shortcut for calling Document#html[range].
177
218
  #
178
219
  # @param range [Range] The range of @html to return.
179
220
  # @return [String] The given range of @html.
@@ -183,9 +224,9 @@ module Wgit
183
224
 
184
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
185
226
  # <base> element's href value or @url (if @base is nil). If @base is
186
- # present and relative, then @url.to_base + @base is returned. This method
187
- # should be used instead of `doc.url.to_base` etc. when manually building
188
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
189
230
  #
190
231
  # Provide the `link:` parameter to get the correct base URL for that type
191
232
  # of link. For example, a link of `#top` would always return @url because
@@ -204,12 +245,16 @@ module Wgit
204
245
  # @return [Wgit::Url] The base URL of this Document e.g.
205
246
  # 'http://example.com/public'.
206
247
  def base_url(link: nil)
207
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
208
248
  if @url.relative? && @base.nil?
209
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
210
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
211
256
 
212
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
213
258
 
214
259
  if link
215
260
  link = Wgit::Url.new(link)
@@ -221,7 +266,7 @@ module Wgit
221
266
  end
222
267
  end
223
268
 
224
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
225
270
  base_url.omit_fragment.omit_query
226
271
  end
227
272
 
@@ -235,7 +280,7 @@ module Wgit
235
280
  def to_h(include_html: false, include_score: true)
236
281
  ignore = include_html ? [] : ['@html']
237
282
  ignore << '@score' unless include_score
238
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
239
284
 
240
285
  Wgit::Utils.to_h(self, ignore: ignore)
241
286
  end
@@ -252,7 +297,7 @@ module Wgit
252
297
 
253
298
  # Returns a Hash containing this Document's instance variables and
254
299
  # their #length (if they respond to it). Works dynamically so that any
255
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
256
301
  # the returned Hash as well. The number of text snippets as well as total
257
302
  # number of textual bytes are always included in the returned Hash.
258
303
  #
@@ -262,8 +307,8 @@ module Wgit
262
307
  instance_variables.each do |var|
263
308
  # Add up the total bytes of text as well as the length.
264
309
  if var == :@text
265
- hash[:text_snippets] = @text.length
266
- hash[:text_bytes] = @text.sum(&:length)
310
+ hash[:text] = @text.length
311
+ hash[:text_bytes] = @text.sum(&:length)
267
312
  # Else take the var's #length method return value.
268
313
  else
269
314
  next unless instance_variable_get(var).respond_to?(:length)
@@ -292,25 +337,43 @@ module Wgit
292
337
  end
293
338
 
294
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
295
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
296
341
  #
297
342
  # @param xpath [String] The xpath to search the @html with.
298
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
299
344
  def xpath(xpath)
300
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
301
346
  end
302
347
 
303
- # Uses Nokogiri's css method to search the doc's html and return the
304
- # results.
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
355
+ end
356
+
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
305
359
  #
306
360
  # @param selector [String] The CSS selector to search the @html with.
307
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
308
362
  def css(selector)
309
- @doc.css(selector)
363
+ @parser.css(selector)
364
+ end
365
+
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
310
373
  end
311
374
 
312
- # Returns all internal links from this Document in relative form. Internal
313
- # meaning a link to another document on the same host.
375
+ # Returns all unique internal links from this Document in relative form.
376
+ # Internal meaning a link to another document on the same host.
314
377
  #
315
378
  # This Document's host is used to determine if an absolute URL is actually
316
379
  # a relative link e.g. For a Document representing
@@ -319,41 +382,48 @@ module Wgit
319
382
  # as an internal link because both Documents live on the same host. Also
320
383
  # see Wgit::Document#internal_absolute_links.
321
384
  #
322
- # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
385
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
323
386
  def internal_links
324
387
  return [] if @links.empty?
325
388
 
326
389
  links = @links
327
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
328
391
  .map(&:omit_base)
329
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
330
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
331
394
  end
332
395
 
333
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
334
397
  end
335
398
 
336
- # Returns all internal links from this Document in absolute form by
399
+ # Returns all unique internal links from this Document in absolute form by
337
400
  # appending them to self's #base_url. Also see
338
401
  # Wgit::Document#internal_links.
339
402
  #
340
- # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
403
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
341
404
  def internal_absolute_links
342
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
343
406
  end
344
407
 
345
- # Returns all external links from this Document in absolute form. External
346
- # meaning a link to a different host.
408
+ # Returns all unique external links from this Document in absolute form.
409
+ # External meaning a link to a different host.
347
410
  #
348
- # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
411
+ # @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
349
412
  def external_links
350
413
  return [] if @links.empty?
351
414
 
352
415
  links = @links
353
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .map do |link|
417
+ if link.scheme_relative?
418
+ link.prefix_scheme(@url.to_scheme.to_sym)
419
+ else
420
+ link
421
+ end
422
+ end
423
+ .reject { |link| link.relative?(host: @url.to_origin) }
354
424
  .map(&:omit_trailing_slash)
355
425
 
356
- Wgit::Utils.process_arr(links)
426
+ Wgit::Utils.sanitize(links)
357
427
  end
358
428
 
359
429
  # Searches the @text for the given query and returns the results.
@@ -368,8 +438,8 @@ module Wgit
368
438
  # original sentence, whichever is less. The algorithm obviously ensures
369
439
  # that the search query is visible somewhere in the sentence.
370
440
  #
371
- # @param query [String, #to_s] The value to search the document's
372
- # @text for.
441
+ # @param query [Regexp, #to_s] The regex or text value to search the
442
+ # document's @text for.
373
443
  # @param case_sensitive [Boolean] Whether character case must match.
374
444
  # @param whole_sentence [Boolean] Whether multiple words should be searched
375
445
  # for separately.
@@ -379,12 +449,16 @@ module Wgit
379
449
  def search(
380
450
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
381
451
  )
382
- query = query.to_s
383
- raise 'A search query must be provided' if query.empty?
384
452
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
385
453
 
386
- query = query.gsub(' ', '|') unless whole_sentence
387
- regex = Regexp.new(query, !case_sensitive)
454
+ if query.is_a?(Regexp)
455
+ regex = query
456
+ else # respond_to? #to_s == true
457
+ query = query.to_s
458
+ query = query.gsub(' ', '|') unless whole_sentence
459
+ regex = Regexp.new(query, !case_sensitive)
460
+ end
461
+
388
462
  results = {}
389
463
 
390
464
  @text.each do |sentence|
@@ -411,8 +485,8 @@ module Wgit
411
485
  # functionality. The original text is returned; no other reference to it
412
486
  # is kept thereafter.
413
487
  #
414
- # @param query [String, #to_s] The value to search the document's
415
- # @text for.
488
+ # @param query [Regexp, #to_s] The regex or text value to search the
489
+ # document's @text for.
416
490
  # @param case_sensitive [Boolean] Whether character case must match.
417
491
  # @param whole_sentence [Boolean] Whether multiple words should be searched
418
492
  # for separately.
@@ -431,85 +505,95 @@ module Wgit
431
505
  orig_text
432
506
  end
433
507
 
508
+ # Extracts a value/object from this Document's @html using the given xpath
509
+ # parameter.
510
+ #
511
+ # @param xpath [String, #call] Used to find the value/object in @html.
512
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
+ # Object) : results (Array).
514
+ # @param text_content_only [Boolean] text_content_only ? result.content
515
+ # (String) : result (Nokogiri Object).
516
+ # @return [String, Object] The value found in the html or the default value
517
+ # (singleton ? nil : []).
518
+ def extract(xpath, singleton: true, text_content_only: true)
519
+ send(
520
+ :extract_from_html, xpath,
521
+ singleton: singleton, text_content_only: text_content_only
522
+ )
523
+ end
524
+
434
525
  protected
435
526
 
436
527
  # Initializes the nokogiri object using @html, which cannot be nil.
437
528
  # Override this method to custom configure the Nokogiri object returned.
438
529
  # Gets called from Wgit::Document.new upon initialization.
439
530
  #
531
+ # @yield [config] The given block is passed to Nokogiri::HTML for
532
+ # initialisation.
440
533
  # @raise [StandardError] If @html isn't set.
441
534
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
442
- def init_nokogiri
535
+ def init_nokogiri(&block)
443
536
  raise '@html must be set' unless @html
444
537
 
445
- Nokogiri::HTML(@html) do |config|
446
- # TODO: Remove #'s below when crawling in production.
447
- # config.options = Nokogiri::XML::ParseOptions::STRICT |
448
- # Nokogiri::XML::ParseOptions::NONET
449
- end
538
+ Nokogiri::HTML(@html, &block)
450
539
  end
451
540
 
452
- # Returns a value/object from this Document's @html using the given xpath
541
+ # Extracts a value/object from this Document's @html using the given xpath
453
542
  # parameter.
454
543
  #
455
- # @param xpath [String] Used to find the value/object in @html.
544
+ # @param xpath [String, #call] Used to find the value/object in @html.
456
545
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
457
546
  # Object) : results (Array).
458
547
  # @param text_content_only [Boolean] text_content_only ? result.content
459
548
  # (String) : result (Nokogiri Object).
460
- # @yield [value, source] Given the value (String/Object) before it's set as
461
- # an instance variable so that you can inspect/alter the value if
462
- # desired. Return nil from the block if you don't want to override the
463
- # value. Also given the source (Symbol) which is always :document.
549
+ # @yield The block is executed when a Wgit::Document is initialized,
550
+ # regardless of the source. Use it (optionally) to process the result
551
+ # value.
552
+ # @yieldparam value [Object] The result value to be returned.
553
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
+ # `:object`.
556
+ # @yieldreturn [Object] The return value of the block gets returned. Return
557
+ # the block's `value` param unchanged if you simply want to inspect it.
464
558
  # @return [String, Object] The value found in the html or the default value
465
559
  # (singleton ? nil : []).
466
- def find_in_html(xpath, singleton: true, text_content_only: true)
467
- default = singleton ? nil : []
468
- xpath = xpath.call if xpath.respond_to?(:call)
469
- results = @doc.xpath(xpath)
470
-
471
- return default if results.nil? || results.empty?
472
-
473
- result = if singleton
474
- text_content_only ? results.first.content : results.first
475
- else
476
- text_content_only ? results.map(&:content) : results
477
- end
560
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
561
+ xpath = xpath.call if xpath.respond_to?(:call)
562
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
478
563
 
479
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
480
-
481
- if block_given?
482
- new_result = yield(result, self, :document)
483
- result = new_result unless new_result.nil?
564
+ if text_content_only
565
+ result = singleton ? result&.content : result.map(&:content)
484
566
  end
485
567
 
568
+ Wgit::Utils.sanitize(result)
569
+ result = yield(result, self, :document) if block_given?
486
570
  result
487
571
  end
488
572
 
489
- # Returns a value from the obj using the given key via obj#fetch.
573
+ # Returns a value from the obj using the given key via `obj#fetch`.
490
574
  #
491
575
  # @param obj [#fetch] The object containing the key/value.
492
576
  # @param key [String] Used to find the value in the obj.
493
577
  # @param singleton [Boolean] True if a single value, false otherwise.
494
- # @yield [value, source] Given the value (String/Object) before it's set as
495
- # an instance variable so that you can inspect/alter the value if
496
- # desired. Return nil from the block if you don't want to override the
497
- # value. Also given the source (Symbol) which is always :object.
578
+ # @yield The block is executed when a Wgit::Document is initialized,
579
+ # regardless of the source. Use it (optionally) to process the result
580
+ # value.
581
+ # @yieldparam value [Object] The result value to be returned.
582
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
583
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
584
+ # `:object`.
585
+ # @yieldreturn [Object] The return value of the block gets returned. Return
586
+ # the block's `value` param unchanged if you simply want to inspect it.
498
587
  # @return [String, Object] The value found in the obj or the default value
499
588
  # (singleton ? nil : []).
500
- def find_in_object(obj, key, singleton: true)
589
+ def extract_from_object(obj, key, singleton: true)
501
590
  assert_respond_to(obj, :fetch)
502
591
 
503
592
  default = singleton ? nil : []
504
593
  result = obj.fetch(key.to_s, default)
505
594
 
506
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
507
-
508
- if block_given?
509
- new_result = yield(result, obj, :object)
510
- result = new_result unless new_result.nil?
511
- end
512
-
595
+ Wgit::Utils.sanitize(result)
596
+ result = yield(result, obj, :object) if block_given?
513
597
  result
514
598
  end
515
599
 
@@ -523,12 +607,12 @@ module Wgit
523
607
  url = Wgit::Url.parse(url)
524
608
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
525
609
 
526
- @url = url
527
- @html = html || ''
528
- @doc = init_nokogiri
529
- @score = 0.0
610
+ @url = url
611
+ @html = html || ''
612
+ @parser = init_nokogiri
613
+ @score = 0.0
530
614
 
531
- Wgit::Utils.process_str(@html, encode: encode)
615
+ Wgit::Utils.sanitize(@html, encode: encode)
532
616
 
533
617
  # Dynamically run the init_*_from_html methods.
534
618
  Document.private_instance_methods(false).each do |method|
@@ -544,12 +628,12 @@ module Wgit
544
628
  def init_from_object(obj, encode: true)
545
629
  assert_respond_to(obj, :fetch)
546
630
 
547
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
548
- @html = obj.fetch('html', '')
549
- @doc = init_nokogiri
550
- @score = obj.fetch('score', 0.0)
631
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
632
+ @html = obj.fetch('html', '')
633
+ @parser = init_nokogiri
634
+ @score = obj.fetch('score', 0.0)
551
635
 
552
- Wgit::Utils.process_str(@html, encode: encode)
636
+ Wgit::Utils.sanitize(@html, encode: encode)
553
637
 
554
638
  # Dynamically run the init_*_from_object methods.
555
639
  Document.private_instance_methods(false).each do |method|
@@ -560,11 +644,11 @@ module Wgit
560
644
  end
561
645
  end
562
646
 
563
- # Initialises an instance variable and defines a getter method for it.
647
+ # Initialises an instance variable and defines an accessor method for it.
564
648
  #
565
649
  # @param var [Symbol] The name of the variable to be initialized.
566
650
  # @param value [Object] The newly initialized variable's value.
567
- # @return [Symbol] The name of the newly created getter method.
651
+ # @return [Symbol] The name of the defined getter method.
568
652
  def init_var(var, value)
569
653
  # instance_var_name starts with @, var_name doesn't.
570
654
  var = var.to_s
@@ -572,10 +656,9 @@ module Wgit
572
656
  instance_var_name = "@#{var_name}".to_sym
573
657
 
574
658
  instance_variable_set(instance_var_name, value)
659
+ Wgit::Document.attr_accessor(var_name)
575
660
 
576
- Document.send(:define_method, var_name) do
577
- instance_variable_get(instance_var_name)
578
- end
661
+ var_name
579
662
  end
580
663
 
581
664
  alias content html