wgit 0.5.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,8 +14,7 @@ module Wgit
14
14
  raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
15
15
 
16
16
  model = url.to_h
17
-
18
- Wgit::Utils.remove_non_bson_types(model)
17
+ select_bson_types(model)
19
18
  end
20
19
 
21
20
  # The data model for a Wgit::Document collection object.
@@ -28,7 +27,7 @@ module Wgit
28
27
  model = doc.to_h(include_html: false, include_score: false)
29
28
  model['url'] = url(doc.url) # Expand Url String into full object.
30
29
 
31
- Wgit::Utils.remove_non_bson_types(model)
30
+ select_bson_types(model)
32
31
  end
33
32
 
34
33
  # Common fields when inserting a record into the DB.
@@ -49,5 +48,13 @@ module Wgit
49
48
  date_modified: Wgit::Utils.time_stamp
50
49
  }
51
50
  end
51
+
52
+ # Returns the model having removed non bson types (for use with MongoDB).
53
+ #
54
+ # @param model_hash [Hash] The model Hash to sanitize.
55
+ # @return [Hash] The model Hash with non bson types removed.
56
+ def self.select_bson_types(model_hash)
57
+ model_hash.select { |_k, v| v.respond_to?(:bson_type) }
58
+ end
52
59
  end
53
60
  end
@@ -3,45 +3,56 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
+ require 'set'
6
7
 
7
8
  module Wgit
8
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
9
10
  # will work e.g. images etc. Also doubles as a search result when
10
- # loading Documents from the database via Wgit::Database#search.
11
+ # loading Documents from the database via `Wgit::Database#search`.
11
12
  #
12
13
  # The initialize method dynamically initializes instance variables from the
13
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
- # Document class can be easily extended allowing you to pull out the bits of
15
- # a webpage that are important to you. See Wgit::Document.define_extension.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
16
17
  class Document
17
18
  include Assertable
18
19
 
19
- # Regex for the allowed var names when defining an extension.
20
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
21
22
 
22
- # The HTML elements that make up the visible text on a page.
23
- # These elements are used to initialize the @text of the Document.
24
- # See the README.md for how to add to this Array dynamically.
25
- @text_elements = %i[
26
- dd div dl dt figcaption figure hr li
27
- main ol p pre span ul h1 h2 h3 h4 h5
28
- ]
23
+ # Set of text elements used to build Document#text.
24
+ @text_elements = Set.new(%i[
25
+ a abbr address article aside b bdi bdo blockquote button caption cite
26
+ code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
+ h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
+ option output p pre q rb rt ruby s samp section small span strong sub
29
+ summary sup td textarea th time u ul var wbr
30
+ ])
31
+
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
29
34
 
30
35
  class << self
31
- # Class level instance reader method for @text_elements.
36
+ # Set of HTML elements that make up the visible text on a page. These
37
+ # elements are used to initialize the Wgit::Document#text. See the
38
+ # README.md for how to add to this Set dynamically.
32
39
  attr_reader :text_elements
40
+
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
33
44
  end
34
45
 
35
46
  # The URL of the webpage, an instance of Wgit::Url.
36
47
  attr_reader :url
37
48
 
38
- # The HTML of the webpage, an instance of String.
49
+ # The content/HTML of the document, an instance of String.
39
50
  attr_reader :html
40
51
 
41
52
  # The Nokogiri::HTML document object initialized from @html.
42
- attr_reader :doc
53
+ attr_reader :parser
43
54
 
44
- # The score is only used following a Database#search and records matches.
55
+ # The score is only used following a `Database#search` and records matches.
45
56
  attr_reader :score
46
57
 
47
58
  # Initialize takes either two strings (representing the URL and HTML) or an
@@ -50,29 +61,31 @@ module Wgit
50
61
  # pages retrieved from the database.
51
62
  #
52
63
  # During initialisation, the Document will call any private
53
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
54
- # README.md and Wgit::Document.define_extension method for more details.
64
+ # `init_*_from_html` and `init_*_from_object` methods it can find. See the
65
+ # Wgit::Document.define_extractor method for more details.
55
66
  #
56
- # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
67
+ # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
57
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
58
69
  # MongoDB collection object. The Object's :fetch method should support
59
70
  # Strings as keys.
60
- # @param html [String, NilClass] The crawled web page's HTML. This param is
61
- # only used if url_or_obj is a String representing the web page's URL.
62
- # Otherwise, the HTML comes from the database object. A html of nil will
63
- # be defaulted to an empty String.
64
- def initialize(url_or_obj, html = '', encode_html: true)
71
+ # @param html [String, NilClass] The crawled web page's content/HTML. This
72
+ # param is only used if url_or_obj is a String representing the web
73
+ # page's URL. Otherwise, the HTML comes from the database object. A html
74
+ # of nil will be defaulted to an empty String.
75
+ # @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
76
+ # false if the Document content is an image etc.
77
+ def initialize(url_or_obj, html = '', encode: true)
65
78
  if url_or_obj.is_a?(String)
66
- init_from_strings(url_or_obj, html, encode_html: encode_html)
79
+ init_from_strings(url_or_obj, html, encode: encode)
67
80
  else
68
- init_from_object(url_or_obj, encode_html: encode_html)
81
+ init_from_object(url_or_obj, encode: encode)
69
82
  end
70
83
  end
71
84
 
72
85
  ### Document Class Methods ###
73
86
 
74
87
  # Uses Document.text_elements to build an xpath String, used to obtain
75
- # all of the combined text on a webpage.
88
+ # all of the combined visual text on a webpage.
76
89
  #
77
90
  # @return [String] An xpath String to obtain a webpage's text elements.
78
91
  def self.text_elements_xpath
@@ -88,86 +101,101 @@ module Wgit
88
101
  xpath
89
102
  end
90
103
 
91
- # Defines an extension, which is a way to serialise HTML elements into
92
- # instance variables upon Document initialization. See the default
93
- # extensions defined in 'document_extensions.rb' as examples.
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
94
110
  #
95
- # Note that defined extensions work for both Documents initialized from
111
+ # Note that defined extractors work for both Documents initialized from
96
112
  # HTML (via Wgit::Crawler methods) and from database objects.
97
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
98
114
  # the xpath or database object result(s).
99
115
  #
100
116
  # When initialising from HTML, a singleton value of true will only
101
- # ever return one result; otherwise all xpath results are returned in an
102
- # Array. When initialising from a database object, the value is taken as
103
- # is and singleton is only used to define the default empty value.
104
- # If a value cannot be found (in either the HTML or database object), then
105
- # a default will be used. The default value is: `singleton ? nil : []`.
106
- #
107
- # @param var [Symbol] The name of the variable to be initialised.
108
- # @param xpath [String, Object#call] The xpath used to find the element(s)
117
+ # ever return the first result found; otherwise all the results are
118
+ # returned in an Array. When initialising from a database object, the value
119
+ # is taken as is and singleton is only used to define the default empty
120
+ # value. If a value cannot be found (in either the HTML or database
121
+ # object), then a default will be used. The default value is:
122
+ # `singleton ? nil : []`.
123
+ #
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and setter method is defined
126
+ # for the initialised variable.
127
+ # @param xpath [String, #call] The xpath used to find the element(s)
109
128
  # of the webpage. Only used when initializing from HTML.
110
129
  #
111
130
  # Pass a callable object (proc etc.) if you want the
112
131
  # xpath value to be derived on Document initialisation (instead of when
113
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
114
133
  # String.
115
- # @param options [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
116
135
  # options are only used when initializing from HTML, not the database.
117
- # @option options [Boolean] :singleton The singleton option determines
136
+ # @option opts [Boolean] :singleton The singleton option determines
118
137
  # whether or not the result(s) should be in an Array. If multiple
119
138
  # results are found and singleton is true then the first result will be
120
139
  # used. Defaults to true.
121
- # @option options [Boolean] :text_content_only The text_content_only option
140
+ # @option opts [Boolean] :text_content_only The text_content_only option
122
141
  # if true will use the text content of the Nokogiri result object,
123
142
  # otherwise the Nokogiri object itself is returned. Defaults to true.
124
- # @yield [value, source, type] Yields the value (Object) about to be
125
- # assigned to the new var, the source of the value (Wgit::Document or DB
126
- # Object) and the source type (Symbol of either :document or :object).
127
- #
128
- # The return value of the block becomes the new var value, unless nil.
129
- # Return nil if you want to inspect but not change the var value. The
130
- # block is executed when a Wgit::Document is initialized.
143
+ # @yield The block is executed when a Wgit::Document is initialized,
144
+ # regardless of the source. Use it (optionally) to process the result
145
+ # value.
146
+ # @yieldparam value [Object] The result value to be assigned to the new
147
+ # `var`.
148
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
149
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
150
+ # `:object`.
151
+ # @yieldreturn [Object] The return value of the block becomes the new var's
152
+ # value. Return the block's value param unchanged if you want to inspect.
131
153
  # @raise [StandardError] If the var param isn't valid.
132
- # @return [Symbol] The given var Symbol.
133
- def self.define_extension(var, xpath, options = {}, &block)
154
+ # @return [Symbol] The given var Symbol if successful.
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
134
156
  var = var.to_sym
135
- default_options = { singleton: true, text_content_only: true }
136
- options = default_options.merge(options)
157
+ defaults = { singleton: true, text_content_only: true }
158
+ opts = defaults.merge(opts)
137
159
 
138
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
139
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
140
162
 
141
163
  # Define the private init_*_from_html method for HTML.
142
164
  # Gets the HTML's xpath value and creates a var for it.
143
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
144
- result = find_in_html(xpath, options, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
145
167
  init_var(var, result)
146
168
  end
147
- Document.send :private, func_name
169
+ Document.send(:private, func_name)
148
170
 
149
171
  # Define the private init_*_from_object method for a Database object.
150
172
  # Gets the Object's 'key' value and creates a var for it.
151
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
152
- result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
153
179
  init_var(var, result)
154
180
  end
155
- Document.send :private, func_name
181
+ Document.send(:private, func_name)
156
182
 
183
+ @extractors << var
157
184
  var
158
185
  end
159
186
 
160
- # Removes the init_* methods created when an extension is defined.
161
- # Therefore, this is the opposing method to Document.define_extension.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
162
189
  # Returns true if successful or false if the method(s) cannot be found.
163
190
  #
164
- # @param var [Symbol] The extension variable already defined.
165
- # @return [Boolean] True if the extension var was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
166
193
  # otherwise false.
167
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
168
195
  Document.send(:remove_method, "init_#{var}_from_html")
169
196
  Document.send(:remove_method, "init_#{var}_from_object")
170
197
 
198
+ @extractors.delete(var.to_sym)
171
199
  true
172
200
  rescue NameError
173
201
  false
@@ -186,7 +214,7 @@ module Wgit
186
214
  (@url == other.url) && (@html == other.html)
187
215
  end
188
216
 
189
- # Is a shortcut for calling Document#html[range].
217
+ # Shortcut for calling Document#html[range].
190
218
  #
191
219
  # @param range [Range] The range of @html to return.
192
220
  # @return [String] The given range of @html.
@@ -196,9 +224,9 @@ module Wgit
196
224
 
197
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
198
226
  # <base> element's href value or @url (if @base is nil). If @base is
199
- # present and relative, then @url.to_base + @base is returned. This method
200
- # should be used instead of `doc.url.to_base` etc. when manually building
201
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
202
230
  #
203
231
  # Provide the `link:` parameter to get the correct base URL for that type
204
232
  # of link. For example, a link of `#top` would always return @url because
@@ -217,12 +245,16 @@ module Wgit
217
245
  # @return [Wgit::Url] The base URL of this Document e.g.
218
246
  # 'http://example.com/public'.
219
247
  def base_url(link: nil)
220
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
221
248
  if @url.relative? && @base.nil?
222
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
223
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
224
256
 
225
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
226
258
 
227
259
  if link
228
260
  link = Wgit::Url.new(link)
@@ -234,7 +266,7 @@ module Wgit
234
266
  end
235
267
  end
236
268
 
237
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
238
270
  base_url.omit_fragment.omit_query
239
271
  end
240
272
 
@@ -248,7 +280,7 @@ module Wgit
248
280
  def to_h(include_html: false, include_score: true)
249
281
  ignore = include_html ? [] : ['@html']
250
282
  ignore << '@score' unless include_score
251
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
252
284
 
253
285
  Wgit::Utils.to_h(self, ignore: ignore)
254
286
  end
@@ -265,7 +297,7 @@ module Wgit
265
297
 
266
298
  # Returns a Hash containing this Document's instance variables and
267
299
  # their #length (if they respond to it). Works dynamically so that any
268
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
269
301
  # the returned Hash as well. The number of text snippets as well as total
270
302
  # number of textual bytes are always included in the returned Hash.
271
303
  #
@@ -275,8 +307,8 @@ module Wgit
275
307
  instance_variables.each do |var|
276
308
  # Add up the total bytes of text as well as the length.
277
309
  if var == :@text
278
- hash[:text_snippets] = @text.length
279
- hash[:text_bytes] = @text.sum(&:length)
310
+ hash[:text] = @text.length
311
+ hash[:text_bytes] = @text.sum(&:length)
280
312
  # Else take the var's #length method return value.
281
313
  else
282
314
  next unless instance_variable_get(var).respond_to?(:length)
@@ -305,25 +337,43 @@ module Wgit
305
337
  end
306
338
 
307
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
308
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
309
341
  #
310
342
  # @param xpath [String] The xpath to search the @html with.
311
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
312
344
  def xpath(xpath)
313
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
346
+ end
347
+
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
314
355
  end
315
356
 
316
- # Uses Nokogiri's css method to search the doc's html and return the
317
- # results.
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
318
359
  #
319
360
  # @param selector [String] The CSS selector to search the @html with.
320
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
321
362
  def css(selector)
322
- @doc.css(selector)
363
+ @parser.css(selector)
323
364
  end
324
365
 
325
- # Returns all internal links from this Document in relative form. Internal
326
- # meaning a link to another document on the same host.
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
373
+ end
374
+
375
+ # Returns all unique internal links from this Document in relative form.
376
+ # Internal meaning a link to another document on the same host.
327
377
  #
328
378
  # This Document's host is used to determine if an absolute URL is actually
329
379
  # a relative link e.g. For a Document representing
@@ -332,41 +382,41 @@ module Wgit
332
382
  # as an internal link because both Documents live on the same host. Also
333
383
  # see Wgit::Document#internal_absolute_links.
334
384
  #
335
- # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
385
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
336
386
  def internal_links
337
387
  return [] if @links.empty?
338
388
 
339
389
  links = @links
340
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
341
391
  .map(&:omit_base)
342
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
343
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
344
394
  end
345
395
 
346
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
347
397
  end
348
398
 
349
- # Returns all internal links from this Document in absolute form by
399
+ # Returns all unique internal links from this Document in absolute form by
350
400
  # appending them to self's #base_url. Also see
351
401
  # Wgit::Document#internal_links.
352
402
  #
353
- # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
403
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
354
404
  def internal_absolute_links
355
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
356
406
  end
357
407
 
358
- # Returns all external links from this Document in absolute form. External
359
- # meaning a link to a different host.
408
+ # Returns all unique external links from this Document in absolute form.
409
+ # External meaning a link to a different host.
360
410
  #
361
- # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
411
+ # @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
362
412
  def external_links
363
413
  return [] if @links.empty?
364
414
 
365
415
  links = @links
366
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .reject { |link| link.relative?(host: @url.to_origin) }
367
417
  .map(&:omit_trailing_slash)
368
418
 
369
- Wgit::Utils.process_arr(links)
419
+ Wgit::Utils.sanitize(links)
370
420
  end
371
421
 
372
422
  # Searches the @text for the given query and returns the results.
@@ -381,8 +431,8 @@ module Wgit
381
431
  # original sentence, which ever is less. The algorithm obviously ensures
382
432
  # that the search query is visible somewhere in the sentence.
383
433
  #
384
- # @param query [String, Object#to_s] The value to search the document's
385
- # @text for.
434
+ # @param query [Regexp, #to_s] The regex or text value to search the
435
+ # document's @text for.
386
436
  # @param case_sensitive [Boolean] Whether character case must match.
387
437
  # @param whole_sentence [Boolean] Whether multiple words should be searched
388
438
  # for separately.
@@ -390,21 +440,27 @@ module Wgit
390
440
  # sentence.
391
441
  # @return [Array<String>] A subset of @text, matching the query.
392
442
  def search(
393
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
443
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
394
444
  )
395
- query = query.to_s
396
- raise 'A search query must be provided' if query.empty?
397
445
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
398
446
 
399
- query = query.gsub(' ', '|') unless whole_sentence
400
- regex = Regexp.new(query, !case_sensitive)
447
+ if query.is_a?(Regexp)
448
+ regex = query
449
+ else # respond_to? #to_s == true
450
+ query = query.to_s
451
+ query = query.gsub(' ', '|') unless whole_sentence
452
+ regex = Regexp.new(query, !case_sensitive)
453
+ end
454
+
401
455
  results = {}
402
456
 
403
457
  @text.each do |sentence|
458
+ sentence = sentence.strip
459
+ next if results[sentence]
460
+
404
461
  hits = sentence.scan(regex).count
405
462
  next unless hits.positive?
406
463
 
407
- sentence.strip!
408
464
  index = sentence.index(regex) # Index of first match.
409
465
  Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
410
466
 
@@ -422,8 +478,8 @@ module Wgit
422
478
  # functionality. The original text is returned; no other reference to it
423
479
  # is kept thereafter.
424
480
  #
425
- # @param query [String, Object#to_s] The value to search the document's
426
- # @text for.
481
+ # @param query [Regexp, #to_s] The regex or text value to search the
482
+ # document's @text for.
427
483
  # @param case_sensitive [Boolean] Whether character case must match.
428
484
  # @param whole_sentence [Boolean] Whether multiple words should be searched
429
485
  # for separately.
@@ -431,7 +487,7 @@ module Wgit
431
487
  # sentence.
432
488
  # @return [String] This Document's original @text value.
433
489
  def search!(
434
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
490
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
435
491
  )
436
492
  orig_text = @text
437
493
  @text = search(
@@ -442,104 +498,114 @@ module Wgit
442
498
  orig_text
443
499
  end
444
500
 
501
+ # Extracts a value/object from this Document's @html using the given xpath
502
+ # parameter.
503
+ #
504
+ # @param xpath [String, #call] Used to find the value/object in @html.
505
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
506
+ # Object) : results (Array).
507
+ # @param text_content_only [Boolean] text_content_only ? result.content
508
+ # (String) : result (Nokogiri Object).
509
+ # @return [String, Object] The value found in the html or the default value
510
+ # (singleton ? nil : []).
511
+ def extract(xpath, singleton: true, text_content_only: true)
512
+ send(
513
+ :extract_from_html, xpath,
514
+ singleton: singleton, text_content_only: text_content_only
515
+ )
516
+ end
517
+
445
518
  protected
446
519
 
447
520
  # Initializes the nokogiri object using @html, which cannot be nil.
448
521
  # Override this method to custom configure the Nokogiri object returned.
449
522
  # Gets called from Wgit::Document.new upon initialization.
450
523
  #
524
+ # @yield [config] The given block is passed to Nokogiri::HTML for
525
+ # initialisation.
451
526
  # @raise [StandardError] If @html isn't set.
452
527
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
453
- def init_nokogiri
528
+ def init_nokogiri(&block)
454
529
  raise '@html must be set' unless @html
455
530
 
456
- Nokogiri::HTML(@html) do |config|
457
- # TODO: Remove #'s below when crawling in production.
458
- # config.options = Nokogiri::XML::ParseOptions::STRICT |
459
- # Nokogiri::XML::ParseOptions::NONET
460
- end
531
+ Nokogiri::HTML(@html, &block)
461
532
  end
462
533
 
463
- # Returns a value/object from this Document's @html using the given xpath
534
+ # Extracts a value/object from this Document's @html using the given xpath
464
535
  # parameter.
465
536
  #
466
- # @param xpath [String] Used to find the value/object in @html.
537
+ # @param xpath [String, #call] Used to find the value/object in @html.
467
538
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
468
539
  # Object) : results (Array).
469
540
  # @param text_content_only [Boolean] text_content_only ? result.content
470
541
  # (String) : result (Nokogiri Object).
471
- # @yield [value, source] Given the value (String/Object) before it's set as
472
- # an instance variable so that you can inspect/alter the value if
473
- # desired. Return nil from the block if you don't want to override the
474
- # value. Also given the source (Symbol) which is always :document.
542
+ # @yield The block is executed when a Wgit::Document is initialized,
543
+ # regardless of the source. Use it (optionally) to process the result
544
+ # value.
545
+ # @yieldparam value [Object] The result value to be returned.
546
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
547
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
548
+ # `:object`.
549
+ # @yieldreturn [Object] The return value of the block gets returned. Return
550
+ # the block's `value` param unchanged if you simply want to inspect it.
475
551
  # @return [String, Object] The value found in the html or the default value
476
552
  # (singleton ? nil : []).
477
- def find_in_html(xpath, singleton: true, text_content_only: true)
478
- default = singleton ? nil : []
479
- xpath = xpath.call if xpath.respond_to?(:call)
480
- results = @doc.xpath(xpath)
481
-
482
- return default if results.nil? || results.empty?
483
-
484
- result = if singleton
485
- text_content_only ? results.first.content : results.first
486
- else
487
- text_content_only ? results.map(&:content) : results
488
- end
489
-
490
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
553
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
554
+ xpath = xpath.call if xpath.respond_to?(:call)
555
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
491
556
 
492
- if block_given?
493
- new_result = yield(result, self, :document)
494
- result = new_result unless new_result.nil?
557
+ if text_content_only
558
+ result = singleton ? result&.content : result.map(&:content)
495
559
  end
496
560
 
561
+ Wgit::Utils.sanitize(result)
562
+ result = yield(result, self, :document) if block_given?
497
563
  result
498
564
  end
499
565
 
500
- # Returns a value from the obj using the given key via obj#fetch.
566
+ # Returns a value from the obj using the given key via `obj#fetch`.
501
567
  #
502
- # @param obj [Object#fetch] The object containing the key/value.
568
+ # @param obj [#fetch] The object containing the key/value.
503
569
  # @param key [String] Used to find the value in the obj.
504
570
  # @param singleton [Boolean] True if a single value, false otherwise.
505
- # @yield [value, source] Given the value (String/Object) before it's set as
506
- # an instance variable so that you can inspect/alter the value if
507
- # desired. Return nil from the block if you don't want to override the
508
- # value. Also given the source (Symbol) which is always :object.
571
+ # @yield The block is executed when a Wgit::Document is initialized,
572
+ # regardless of the source. Use it (optionally) to process the result
573
+ # value.
574
+ # @yieldparam value [Object] The result value to be returned.
575
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
576
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
577
+ # `:object`.
578
+ # @yieldreturn [Object] The return value of the block gets returned. Return
579
+ # the block's `value` param unchanged if you simply want to inspect it.
509
580
  # @return [String, Object] The value found in the obj or the default value
510
581
  # (singleton ? nil : []).
511
- def find_in_object(obj, key, singleton: true)
582
+ def extract_from_object(obj, key, singleton: true)
512
583
  assert_respond_to(obj, :fetch)
513
584
 
514
585
  default = singleton ? nil : []
515
586
  result = obj.fetch(key.to_s, default)
516
587
 
517
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
518
-
519
- if block_given?
520
- new_result = yield(result, obj, :object)
521
- result = new_result unless new_result.nil?
522
- end
523
-
588
+ Wgit::Utils.sanitize(result)
589
+ result = yield(result, obj, :object) if block_given?
524
590
  result
525
591
  end
526
592
 
527
593
  private
528
594
 
529
595
  # Initialise the Document from URL and HTML Strings.
530
- def init_from_strings(url, html, encode_html: true)
596
+ def init_from_strings(url, html, encode: true)
531
597
  assert_types(html, [String, NilClass])
532
598
 
533
599
  # We already know url.is_a?(String) so parse into Url unless already so.
534
600
  url = Wgit::Url.parse(url)
535
601
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
536
602
 
537
- @url = url
538
- @html = html || ''
539
- @doc = init_nokogiri
540
- @score = 0.0
603
+ @url = url
604
+ @html = html || ''
605
+ @parser = init_nokogiri
606
+ @score = 0.0
541
607
 
542
- Wgit::Utils.process_str(@html, encode: encode_html)
608
+ Wgit::Utils.sanitize(@html, encode: encode)
543
609
 
544
610
  # Dynamically run the init_*_from_html methods.
545
611
  Document.private_instance_methods(false).each do |method|
@@ -552,15 +618,15 @@ module Wgit
552
618
 
553
619
  # Initialise the Document from a Hash like Object containing Strings as
554
620
  # keys e.g. database collection object or Hash.
555
- def init_from_object(obj, encode_html: true)
621
+ def init_from_object(obj, encode: true)
556
622
  assert_respond_to(obj, :fetch)
557
623
 
558
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
559
- @html = obj.fetch('html', '')
560
- @doc = init_nokogiri
561
- @score = obj.fetch('score', 0.0)
624
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
625
+ @html = obj.fetch('html', '')
626
+ @parser = init_nokogiri
627
+ @score = obj.fetch('score', 0.0)
562
628
 
563
- Wgit::Utils.process_str(@html, encode: encode_html)
629
+ Wgit::Utils.sanitize(@html, encode: encode)
564
630
 
565
631
  # Dynamically run the init_*_from_object methods.
566
632
  Document.private_instance_methods(false).each do |method|
@@ -571,11 +637,11 @@ module Wgit
571
637
  end
572
638
  end
573
639
 
574
- # Initialises an instance variable and defines a getter method for it.
640
+ # Initialises an instance variable and defines an accessor method for it.
575
641
  #
576
642
  # @param var [Symbol] The name of the variable to be initialized.
577
643
  # @param value [Object] The newly initialized variable's value.
578
- # @return [Symbol] The name of the newly created getter method.
644
+ # @return [Symbol] The name of the defined getter method.
579
645
  def init_var(var, value)
580
646
  # instance_var_name starts with @, var_name doesn't.
581
647
  var = var.to_s
@@ -583,10 +649,9 @@ module Wgit
583
649
  instance_var_name = "@#{var_name}".to_sym
584
650
 
585
651
  instance_variable_set(instance_var_name, value)
652
+ Wgit::Document.attr_accessor(var_name)
586
653
 
587
- Document.send(:define_method, var_name) do
588
- instance_variable_get(instance_var_name)
589
- end
654
+ var_name
590
655
  end
591
656
 
592
657
  alias content html