wgit 0.5.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,8 +14,7 @@ module Wgit
14
14
  raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
15
15
 
16
16
  model = url.to_h
17
-
18
- Wgit::Utils.remove_non_bson_types(model)
17
+ select_bson_types(model)
19
18
  end
20
19
 
21
20
  # The data model for a Wgit::Document collection object.
@@ -28,7 +27,7 @@ module Wgit
28
27
  model = doc.to_h(include_html: false, include_score: false)
29
28
  model['url'] = url(doc.url) # Expand Url String into full object.
30
29
 
31
- Wgit::Utils.remove_non_bson_types(model)
30
+ select_bson_types(model)
32
31
  end
33
32
 
34
33
  # Common fields when inserting a record into the DB.
@@ -49,5 +48,13 @@ module Wgit
49
48
  date_modified: Wgit::Utils.time_stamp
50
49
  }
51
50
  end
51
+
52
+ # Returns the model having removed non bson types (for use with MongoDB).
53
+ #
54
+ # @param model_hash [Hash] The model Hash to sanitize.
55
+ # @return [Hash] The model Hash with non bson types removed.
56
+ def self.select_bson_types(model_hash)
57
+ model_hash.select { |_k, v| v.respond_to?(:bson_type) }
58
+ end
52
59
  end
53
60
  end
data/lib/wgit/document.rb CHANGED
@@ -3,45 +3,56 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
+ require 'set'
6
7
 
7
8
  module Wgit
8
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
9
10
  # will work e.g. images etc. Also doubles as a search result when
10
- # loading Documents from the database via Wgit::Database#search.
11
+ # loading Documents from the database via `Wgit::Database#search`.
11
12
  #
12
13
  # The initialize method dynamically initializes instance variables from the
13
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
- # Document class can be easily extended allowing you to pull out the bits of
15
- # a webpage that are important to you. See Wgit::Document.define_extension.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
16
17
  class Document
17
18
  include Assertable
18
19
 
19
- # Regex for the allowed var names when defining an extension.
20
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
21
22
 
22
- # The HTML elements that make up the visible text on a page.
23
- # These elements are used to initialize the @text of the Document.
24
- # See the README.md for how to add to this Array dynamically.
25
- @text_elements = %i[
26
- dd div dl dt figcaption figure hr li
27
- main ol p pre span ul h1 h2 h3 h4 h5
28
- ]
23
+ # Set of text elements used to build Document#text.
24
+ @text_elements = Set.new(%i[
25
+ a abbr address article aside b bdi bdo blockquote button caption cite
26
+ code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
+ h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
+ option output p pre q rb rt ruby s samp section small span strong sub
29
+ summary sup td textarea th time u ul var wbr
30
+ ])
31
+
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
29
34
 
30
35
  class << self
31
- # Class level instance reader method for @text_elements.
36
+ # Set of HTML elements that make up the visible text on a page. These
37
+ # elements are used to initialize the Wgit::Document#text. See the
38
+ # README.md for how to add to this Set dynamically.
32
39
  attr_reader :text_elements
40
+
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
33
44
  end
34
45
 
35
46
  # The URL of the webpage, an instance of Wgit::Url.
36
47
  attr_reader :url
37
48
 
38
- # The HTML of the webpage, an instance of String.
49
+ # The content/HTML of the document, an instance of String.
39
50
  attr_reader :html
40
51
 
41
52
  # The Nokogiri::HTML document object initialized from @html.
42
- attr_reader :doc
53
+ attr_reader :parser
43
54
 
44
- # The score is only used following a Database#search and records matches.
55
+ # The score is only used following a `Database#search` and records matches.
45
56
  attr_reader :score
46
57
 
47
58
  # Initialize takes either two strings (representing the URL and HTML) or an
@@ -50,29 +61,31 @@ module Wgit
50
61
  # pages retrieved from the database.
51
62
  #
52
63
  # During initialisation, the Document will call any private
53
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
54
- # README.md and Wgit::Document.define_extension method for more details.
64
+ # `init_*_from_html` and `init_*_from_object` methods it can find. See the
65
+ # Wgit::Document.define_extractor method for more details.
55
66
  #
56
- # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
67
+ # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
57
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
58
69
  # MongoDB collection object. The Object's :fetch method should support
59
70
  # Strings as keys.
60
- # @param html [String, NilClass] The crawled web page's HTML. This param is
61
- # only used if url_or_obj is a String representing the web page's URL.
62
- # Otherwise, the HTML comes from the database object. A html of nil will
63
- # be defaulted to an empty String.
64
- def initialize(url_or_obj, html = '', encode_html: true)
71
+ # @param html [String, NilClass] The crawled web page's content/HTML. This
72
+ # param is only used if url_or_obj is a String representing the web
73
+ # page's URL. Otherwise, the HTML comes from the database object. A html
74
+ # of nil will be defaulted to an empty String.
75
+ # @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
76
+ # false if the Document content is an image etc.
77
+ def initialize(url_or_obj, html = '', encode: true)
65
78
  if url_or_obj.is_a?(String)
66
- init_from_strings(url_or_obj, html, encode_html: encode_html)
79
+ init_from_strings(url_or_obj, html, encode: encode)
67
80
  else
68
- init_from_object(url_or_obj, encode_html: encode_html)
81
+ init_from_object(url_or_obj, encode: encode)
69
82
  end
70
83
  end
71
84
 
72
85
  ### Document Class Methods ###
73
86
 
74
87
  # Uses Document.text_elements to build an xpath String, used to obtain
75
- # all of the combined text on a webpage.
88
+ # all of the combined visual text on a webpage.
76
89
  #
77
90
  # @return [String] An xpath String to obtain a webpage's text elements.
78
91
  def self.text_elements_xpath
@@ -88,86 +101,101 @@ module Wgit
88
101
  xpath
89
102
  end
90
103
 
91
- # Defines an extension, which is a way to serialise HTML elements into
92
- # instance variables upon Document initialization. See the default
93
- # extensions defined in 'document_extensions.rb' as examples.
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
94
110
  #
95
- # Note that defined extensions work for both Documents initialized from
111
+ # Note that defined extractors work for both Documents initialized from
96
112
  # HTML (via Wgit::Crawler methods) and from database objects.
97
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
98
114
  # the xpath or database object result(s).
99
115
  #
100
116
  # When initialising from HTML, a singleton value of true will only
101
- # ever return one result; otherwise all xpath results are returned in an
102
- # Array. When initialising from a database object, the value is taken as
103
- # is and singleton is only used to define the default empty value.
104
- # If a value cannot be found (in either the HTML or database object), then
105
- # a default will be used. The default value is: `singleton ? nil : []`.
106
- #
107
- # @param var [Symbol] The name of the variable to be initialised.
108
- # @param xpath [String, Object#call] The xpath used to find the element(s)
117
+ # ever return the first result found; otherwise all the results are
118
+ # returned in an Array. When initialising from a database object, the value
119
+ # is taken as is and singleton is only used to define the default empty
120
+ # value. If a value cannot be found (in either the HTML or database
121
+ # object), then a default will be used. The default value is:
122
+ # `singleton ? nil : []`.
123
+ #
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and setter method is defined
126
+ # for the initialised variable.
127
+ # @param xpath [String, #call] The xpath used to find the element(s)
109
128
  # of the webpage. Only used when initializing from HTML.
110
129
  #
111
130
  # Pass a callable object (proc etc.) if you want the
112
131
  # xpath value to be derived on Document initialisation (instead of when
113
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
114
133
  # String.
115
- # @param options [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
116
135
  # options are only used when initializing from HTML, not the database.
117
- # @option options [Boolean] :singleton The singleton option determines
136
+ # @option opts [Boolean] :singleton The singleton option determines
118
137
  # whether or not the result(s) should be in an Array. If multiple
119
138
  # results are found and singleton is true then the first result will be
120
139
  # used. Defaults to true.
121
- # @option options [Boolean] :text_content_only The text_content_only option
140
+ # @option opts [Boolean] :text_content_only The text_content_only option
122
141
  # if true will use the text content of the Nokogiri result object,
123
142
  # otherwise the Nokogiri object itself is returned. Defaults to true.
124
- # @yield [value, source, type] Yields the value (Object) about to be
125
- # assigned to the new var, the source of the value (Wgit::Document or DB
126
- # Object) and the source type (Symbol of either :document or :object).
127
- #
128
- # The return value of the block becomes the new var value, unless nil.
129
- # Return nil if you want to inspect but not change the var value. The
130
- # block is executed when a Wgit::Document is initialized.
143
+ # @yield The block is executed when a Wgit::Document is initialized,
144
+ # regardless of the source. Use it (optionally) to process the result
145
+ # value.
146
+ # @yieldparam value [Object] The result value to be assigned to the new
147
+ # `var`.
148
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
149
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
150
+ # `:object`.
151
+ # @yieldreturn [Object] The return value of the block becomes the new var's
152
+ # value. Return the block's value param unchanged if you want to inspect.
131
153
  # @raise [StandardError] If the var param isn't valid.
132
- # @return [Symbol] The given var Symbol.
133
- def self.define_extension(var, xpath, options = {}, &block)
154
+ # @return [Symbol] The given var Symbol if successful.
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
134
156
  var = var.to_sym
135
- default_options = { singleton: true, text_content_only: true }
136
- options = default_options.merge(options)
157
+ defaults = { singleton: true, text_content_only: true }
158
+ opts = defaults.merge(opts)
137
159
 
138
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
139
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
140
162
 
141
163
  # Define the private init_*_from_html method for HTML.
142
164
  # Gets the HTML's xpath value and creates a var for it.
143
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
144
- result = find_in_html(xpath, options, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
145
167
  init_var(var, result)
146
168
  end
147
- Document.send :private, func_name
169
+ Document.send(:private, func_name)
148
170
 
149
171
  # Define the private init_*_from_object method for a Database object.
150
172
  # Gets the Object's 'key' value and creates a var for it.
151
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
152
- result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
153
179
  init_var(var, result)
154
180
  end
155
- Document.send :private, func_name
181
+ Document.send(:private, func_name)
156
182
 
183
+ @extractors << var
157
184
  var
158
185
  end
159
186
 
160
- # Removes the init_* methods created when an extension is defined.
161
- # Therefore, this is the opposing method to Document.define_extension.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
162
189
  # Returns true if successful or false if the method(s) cannot be found.
163
190
  #
164
- # @param var [Symbol] The extension variable already defined.
165
- # @return [Boolean] True if the extension var was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
166
193
  # otherwise false.
167
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
168
195
  Document.send(:remove_method, "init_#{var}_from_html")
169
196
  Document.send(:remove_method, "init_#{var}_from_object")
170
197
 
198
+ @extractors.delete(var.to_sym)
171
199
  true
172
200
  rescue NameError
173
201
  false
@@ -186,7 +214,7 @@ module Wgit
186
214
  (@url == other.url) && (@html == other.html)
187
215
  end
188
216
 
189
- # Is a shortcut for calling Document#html[range].
217
+ # Shortcut for calling Document#html[range].
190
218
  #
191
219
  # @param range [Range] The range of @html to return.
192
220
  # @return [String] The given range of @html.
@@ -196,9 +224,9 @@ module Wgit
196
224
 
197
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
198
226
  # <base> element's href value or @url (if @base is nil). If @base is
199
- # present and relative, then @url.to_base + @base is returned. This method
200
- # should be used instead of `doc.url.to_base` etc. when manually building
201
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
202
230
  #
203
231
  # Provide the `link:` parameter to get the correct base URL for that type
204
232
  # of link. For example, a link of `#top` would always return @url because
@@ -217,12 +245,16 @@ module Wgit
217
245
  # @return [Wgit::Url] The base URL of this Document e.g.
218
246
  # 'http://example.com/public'.
219
247
  def base_url(link: nil)
220
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
221
248
  if @url.relative? && @base.nil?
222
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
223
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
224
256
 
225
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
226
258
 
227
259
  if link
228
260
  link = Wgit::Url.new(link)
@@ -234,7 +266,7 @@ module Wgit
234
266
  end
235
267
  end
236
268
 
237
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
238
270
  base_url.omit_fragment.omit_query
239
271
  end
240
272
 
@@ -248,7 +280,7 @@ module Wgit
248
280
  def to_h(include_html: false, include_score: true)
249
281
  ignore = include_html ? [] : ['@html']
250
282
  ignore << '@score' unless include_score
251
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
252
284
 
253
285
  Wgit::Utils.to_h(self, ignore: ignore)
254
286
  end
@@ -265,7 +297,7 @@ module Wgit
265
297
 
266
298
  # Returns a Hash containing this Document's instance variables and
267
299
  # their #length (if they respond to it). Works dynamically so that any
268
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
269
301
  # the returned Hash as well. The number of text snippets as well as total
270
302
  # number of textual bytes are always included in the returned Hash.
271
303
  #
@@ -275,8 +307,8 @@ module Wgit
275
307
  instance_variables.each do |var|
276
308
  # Add up the total bytes of text as well as the length.
277
309
  if var == :@text
278
- hash[:text_snippets] = @text.length
279
- hash[:text_bytes] = @text.sum(&:length)
310
+ hash[:text] = @text.length
311
+ hash[:text_bytes] = @text.sum(&:length)
280
312
  # Else take the var's #length method return value.
281
313
  else
282
314
  next unless instance_variable_get(var).respond_to?(:length)
@@ -305,25 +337,43 @@ module Wgit
305
337
  end
306
338
 
307
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
308
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
309
341
  #
310
342
  # @param xpath [String] The xpath to search the @html with.
311
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
312
344
  def xpath(xpath)
313
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
346
+ end
347
+
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
314
355
  end
315
356
 
316
- # Uses Nokogiri's css method to search the doc's html and return the
317
- # results.
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
318
359
  #
319
360
  # @param selector [String] The CSS selector to search the @html with.
320
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
321
362
  def css(selector)
322
- @doc.css(selector)
363
+ @parser.css(selector)
323
364
  end
324
365
 
325
- # Returns all internal links from this Document in relative form. Internal
326
- # meaning a link to another document on the same host.
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
373
+ end
374
+
375
+ # Returns all unique internal links from this Document in relative form.
376
+ # Internal meaning a link to another document on the same host.
327
377
  #
328
378
  # This Document's host is used to determine if an absolute URL is actually
329
379
  # a relative link e.g. For a Document representing
@@ -332,41 +382,48 @@ module Wgit
332
382
  # as an internal link because both Documents live on the same host. Also
333
383
  # see Wgit::Document#internal_absolute_links.
334
384
  #
335
- # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
385
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
336
386
  def internal_links
337
387
  return [] if @links.empty?
338
388
 
339
389
  links = @links
340
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
341
391
  .map(&:omit_base)
342
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
343
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
344
394
  end
345
395
 
346
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
347
397
  end
348
398
 
349
- # Returns all internal links from this Document in absolute form by
399
+ # Returns all unique internal links from this Document in absolute form by
350
400
  # appending them to self's #base_url. Also see
351
401
  # Wgit::Document#internal_links.
352
402
  #
353
- # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
403
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
354
404
  def internal_absolute_links
355
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
356
406
  end
357
407
 
358
- # Returns all external links from this Document in absolute form. External
359
- # meaning a link to a different host.
408
+ # Returns all unique external links from this Document in absolute form.
409
+ # External meaning a link to a different host.
360
410
  #
361
- # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
411
+ # @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
362
412
  def external_links
363
413
  return [] if @links.empty?
364
414
 
365
415
  links = @links
366
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .map do |link|
417
+ if link.scheme_relative?
418
+ link.prefix_scheme(@url.to_scheme.to_sym)
419
+ else
420
+ link
421
+ end
422
+ end
423
+ .reject { |link| link.relative?(host: @url.to_origin) }
367
424
  .map(&:omit_trailing_slash)
368
425
 
369
- Wgit::Utils.process_arr(links)
426
+ Wgit::Utils.sanitize(links)
370
427
  end
371
428
 
372
429
  # Searches the @text for the given query and returns the results.
@@ -381,8 +438,8 @@ module Wgit
381
438
  # original sentence, which ever is less. The algorithm obviously ensures
382
439
  # that the search query is visible somewhere in the sentence.
383
440
  #
384
- # @param query [String, Object#to_s] The value to search the document's
385
- # @text for.
441
+ # @param query [Regexp, #to_s] The regex or text value to search the
442
+ # document's @text for.
386
443
  # @param case_sensitive [Boolean] Whether character case must match.
387
444
  # @param whole_sentence [Boolean] Whether the query is matched as one whole
388
445
  # phrase; if false, each space separated word is searched for separately.
@@ -390,21 +447,27 @@ module Wgit
390
447
  # sentence.
391
448
  # @return [Array<String>] A subset of @text, matching the query.
392
449
  def search(
393
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
450
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
394
451
  )
395
- query = query.to_s
396
- raise 'A search query must be provided' if query.empty?
397
452
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
398
453
 
399
- query = query.gsub(' ', '|') unless whole_sentence
400
- regex = Regexp.new(query, !case_sensitive)
454
+ if query.is_a?(Regexp)
455
+ regex = query
456
+ else # respond_to? #to_s == true
457
+ query = query.to_s
458
+ query = query.gsub(' ', '|') unless whole_sentence
459
+ regex = Regexp.new(query, !case_sensitive)
460
+ end
461
+
401
462
  results = {}
402
463
 
403
464
  @text.each do |sentence|
465
+ sentence = sentence.strip
466
+ next if results[sentence]
467
+
404
468
  hits = sentence.scan(regex).count
405
469
  next unless hits.positive?
406
470
 
407
- sentence.strip!
408
471
  index = sentence.index(regex) # Index of first match.
409
472
  Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
410
473
 
@@ -422,8 +485,8 @@ module Wgit
422
485
  # functionality. The original text is returned; no other reference to it
423
486
  # is kept thereafter.
424
487
  #
425
- # @param query [String, Object#to_s] The value to search the document's
426
- # @text for.
488
+ # @param query [Regexp, #to_s] The regex or text value to search the
489
+ # document's @text for.
427
490
  # @param case_sensitive [Boolean] Whether character case must match.
428
491
  # @param whole_sentence [Boolean] Whether the query is matched as one whole
428
492
  # phrase; if false, each space separated word is searched for separately.
@@ -431,7 +494,7 @@ module Wgit
431
494
  # sentence.
432
495
  # @return [String] This Document's original @text value.
433
496
  def search!(
434
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
497
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
435
498
  )
436
499
  orig_text = @text
437
500
  @text = search(
@@ -442,104 +505,114 @@ module Wgit
442
505
  orig_text
443
506
  end
444
507
 
508
+ # Extracts a value/object from this Document's @html using the given xpath
509
+ # parameter.
510
+ #
511
+ # @param xpath [String, #call] Used to find the value/object in @html.
512
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
+ # Object) : results (Array).
514
+ # @param text_content_only [Boolean] text_content_only ? result.content
515
+ # (String) : result (Nokogiri Object).
516
+ # @return [String, Object] The value found in the html or the default value
517
+ # (singleton ? nil : []).
518
+ def extract(xpath, singleton: true, text_content_only: true)
519
+ send(
520
+ :extract_from_html, xpath,
521
+ singleton: singleton, text_content_only: text_content_only
522
+ )
523
+ end
524
+
445
525
  protected
446
526
 
447
527
  # Initializes the nokogiri object using @html, which cannot be nil.
448
528
  # Override this method to custom configure the Nokogiri object returned.
449
529
  # Gets called from Wgit::Document.new upon initialization.
450
530
  #
531
+ # @yield [config] The given block is passed to Nokogiri::HTML for
532
+ # initialisation.
451
533
  # @raise [StandardError] If @html isn't set.
452
534
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
453
- def init_nokogiri
535
+ def init_nokogiri(&block)
454
536
  raise '@html must be set' unless @html
455
537
 
456
- Nokogiri::HTML(@html) do |config|
457
- # TODO: Remove #'s below when crawling in production.
458
- # config.options = Nokogiri::XML::ParseOptions::STRICT |
459
- # Nokogiri::XML::ParseOptions::NONET
460
- end
538
+ Nokogiri::HTML(@html, &block)
461
539
  end
462
540
 
463
- # Returns a value/object from this Document's @html using the given xpath
541
+ # Extracts a value/object from this Document's @html using the given xpath
464
542
  # parameter.
465
543
  #
466
- # @param xpath [String] Used to find the value/object in @html.
544
+ # @param xpath [String, #call] Used to find the value/object in @html.
467
545
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
468
546
  # Object) : results (Array).
469
547
  # @param text_content_only [Boolean] text_content_only ? result.content
470
548
  # (String) : result (Nokogiri Object).
471
- # @yield [value, source] Given the value (String/Object) before it's set as
472
- # an instance variable so that you can inspect/alter the value if
473
- # desired. Return nil from the block if you don't want to override the
474
- # value. Also given the source (Symbol) which is always :document.
549
+ # @yield The block is executed when a Wgit::Document is initialized,
550
+ # regardless of the source. Use it (optionally) to process the result
551
+ # value.
552
+ # @yieldparam value [Object] The result value to be returned.
553
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
+ # `:object`.
556
+ # @yieldreturn [Object] The return value of the block gets returned. Return
557
+ # the block's `value` param unchanged if you simply want to inspect it.
475
558
  # @return [String, Object] The value found in the html or the default value
476
559
  # (singleton ? nil : []).
477
- def find_in_html(xpath, singleton: true, text_content_only: true)
478
- default = singleton ? nil : []
479
- xpath = xpath.call if xpath.respond_to?(:call)
480
- results = @doc.xpath(xpath)
481
-
482
- return default if results.nil? || results.empty?
483
-
484
- result = if singleton
485
- text_content_only ? results.first.content : results.first
486
- else
487
- text_content_only ? results.map(&:content) : results
488
- end
489
-
490
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
560
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
561
+ xpath = xpath.call if xpath.respond_to?(:call)
562
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
491
563
 
492
- if block_given?
493
- new_result = yield(result, self, :document)
494
- result = new_result unless new_result.nil?
564
+ if text_content_only
565
+ result = singleton ? result&.content : result.map(&:content)
495
566
  end
496
567
 
568
+ Wgit::Utils.sanitize(result)
569
+ result = yield(result, self, :document) if block_given?
497
570
  result
498
571
  end
499
572
 
500
- # Returns a value from the obj using the given key via obj#fetch.
573
+ # Returns a value from the obj using the given key via `obj#fetch`.
501
574
  #
502
- # @param obj [Object#fetch] The object containing the key/value.
575
+ # @param obj [#fetch] The object containing the key/value.
503
576
  # @param key [String] Used to find the value in the obj.
504
577
  # @param singleton [Boolean] True if a single value, false otherwise.
505
- # @yield [value, source] Given the value (String/Object) before it's set as
506
- # an instance variable so that you can inspect/alter the value if
507
- # desired. Return nil from the block if you don't want to override the
508
- # value. Also given the source (Symbol) which is always :object.
578
+ # @yield The block is executed when a Wgit::Document is initialized,
579
+ # regardless of the source. Use it (optionally) to process the result
580
+ # value.
581
+ # @yieldparam value [Object] The result value to be returned.
582
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
583
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
584
+ # `:object`.
585
+ # @yieldreturn [Object] The return value of the block gets returned. Return
586
+ # the block's `value` param unchanged if you simply want to inspect it.
509
587
  # @return [String, Object] The value found in the obj or the default value
510
588
  # (singleton ? nil : []).
511
- def find_in_object(obj, key, singleton: true)
589
+ def extract_from_object(obj, key, singleton: true)
512
590
  assert_respond_to(obj, :fetch)
513
591
 
514
592
  default = singleton ? nil : []
515
593
  result = obj.fetch(key.to_s, default)
516
594
 
517
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
518
-
519
- if block_given?
520
- new_result = yield(result, obj, :object)
521
- result = new_result unless new_result.nil?
522
- end
523
-
595
+ Wgit::Utils.sanitize(result)
596
+ result = yield(result, obj, :object) if block_given?
524
597
  result
525
598
  end
526
599
 
527
600
  private
528
601
 
529
602
  # Initialise the Document from URL and HTML Strings.
530
- def init_from_strings(url, html, encode_html: true)
603
+ def init_from_strings(url, html, encode: true)
531
604
  assert_types(html, [String, NilClass])
532
605
 
533
606
  # We already know url.is_a?(String) so parse into Url unless already so.
534
607
  url = Wgit::Url.parse(url)
535
608
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
536
609
 
537
- @url = url
538
- @html = html || ''
539
- @doc = init_nokogiri
540
- @score = 0.0
610
+ @url = url
611
+ @html = html || ''
612
+ @parser = init_nokogiri
613
+ @score = 0.0
541
614
 
542
- Wgit::Utils.process_str(@html, encode: encode_html)
615
+ Wgit::Utils.sanitize(@html, encode: encode)
543
616
 
544
617
  # Dynamically run the init_*_from_html methods.
545
618
  Document.private_instance_methods(false).each do |method|
@@ -552,15 +625,15 @@ module Wgit
552
625
 
553
626
  # Initialise the Document from a Hash like Object containing Strings as
554
627
  # keys e.g. database collection object or Hash.
555
- def init_from_object(obj, encode_html: true)
628
+ def init_from_object(obj, encode: true)
556
629
  assert_respond_to(obj, :fetch)
557
630
 
558
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
559
- @html = obj.fetch('html', '')
560
- @doc = init_nokogiri
561
- @score = obj.fetch('score', 0.0)
631
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
632
+ @html = obj.fetch('html', '')
633
+ @parser = init_nokogiri
634
+ @score = obj.fetch('score', 0.0)
562
635
 
563
- Wgit::Utils.process_str(@html, encode: encode_html)
636
+ Wgit::Utils.sanitize(@html, encode: encode)
564
637
 
565
638
  # Dynamically run the init_*_from_object methods.
566
639
  Document.private_instance_methods(false).each do |method|
@@ -571,11 +644,11 @@ module Wgit
571
644
  end
572
645
  end
573
646
 
574
- # Initialises an instance variable and defines a getter method for it.
647
+ # Initialises an instance variable and defines an accessor method for it.
575
648
  #
576
649
  # @param var [Symbol] The name of the variable to be initialized.
577
650
  # @param value [Object] The newly initialized variable's value.
578
- # @return [Symbol] The name of the newly created getter method.
651
+ # @return [Symbol] The name of the defined getter method.
579
652
  def init_var(var, value)
580
653
  # instance_var_name starts with @, var_name doesn't.
581
654
  var = var.to_s
@@ -583,10 +656,9 @@ module Wgit
583
656
  instance_var_name = "@#{var_name}".to_sym
584
657
 
585
658
  instance_variable_set(instance_var_name, value)
659
+ Wgit::Document.attr_accessor(var_name)
586
660
 
587
- Document.send(:define_method, var_name) do
588
- instance_variable_get(instance_var_name)
589
- end
661
+ var_name
590
662
  end
591
663
 
592
664
  alias content html