wgit 0.5.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,8 +14,7 @@ module Wgit
14
14
  raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
15
15
 
16
16
  model = url.to_h
17
-
18
- Wgit::Utils.remove_non_bson_types(model)
17
+ select_bson_types(model)
19
18
  end
20
19
 
21
20
  # The data model for a Wgit::Document collection object.
@@ -28,7 +27,7 @@ module Wgit
28
27
  model = doc.to_h(include_html: false, include_score: false)
29
28
  model['url'] = url(doc.url) # Expand Url String into full object.
30
29
 
31
- Wgit::Utils.remove_non_bson_types(model)
30
+ select_bson_types(model)
32
31
  end
33
32
 
34
33
  # Common fields when inserting a record into the DB.
@@ -49,5 +48,13 @@ module Wgit
49
48
  date_modified: Wgit::Utils.time_stamp
50
49
  }
51
50
  end
51
+
52
+ # Returns the model having removed non bson types (for use with MongoDB).
53
+ #
54
+ # @param model_hash [Hash] The model Hash to sanitize.
55
+ # @return [Hash] The model Hash with non bson types removed.
56
+ def self.select_bson_types(model_hash)
57
+ model_hash.select { |_k, v| v.respond_to?(:bson_type) }
58
+ end
52
59
  end
53
60
  end
data/lib/wgit/document.rb CHANGED
@@ -3,45 +3,56 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
+ require 'set'
6
7
 
7
8
  module Wgit
8
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
9
10
  # will work e.g. images etc. Also doubles as a search result when
10
- # loading Documents from the database via Wgit::Database#search.
11
+ # loading Documents from the database via `Wgit::Database#search`.
11
12
  #
12
13
  # The initialize method dynamically initializes instance variables from the
13
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
- # Document class can be easily extended allowing you to pull out the bits of
15
- # a webpage that are important to you. See Wgit::Document.define_extension.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
16
17
  class Document
17
18
  include Assertable
18
19
 
19
- # Regex for the allowed var names when defining an extension.
20
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
21
22
 
22
- # The HTML elements that make up the visible text on a page.
23
- # These elements are used to initialize the @text of the Document.
24
- # See the README.md for how to add to this Array dynamically.
25
- @text_elements = %i[
26
- dd div dl dt figcaption figure hr li
27
- main ol p pre span ul h1 h2 h3 h4 h5
28
- ]
23
+ # Set of text elements used to build Document#text.
24
+ @text_elements = Set.new(%i[
25
+ a abbr address article aside b bdi bdo blockquote button caption cite
26
+ code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
+ h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
+ option output p pre q rb rt ruby s samp section small span strong sub
29
+ summary sup td textarea th time u ul var wbr
30
+ ])
31
+
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
29
34
 
30
35
  class << self
31
- # Class level instance reader method for @text_elements.
36
+ # Set of HTML elements that make up the visible text on a page. These
37
+ # elements are used to initialize the Wgit::Document#text. See the
38
+ # README.md for how to add to this Set dynamically.
32
39
  attr_reader :text_elements
40
+
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
33
44
  end
34
45
 
35
46
  # The URL of the webpage, an instance of Wgit::Url.
36
47
  attr_reader :url
37
48
 
38
- # The HTML of the webpage, an instance of String.
49
+ # The content/HTML of the document, an instance of String.
39
50
  attr_reader :html
40
51
 
41
52
  # The Nokogiri::HTML document object initialized from @html.
42
- attr_reader :doc
53
+ attr_reader :parser
43
54
 
44
- # The score is only used following a Database#search and records matches.
55
+ # The score is only used following a `Database#search` and records matches.
45
56
  attr_reader :score
46
57
 
47
58
  # Initialize takes either two strings (representing the URL and HTML) or an
@@ -50,29 +61,31 @@ module Wgit
50
61
  # pages retrieved from the database.
51
62
  #
52
63
  # During initialisation, the Document will call any private
53
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
54
- # README.md and Wgit::Document.define_extension method for more details.
64
+ # `init_*_from_html` and `init_*_from_object` methods it can find. See the
65
+ # Wgit::Document.define_extractor method for more details.
55
66
  #
56
- # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
67
+ # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
57
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
58
69
  # MongoDB collection object. The Object's :fetch method should support
59
70
  # Strings as keys.
60
- # @param html [String, NilClass] The crawled web page's HTML. This param is
61
- # only used if url_or_obj is a String representing the web page's URL.
62
- # Otherwise, the HTML comes from the database object. A html of nil will
63
- # be defaulted to an empty String.
64
- def initialize(url_or_obj, html = '', encode_html: true)
71
+ # @param html [String, NilClass] The crawled web page's content/HTML. This
72
+ # param is only used if url_or_obj is a String representing the web
73
+ # page's URL. Otherwise, the HTML comes from the database object. A html
74
+ # of nil will be defaulted to an empty String.
75
+ # @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
76
+ # false if the Document content is an image etc.
77
+ def initialize(url_or_obj, html = '', encode: true)
65
78
  if url_or_obj.is_a?(String)
66
- init_from_strings(url_or_obj, html, encode_html: encode_html)
79
+ init_from_strings(url_or_obj, html, encode: encode)
67
80
  else
68
- init_from_object(url_or_obj, encode_html: encode_html)
81
+ init_from_object(url_or_obj, encode: encode)
69
82
  end
70
83
  end
71
84
 
72
85
  ### Document Class Methods ###
73
86
 
74
87
  # Uses Document.text_elements to build an xpath String, used to obtain
75
- # all of the combined text on a webpage.
88
+ # all of the combined visual text on a webpage.
76
89
  #
77
90
  # @return [String] An xpath String to obtain a webpage's text elements.
78
91
  def self.text_elements_xpath
@@ -88,86 +101,101 @@ module Wgit
88
101
  xpath
89
102
  end
90
103
 
91
- # Defines an extension, which is a way to serialise HTML elements into
92
- # instance variables upon Document initialization. See the default
93
- # extensions defined in 'document_extensions.rb' as examples.
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
94
110
  #
95
- # Note that defined extensions work for both Documents initialized from
111
+ # Note that defined extractors work for both Documents initialized from
96
112
  # HTML (via Wgit::Crawler methods) and from database objects.
97
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
98
114
  # the xpath or database object result(s).
99
115
  #
100
116
  # When initialising from HTML, a singleton value of true will only
101
- # ever return one result; otherwise all xpath results are returned in an
102
- # Array. When initialising from a database object, the value is taken as
103
- # is and singleton is only used to define the default empty value.
104
- # If a value cannot be found (in either the HTML or database object), then
105
- # a default will be used. The default value is: `singleton ? nil : []`.
106
- #
107
- # @param var [Symbol] The name of the variable to be initialised.
108
- # @param xpath [String, Object#call] The xpath used to find the element(s)
117
+ # ever return the first result found; otherwise all the results are
118
+ # returned in an Array. When initialising from a database object, the value
119
+ # is taken as is and singleton is only used to define the default empty
120
+ # value. If a value cannot be found (in either the HTML or database
121
+ # object), then a default will be used. The default value is:
122
+ # `singleton ? nil : []`.
123
+ #
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and setter method is defined
126
+ # for the initialised variable.
127
+ # @param xpath [String, #call] The xpath used to find the element(s)
109
128
  # of the webpage. Only used when initializing from HTML.
110
129
  #
111
130
  # Pass a callable object (proc etc.) if you want the
112
131
  # xpath value to be derived on Document initialisation (instead of when
113
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
114
133
  # String.
115
- # @param options [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
116
135
  # options are only used when intializing from HTML, not the database.
117
- # @option options [Boolean] :singleton The singleton option determines
136
+ # @option opts [Boolean] :singleton The singleton option determines
118
137
  # whether or not the result(s) should be in an Array. If multiple
119
138
  # results are found and singleton is true then the first result will be
120
139
  # used. Defaults to true.
121
- # @option options [Boolean] :text_content_only The text_content_only option
140
+ # @option opts [Boolean] :text_content_only The text_content_only option
122
141
  # if true will use the text content of the Nokogiri result object,
123
142
  # otherwise the Nokogiri object itself is returned. Defaults to true.
124
- # @yield [value, source, type] Yields the value (Object) about to be
125
- # assigned to the new var, the source of the value (Wgit::Document or DB
126
- # Object) and the source type (Symbol of either :document or :object).
127
- #
128
- # The return value of the block becomes the new var value, unless nil.
129
- # Return nil if you want to inspect but not change the var value. The
130
- # block is executed when a Wgit::Document is initialized.
143
+ # @yield The block is executed when a Wgit::Document is initialized,
144
+ # regardless of the source. Use it (optionally) to process the result
145
+ # value.
146
+ # @yieldparam value [Object] The result value to be assigned to the new
147
+ # `var`.
148
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
149
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
150
+ # `:object`.
151
+ # @yieldreturn [Object] The return value of the block becomes the new var's
152
+ # value. Return the block's value param unchanged if you want to inspect.
131
153
  # @raise [StandardError] If the var param isn't valid.
132
- # @return [Symbol] The given var Symbol.
133
- def self.define_extension(var, xpath, options = {}, &block)
154
+ # @return [Symbol] The given var Symbol if successful.
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
134
156
  var = var.to_sym
135
- default_options = { singleton: true, text_content_only: true }
136
- options = default_options.merge(options)
157
+ defaults = { singleton: true, text_content_only: true }
158
+ opts = defaults.merge(opts)
137
159
 
138
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
139
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
140
162
 
141
163
  # Define the private init_*_from_html method for HTML.
142
164
  # Gets the HTML's xpath value and creates a var for it.
143
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
144
- result = find_in_html(xpath, options, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
145
167
  init_var(var, result)
146
168
  end
147
- Document.send :private, func_name
169
+ Document.send(:private, func_name)
148
170
 
149
171
  # Define the private init_*_from_object method for a Database object.
150
172
  # Gets the Object's 'key' value and creates a var for it.
151
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
152
- result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
153
179
  init_var(var, result)
154
180
  end
155
- Document.send :private, func_name
181
+ Document.send(:private, func_name)
156
182
 
183
+ @extractors << var
157
184
  var
158
185
  end
159
186
 
160
- # Removes the init_* methods created when an extension is defined.
161
- # Therefore, this is the opposing method to Document.define_extension.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
162
189
  # Returns true if successful or false if the method(s) cannot be found.
163
190
  #
164
- # @param var [Symbol] The extension variable already defined.
165
- # @return [Boolean] True if the extension var was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
166
193
  # otherwise false.
167
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
168
195
  Document.send(:remove_method, "init_#{var}_from_html")
169
196
  Document.send(:remove_method, "init_#{var}_from_object")
170
197
 
198
+ @extractors.delete(var.to_sym)
171
199
  true
172
200
  rescue NameError
173
201
  false
@@ -186,7 +214,7 @@ module Wgit
186
214
  (@url == other.url) && (@html == other.html)
187
215
  end
188
216
 
189
- # Is a shortcut for calling Document#html[range].
217
+ # Shortcut for calling Document#html[range].
190
218
  #
191
219
  # @param range [Range] The range of @html to return.
192
220
  # @return [String] The given range of @html.
@@ -196,9 +224,9 @@ module Wgit
196
224
 
197
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
198
226
  # <base> element's href value or @url (if @base is nil). If @base is
199
- # present and relative, then @url.to_base + @base is returned. This method
200
- # should be used instead of `doc.url.to_base` etc. when manually building
201
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
202
230
  #
203
231
  # Provide the `link:` parameter to get the correct base URL for that type
204
232
  # of link. For example, a link of `#top` would always return @url because
@@ -217,12 +245,16 @@ module Wgit
217
245
  # @return [Wgit::Url] The base URL of this Document e.g.
218
246
  # 'http://example.com/public'.
219
247
  def base_url(link: nil)
220
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
221
248
  if @url.relative? && @base.nil?
222
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
223
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
224
256
 
225
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
226
258
 
227
259
  if link
228
260
  link = Wgit::Url.new(link)
@@ -234,7 +266,7 @@ module Wgit
234
266
  end
235
267
  end
236
268
 
237
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
238
270
  base_url.omit_fragment.omit_query
239
271
  end
240
272
 
@@ -248,7 +280,7 @@ module Wgit
248
280
  def to_h(include_html: false, include_score: true)
249
281
  ignore = include_html ? [] : ['@html']
250
282
  ignore << '@score' unless include_score
251
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
252
284
 
253
285
  Wgit::Utils.to_h(self, ignore: ignore)
254
286
  end
@@ -265,7 +297,7 @@ module Wgit
265
297
 
266
298
  # Returns a Hash containing this Document's instance variables and
267
299
  # their #length (if they respond to it). Works dynamically so that any
268
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
269
301
  # the returned Hash as well. The number of text snippets as well as total
270
302
  # number of textual bytes are always included in the returned Hash.
271
303
  #
@@ -275,8 +307,8 @@ module Wgit
275
307
  instance_variables.each do |var|
276
308
  # Add up the total bytes of text as well as the length.
277
309
  if var == :@text
278
- hash[:text_snippets] = @text.length
279
- hash[:text_bytes] = @text.sum(&:length)
310
+ hash[:text] = @text.length
311
+ hash[:text_bytes] = @text.sum(&:length)
280
312
  # Else take the var's #length method return value.
281
313
  else
282
314
  next unless instance_variable_get(var).respond_to?(:length)
@@ -305,25 +337,43 @@ module Wgit
305
337
  end
306
338
 
307
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
308
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
309
341
  #
310
342
  # @param xpath [String] The xpath to search the @html with.
311
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
312
344
  def xpath(xpath)
313
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
346
+ end
347
+
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
314
355
  end
315
356
 
316
- # Uses Nokogiri's css method to search the doc's html and return the
317
- # results.
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
318
359
  #
319
360
  # @param selector [String] The CSS selector to search the @html with.
320
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
321
362
  def css(selector)
322
- @doc.css(selector)
363
+ @parser.css(selector)
323
364
  end
324
365
 
325
- # Returns all internal links from this Document in relative form. Internal
326
- # meaning a link to another document on the same host.
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
373
+ end
374
+
375
+ # Returns all unique internal links from this Document in relative form.
376
+ # Internal meaning a link to another document on the same host.
327
377
  #
328
378
  # This Document's host is used to determine if an absolute URL is actually
329
379
  # a relative link e.g. For a Document representing
@@ -332,41 +382,48 @@ module Wgit
332
382
  # as an internal link because both Documents live on the same host. Also
333
383
  # see Wgit::Document#internal_absolute_links.
334
384
  #
335
- # @return [Array<Wgit::Url>] Self's internal Url's in relative form.
385
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
336
386
  def internal_links
337
387
  return [] if @links.empty?
338
388
 
339
389
  links = @links
340
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
341
391
  .map(&:omit_base)
342
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
343
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
344
394
  end
345
395
 
346
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
347
397
  end
348
398
 
349
- # Returns all internal links from this Document in absolute form by
399
+ # Returns all unique internal links from this Document in absolute form by
350
400
  # appending them to self's #base_url. Also see
351
401
  # Wgit::Document#internal_links.
352
402
  #
353
- # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
403
+ # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
354
404
  def internal_absolute_links
355
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
356
406
  end
357
407
 
358
- # Returns all external links from this Document in absolute form. External
359
- # meaning a link to a different host.
408
+ # Returns all unique external links from this Document in absolute form.
409
+ # External meaning a link to a different host.
360
410
  #
361
- # @return [Array<Wgit::Url>] Self's external Url's in absolute form.
411
+ # @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
362
412
  def external_links
363
413
  return [] if @links.empty?
364
414
 
365
415
  links = @links
366
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .map do |link|
417
+ if link.scheme_relative?
418
+ link.prefix_scheme(@url.to_scheme.to_sym)
419
+ else
420
+ link
421
+ end
422
+ end
423
+ .reject { |link| link.relative?(host: @url.to_origin) }
367
424
  .map(&:omit_trailing_slash)
368
425
 
369
- Wgit::Utils.process_arr(links)
426
+ Wgit::Utils.sanitize(links)
370
427
  end
371
428
 
372
429
  # Searches the @text for the given query and returns the results.
@@ -381,8 +438,8 @@ module Wgit
381
438
  # original sentence, whichever is less. The algorithm obviously ensures
382
439
  # that the search query is visible somewhere in the sentence.
383
440
  #
384
- # @param query [String, Object#to_s] The value to search the document's
385
- # @text for.
441
+ # @param query [Regexp, #to_s] The regex or text value to search the
442
+ # document's @text for.
386
443
  # @param case_sensitive [Boolean] Whether character case must match.
387
444
  # @param whole_sentence [Boolean] Whether multiple words should be searched
388
445
  # for separately.
@@ -390,21 +447,27 @@ module Wgit
390
447
  # sentence.
391
448
  # @return [Array<String>] A subset of @text, matching the query.
392
449
  def search(
393
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
450
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
394
451
  )
395
- query = query.to_s
396
- raise 'A search query must be provided' if query.empty?
397
452
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
398
453
 
399
- query = query.gsub(' ', '|') unless whole_sentence
400
- regex = Regexp.new(query, !case_sensitive)
454
+ if query.is_a?(Regexp)
455
+ regex = query
456
+ else # respond_to? #to_s == true
457
+ query = query.to_s
458
+ query = query.gsub(' ', '|') unless whole_sentence
459
+ regex = Regexp.new(query, !case_sensitive)
460
+ end
461
+
401
462
  results = {}
402
463
 
403
464
  @text.each do |sentence|
465
+ sentence = sentence.strip
466
+ next if results[sentence]
467
+
404
468
  hits = sentence.scan(regex).count
405
469
  next unless hits.positive?
406
470
 
407
- sentence.strip!
408
471
  index = sentence.index(regex) # Index of first match.
409
472
  Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
410
473
 
@@ -422,8 +485,8 @@ module Wgit
422
485
  # functionality. The original text is returned; no other reference to it
423
486
  # is kept thereafter.
424
487
  #
425
- # @param query [String, Object#to_s] The value to search the document's
426
- # @text for.
488
+ # @param query [Regexp, #to_s] The regex or text value to search the
489
+ # document's @text for.
427
490
  # @param case_sensitive [Boolean] Whether character case must match.
428
491
  # @param whole_sentence [Boolean] Whether multiple words should be searched
429
492
  # for separately.
@@ -431,7 +494,7 @@ module Wgit
431
494
  # sentence.
432
495
  # @return [String] This Document's original @text value.
433
496
  def search!(
434
- query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
497
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
435
498
  )
436
499
  orig_text = @text
437
500
  @text = search(
@@ -442,104 +505,114 @@ module Wgit
442
505
  orig_text
443
506
  end
444
507
 
508
+ # Extracts a value/object from this Document's @html using the given xpath
509
+ # parameter.
510
+ #
511
+ # @param xpath [String, #call] Used to find the value/object in @html.
512
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
+ # Object) : results (Array).
514
+ # @param text_content_only [Boolean] text_content_only ? result.content
515
+ # (String) : result (Nokogiri Object).
516
+ # @return [String, Object] The value found in the html or the default value
517
+ # (singleton ? nil : []).
518
+ def extract(xpath, singleton: true, text_content_only: true)
519
+ send(
520
+ :extract_from_html, xpath,
521
+ singleton: singleton, text_content_only: text_content_only
522
+ )
523
+ end
524
+
445
525
  protected
446
526
 
447
527
  # Initializes the nokogiri object using @html, which cannot be nil.
448
528
  # Override this method to custom configure the Nokogiri object returned.
449
529
  # Gets called from Wgit::Document.new upon initialization.
450
530
  #
531
+ # @yield [config] The given block is passed to Nokogiri::HTML for
532
+ # initialisation.
451
533
  # @raise [StandardError] If @html isn't set.
452
534
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
453
- def init_nokogiri
535
+ def init_nokogiri(&block)
454
536
  raise '@html must be set' unless @html
455
537
 
456
- Nokogiri::HTML(@html) do |config|
457
- # TODO: Remove #'s below when crawling in production.
458
- # config.options = Nokogiri::XML::ParseOptions::STRICT |
459
- # Nokogiri::XML::ParseOptions::NONET
460
- end
538
+ Nokogiri::HTML(@html, &block)
461
539
  end
462
540
 
463
- # Returns a value/object from this Document's @html using the given xpath
541
+ # Extracts a value/object from this Document's @html using the given xpath
464
542
  # parameter.
465
543
  #
466
- # @param xpath [String] Used to find the value/object in @html.
544
+ # @param xpath [String, #call] Used to find the value/object in @html.
467
545
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
468
546
  # Object) : results (Array).
469
547
  # @param text_content_only [Boolean] text_content_only ? result.content
470
548
  # (String) : result (Nokogiri Object).
471
- # @yield [value, source] Given the value (String/Object) before it's set as
472
- # an instance variable so that you can inspect/alter the value if
473
- # desired. Return nil from the block if you don't want to override the
474
- # value. Also given the source (Symbol) which is always :document.
549
+ # @yield The block is executed when a Wgit::Document is initialized,
550
+ # regardless of the source. Use it (optionally) to process the result
551
+ # value.
552
+ # @yieldparam value [Object] The result value to be returned.
553
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
+ # `:object`.
556
+ # @yieldreturn [Object] The return value of the block gets returned. Return
557
+ # the block's `value` param unchanged if you simply want to inspect it.
475
558
  # @return [String, Object] The value found in the html or the default value
476
559
  # (singleton ? nil : []).
477
- def find_in_html(xpath, singleton: true, text_content_only: true)
478
- default = singleton ? nil : []
479
- xpath = xpath.call if xpath.respond_to?(:call)
480
- results = @doc.xpath(xpath)
481
-
482
- return default if results.nil? || results.empty?
483
-
484
- result = if singleton
485
- text_content_only ? results.first.content : results.first
486
- else
487
- text_content_only ? results.map(&:content) : results
488
- end
489
-
490
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
560
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
561
+ xpath = xpath.call if xpath.respond_to?(:call)
562
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
491
563
 
492
- if block_given?
493
- new_result = yield(result, self, :document)
494
- result = new_result unless new_result.nil?
564
+ if text_content_only
565
+ result = singleton ? result&.content : result.map(&:content)
495
566
  end
496
567
 
568
+ Wgit::Utils.sanitize(result)
569
+ result = yield(result, self, :document) if block_given?
497
570
  result
498
571
  end
499
572
 
500
- # Returns a value from the obj using the given key via obj#fetch.
573
+ # Returns a value from the obj using the given key via `obj#fetch`.
501
574
  #
502
- # @param obj [Object#fetch] The object containing the key/value.
575
+ # @param obj [#fetch] The object containing the key/value.
503
576
  # @param key [String] Used to find the value in the obj.
504
577
  # @param singleton [Boolean] True if a single value, false otherwise.
505
- # @yield [value, source] Given the value (String/Object) before it's set as
506
- # an instance variable so that you can inspect/alter the value if
507
- # desired. Return nil from the block if you don't want to override the
508
- # value. Also given the source (Symbol) which is always :object.
578
+ # @yield The block is executed when a Wgit::Document is initialized,
579
+ # regardless of the source. Use it (optionally) to process the result
580
+ # value.
581
+ # @yieldparam value [Object] The result value to be returned.
582
+ # @yieldparam source [Wgit::Document, Object] The source of the `value`.
583
+ # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
584
+ # `:object`.
585
+ # @yieldreturn [Object] The return value of the block gets returned. Return
586
+ # the block's `value` param unchanged if you simply want to inspect it.
509
587
  # @return [String, Object] The value found in the obj or the default value
510
588
  # (singleton ? nil : []).
511
- def find_in_object(obj, key, singleton: true)
589
+ def extract_from_object(obj, key, singleton: true)
512
590
  assert_respond_to(obj, :fetch)
513
591
 
514
592
  default = singleton ? nil : []
515
593
  result = obj.fetch(key.to_s, default)
516
594
 
517
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
518
-
519
- if block_given?
520
- new_result = yield(result, obj, :object)
521
- result = new_result unless new_result.nil?
522
- end
523
-
595
+ Wgit::Utils.sanitize(result)
596
+ result = yield(result, obj, :object) if block_given?
524
597
  result
525
598
  end
526
599
 
527
600
  private
528
601
 
529
602
  # Initialise the Document from URL and HTML Strings.
530
- def init_from_strings(url, html, encode_html: true)
603
+ def init_from_strings(url, html, encode: true)
531
604
  assert_types(html, [String, NilClass])
532
605
 
533
606
  # We already know url.is_a?(String) so parse into Url unless already so.
534
607
  url = Wgit::Url.parse(url)
535
608
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
536
609
 
537
- @url = url
538
- @html = html || ''
539
- @doc = init_nokogiri
540
- @score = 0.0
610
+ @url = url
611
+ @html = html || ''
612
+ @parser = init_nokogiri
613
+ @score = 0.0
541
614
 
542
- Wgit::Utils.process_str(@html, encode: encode_html)
615
+ Wgit::Utils.sanitize(@html, encode: encode)
543
616
 
544
617
  # Dynamically run the init_*_from_html methods.
545
618
  Document.private_instance_methods(false).each do |method|
@@ -552,15 +625,15 @@ module Wgit
552
625
 
553
626
  # Initialise the Document from a Hash like Object containing Strings as
554
627
  # keys e.g. database collection object or Hash.
555
- def init_from_object(obj, encode_html: true)
628
+ def init_from_object(obj, encode: true)
556
629
  assert_respond_to(obj, :fetch)
557
630
 
558
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
559
- @html = obj.fetch('html', '')
560
- @doc = init_nokogiri
561
- @score = obj.fetch('score', 0.0)
631
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
632
+ @html = obj.fetch('html', '')
633
+ @parser = init_nokogiri
634
+ @score = obj.fetch('score', 0.0)
562
635
 
563
- Wgit::Utils.process_str(@html, encode: encode_html)
636
+ Wgit::Utils.sanitize(@html, encode: encode)
564
637
 
565
638
  # Dynamically run the init_*_from_object methods.
566
639
  Document.private_instance_methods(false).each do |method|
@@ -571,11 +644,11 @@ module Wgit
571
644
  end
572
645
  end
573
646
 
574
- # Initialises an instance variable and defines a getter method for it.
647
+ # Initialises an instance variable and defines an accessor method for it.
575
648
  #
576
649
  # @param var [Symbol] The name of the variable to be initialized.
577
650
  # @param value [Object] The newly initialized variable's value.
578
- # @return [Symbol] The name of the newly created getter method.
651
+ # @return [Symbol] The name of the defined getter method.
579
652
  def init_var(var, value)
580
653
  # instance_var_name starts with @, var_name doesn't.
581
654
  var = var.to_s
@@ -583,10 +656,9 @@ module Wgit
583
656
  instance_var_name = "@#{var_name}".to_sym
584
657
 
585
658
  instance_variable_set(instance_var_name, value)
659
+ Wgit::Document.attr_accessor(var_name)
586
660
 
587
- Document.send(:define_method, var_name) do
588
- instance_variable_get(instance_var_name)
589
- end
661
+ var_name
590
662
  end
591
663
 
592
664
  alias content html