wgit 0.5.1 → 0.6.0

@@ -5,6 +5,7 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
+ require 'set'
  require 'typhoeus'

  module Wgit
@@ -14,17 +15,26 @@ module Wgit
  class Crawler
  include Assertable

+ # The URL file extensions (from `<a>` hrefs) which will be crawled by
+ # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+ # doesn't keep the crawl of the site going. All URL's without a file
+ # extension will be crawled, because they're assumed to be HTML.
+ SUPPORTED_FILE_EXTENSIONS = Set.new(%w[
+ asp aspx cfm cgi htm html htmlx jsp php
+ ])
+
  # The amount of allowed redirects before raising an error. Set to 0 to
- # disable redirects completely.
+ # disable redirects completely; or you can pass `follow_redirects: false`
+ # to any Wgit::Crawler.crawl_* method.
  attr_accessor :redirect_limit

  # The maximum amount of time (in seconds) a crawl request has to complete
  # before raising an error. Set to 0 to disable time outs completely.
  attr_accessor :time_out

- # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
- # crawling more than just HTML e.g. images etc.
- attr_accessor :encode_html
+ # Whether or not to UTF-8 encode the response body once crawled. Set to
+ # false if crawling more than just HTML e.g. images.
+ attr_accessor :encode

  # The Wgit::Response of the most recently crawled URL.
  attr_reader :last_response
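In the Crawler class, the `encode_html` accessor becomes `encode`; the behaviour is unchanged. A minimal sketch of the rename when constructing a crawler (the option values shown are simply the documented defaults):

    require 'wgit'

    # 0.5.1
    # crawler = Wgit::Crawler.new(redirect_limit: 5, time_out: 5, encode_html: true)

    # 0.6.0 - encode_html: is now encode:
    crawler = Wgit::Crawler.new(redirect_limit: 5, time_out: 5, encode: true)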
@@ -36,18 +46,22 @@ module Wgit
  # @param time_out [Integer, Float] The maximum amount of time (in seconds)
  # a crawl request has to complete before raising an error. Set to 0 to
  # disable time outs completely.
- # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
- # crawled. Set to false if crawling more than just HTML e.g. images etc.
- def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
+ # @param encode [Boolean] Whether or not to UTF-8 encode the response body
+ # once crawled. Set to false if crawling more than just HTML e.g. images.
+ def initialize(redirect_limit: 5, time_out: 5, encode: true)
  @redirect_limit = redirect_limit
  @time_out = time_out
- @encode_html = encode_html
+ @encode = encode
  end

  # Crawls an entire website's HTML pages by recursively going through
- # its internal <a> links. Each crawled Document is yielded to a block. Use
- # the allow and disallow paths params to partially and selectively crawl a
- # site.
+ # its internal `<a>` links. Each crawled Document is yielded to a block.
+ # Use `doc.empty?` to determine if the crawled link is valid.
+ #
+ # Use the allow and disallow paths params to partially and selectively
+ # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
+ # Note that each path must NOT start with a slash; the only exception being
+ # a `/` on its own with no other characters, referring to the index page.
  #
  # Only redirects to the same host are followed. For example, the Url
  # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -61,65 +75,64 @@ module Wgit
  # It is recommended that this URL be the index page of the site to give a
  # greater chance of finding all pages within that site/host.
  # @param allow_paths [String, Array<String>] Filters links by selecting
- # them only if their path includes one of allow_paths.
+ # them if their path `File.fnmatch?` one of allow_paths.
  # @param disallow_paths [String, Array<String>] Filters links by rejecting
- # them if their path includes one of disallow_paths.
+ # them if their path `File.fnmatch?` one of disallow_paths.
  # @yield [doc] Given each crawled page (Wgit::Document) of the site.
  # A block is the only way to interact with each crawled Document.
+ # Use `doc.empty?` to determine if the page is valid.
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
- # from all of the site's pages or nil if the url could not be
+ # from all of the site's pages or nil if the given url could not be
  # crawled successfully.
  def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
  doc = crawl_url(url, &block)
  return nil if doc.nil?

- crawl_opts = { follow_external_redirects: false, host: url.to_base }
- link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
-
+ path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
  alt_url = url.end_with?('/') ? url.chop : url + '/'
- crawled = [url, alt_url]
- externals = doc.external_links
- internals = get_internal_links(doc, link_opts)

- return doc.external_links.uniq if internals.empty?
+ crawled = Set.new([url, alt_url])
+ externals = Set.new(doc.external_links)
+ internals = Set.new(get_internal_links(doc, path_opts))

- loop do
- crawled.uniq!
- internals.uniq!
+ return externals.to_a if internals.empty?

+ loop do
  links = internals - crawled
  break if links.empty?

  links.each do |link|
  orig_link = link.dup
- doc = crawl_url(link, crawl_opts, &block)
+ doc = crawl_url(link, follow_redirects: :host, &block)

- crawled.push(orig_link, link) # Push both in case of redirects.
+ crawled += [orig_link, link] # Push both links in case of redirects.
  next if doc.nil?

- internals.concat(get_internal_links(doc, link_opts))
- externals.concat(doc.external_links)
+ internals += get_internal_links(doc, path_opts)
+ externals += doc.external_links
  end
  end

- externals.uniq
+ externals.to_a
  end

  # Crawls one or more individual urls using Wgit::Crawler#crawl_url
  # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
  #
  # @param urls [*Wgit::Url] The Url's to crawl.
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ # This value will be used for all urls crawled.
  # @yield [doc] Given each crawled page (Wgit::Document); this is the only
  # way to interact with them.
  # @raise [StandardError] If no urls are provided.
  # @return [Wgit::Document] The last Document crawled.
- def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+ def crawl_urls(*urls, follow_redirects: true, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = {
- follow_external_redirects: follow_external_redirects,
- host: host
- }
+ opts = { follow_redirects: follow_redirects }
  doc = nil

  Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
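With the switch to glob matching, allow/disallow paths are patterns rather than substrings, and (now that process_paths no longer forbids passing both, see its hunk further down) the two can be combined. A hedged sketch of a partial site crawl, with a made-up site and paths:

    crawler = Wgit::Crawler.new
    url = Wgit::Url.new('http://example.com')

    # Crawl only pages under blog/, skipping drafts; note: no leading slash.
    externals = crawler.crawl_site(
      url, allow_paths: 'blog/*', disallow_paths: 'blog/drafts/*'
    ) do |doc|
      puts doc.url unless doc.empty? # doc.empty? marks an invalid/failed page.
    end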
@@ -131,33 +144,22 @@ module Wgit
  # occurs.
  #
  # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
- # @param follow_external_redirects [Boolean] Whether or not to follow
- # an external redirect. External meaning to a different host. False will
- # return nil for such a crawl. If false, you must also provide a `host:`
- # parameter.
- # @param host [Wgit::Url, String] Specify the host by which
- # an absolute redirect is determined to be internal or not. Must be
- # absolute and contain a protocol prefix. For example, a `host:` of
- # 'http://www.example.com' will only allow redirects for Url's with a
- # `to_host` value of 'www.example.com'.
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
  # crawl was successful or not. Therefore, Document#url etc. can be used.
  # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
  # crawl was unsuccessful.
- def crawl_url(url, follow_external_redirects: true, host: nil)
+ def crawl_url(url, follow_redirects: true)
  # A String url isn't allowed because it's passed by value not reference,
  # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
  assert_type(url, Wgit::Url)
- raise 'host cannot be nil if follow_external_redirects is false' \
- if !follow_external_redirects && host.nil?

- html = fetch(
- url,
- follow_external_redirects: follow_external_redirects,
- host: host
- )
+ html = fetch(url, follow_redirects: follow_redirects)
+ doc = Wgit::Document.new(url, html, encode: @encode)

- doc = Wgit::Document.new(url, html, encode_html: @encode_html)
  yield(doc) if block_given?

  doc.empty? ? nil : doc
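Callers of crawl_url (and crawl_urls) migrate from the follow_external_redirects:/host: pair to the single follow_redirects: param. A sketch of the roughly equivalent calls (example.com is a placeholder):

    crawler = Wgit::Crawler.new
    url = Wgit::Url.new('http://example.com')

    # 0.5.1
    # crawler.crawl_url(url, follow_external_redirects: false, host: url.to_base)

    # 0.6.0 - scope redirects to the same host...
    doc = crawler.crawl_url(url, follow_redirects: :host)

    # ...or forbid redirects altogether.
    doc = crawler.crawl_url(url, follow_redirects: false)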
@@ -171,26 +173,18 @@ module Wgit
  #
  # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
  # reference and gets modified as a result of the fetch/crawl.
- # @param follow_external_redirects [Boolean] Whether or not to follow
- # an external redirect. False will return nil for such a crawl. If false,
- # you must also provide a `host:` parameter.
- # @param host [Wgit::Url, String] Specify the host by which
- # an absolute redirect is determined to be internal or not. Must be
- # absolute and contain a protocol prefix. For example, a `host:` of
- # 'http://www.example.com' will only allow redirects for Urls with a
- # `to_host` value of 'www.example.com'.
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ # @raise [StandardError] If url isn't valid and absolute.
  # @return [String, nil] The crawled HTML or nil if the crawl was
  # unsuccessful.
- def fetch(url, follow_external_redirects: true, host: nil)
+ def fetch(url, follow_redirects: true)
  response = Wgit::Response.new
+ raise "Invalid url: #{url}" if url.invalid?

- resolve(
- url,
- response,
- follow_external_redirects: follow_external_redirects,
- host: host
- )
-
+ resolve(url, response, follow_redirects: follow_redirects)
  response.body_or_nil
  rescue StandardError => e
  Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -209,16 +203,15 @@ module Wgit
  # @param url [Wgit::Url] The URL to GET and resolve.
  # @param response [Wgit::Response] The response to enrich. Modifies by
  # reference.
- # @param follow_external_redirects [Boolean] Whether or not to follow
- # an external redirect. If false, you must also provide a `host:`
- # parameter.
- # @param host [Wgit::Url, String] Specify the host by which
- # an absolute redirect is determined to be internal or not. Must be
- # absolute and contain a protocol prefix. For example, a `host:` of
- # 'http://www.example.com' will only allow redirects for Urls with a
- # `to_host` value of 'www.example.com'.
+ # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+ # redirects. Pass a Symbol to limit where the redirect is allowed to go
+ # e.g. :host only allows redirects within the same host. Choose from
+ # :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @raise [StandardError] If a redirect isn't allowed etc.
- def resolve(url, response, follow_external_redirects: true, host: nil)
+ def resolve(url, response, follow_redirects: true)
+ orig_url_base = url.to_url.to_base # Recorded before any redirects.
+ follow_redirects, within = redirect?(follow_redirects)
+
  loop do
  get_response(url, response)
  break unless response.redirect?
@@ -229,10 +222,11 @@ module Wgit

  yield(url, response, location) if block_given?

- # Validate redirect.
- if !follow_external_redirects && !location.relative?(host: host)
- raise "External redirect not allowed - Redirected to: \
- '#{location}', which is outside of host: '#{host}'"
+ # Validate if the redirect is allowed.
+ raise "Redirect not allowed: #{location}" unless follow_redirects
+
+ if within && !location.relative?(within => orig_url_base)
+ raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
  end

  raise "Too many redirects, exceeded: #{@redirect_limit}" \
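The scope check above leans on Wgit::Url#relative?, with the Symbol turned into a keyword (e.g. :host becomes host: orig_url_base). Roughly, and with illustrative URLs:

    origin   = Wgit::Url.new('http://www.example.com/news')
    redirect = Wgit::Url.new('http://www.example.com/about')
    external = Wgit::Url.new('http://other.com/about')

    redirect.relative?(host: origin.to_base) # => true  (redirect is followed)
    external.relative?(host: origin.to_base) # => false (#resolve raises instead)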
@@ -268,15 +262,8 @@ module Wgit
  response.ip_address = http_response.primary_ip
  response.add_total_time(http_response.total_time)

- # Log (debug) the request/response details.
- resp_template = '[http] Response: %s (%s bytes in %s seconds)'
- log_status = (response.status || 0)
- log_total_time = response.total_time.truncate(3)
-
- Wgit.logger.debug("[http] Request: #{response.url}")
- Wgit.logger.debug(
- format(resp_template, log_status, response.size, log_total_time)
- )
+ # Log the request/response details.
+ log_http(response)

  # Handle a failed response.
  raise "No response (within timeout: #{@time_out} second(s))" \
@@ -304,28 +291,29 @@ module Wgit

  # Returns a doc's internal HTML page links in absolute form; used when
  # crawling a site. Use the allow and disallow paths params to partially
- # and selectively crawl a site.
+ # and selectively crawl a site; the glob syntax is supported e.g.
+ # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
  #
  # Override this method in a subclass to change how a site
- # is crawled; not what is extracted from each page (Document extensions
+ # is crawled, not what is extracted from each page (Document extensions
  # should be used for this purpose instead). Just remember that only HTML
- # files containing <a> links can keep the crawl going beyond the base URL.
+ # files containing `<a>` links keep the crawl going beyond the base URL.
  #
  # @param doc [Wgit::Document] The document from which to extract it's
- # internal page links.
+ # internal (absolute) page links.
  # @param allow_paths [String, Array<String>] Filters links by selecting
- # them only if their path includes one of allow_paths.
+ # them if their path `File.fnmatch?` one of allow_paths.
  # @param disallow_paths [String, Array<String>] Filters links by rejecting
- # them if their path includes one of disallow_paths.
+ # them if their path `File.fnmatch?` one of disallow_paths.
  # @return [Array<Wgit::Url>] The internal page links from doc.
  def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
  links = doc
  .internal_absolute_links
  .map(&:omit_fragment) # Because fragments don't alter content.
  .uniq
- .reject do |link|
+ .select do |link|
  ext = link.to_extension
- ext ? !%w[htm html].include?(ext.downcase) : false
+ ext ? SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
  end

  return links if allow_paths.nil? && disallow_paths.nil?
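The old hard-coded htm/html check becomes a whitelist: a link is kept when it has no extension (assumed HTML) or when its extension appears in SUPPORTED_FILE_EXTENSIONS. The predicate in isolation, with invented links:

    keep = lambda do |link|
      ext = link.to_extension
      ext ? Wgit::Crawler::SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
    end

    keep.call(Wgit::Url.new('http://example.com/about'))     # => true  (no extension)
    keep.call(Wgit::Url.new('http://example.com/index.php')) # => true  (php is whitelisted)
    keep.call(Wgit::Url.new('http://example.com/logo.png'))  # => false (not HTML)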
@@ -335,40 +323,68 @@ module Wgit

  private

+ # Returns whether or not to follow redirects, and within what context e.g.
+ # :host, :domain etc.
+ def redirect?(follow_redirects)
+ return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+ unless [true, false].include?(follow_redirects)
+ raise "follow_redirects: must be a Boolean or Symbol, not: \
+ #{follow_redirects}"
+ end
+
+ [follow_redirects, nil]
+ end
+
+ # Log (at debug level) the HTTP request/response details.
+ def log_http(response)
+ resp_template = '[http] Response: %s (%s bytes in %s seconds)'
+ log_status = (response.status || 0)
+ log_total_time = response.total_time.truncate(3)
+
+ Wgit.logger.debug("[http] Request: #{response.url}")
+ Wgit.logger.debug(
+ format(resp_template, log_status, response.size, log_total_time)
+ )
+ end
+
  # Validate and filter by the given URL paths.
  def process_paths(links, allow_paths, disallow_paths)
- raise "You can't provide both allow_paths: and disallow_paths: params" \
- if allow_paths && disallow_paths
-
- if allow_paths # White list.
- filter_method = :select
- paths = allow_paths
- else # Black list.
- filter_method = :reject
- paths = disallow_paths
+ if allow_paths
+ paths = validate_paths(allow_paths)
+ filter_links(links, :select!, paths)
  end

+ if disallow_paths
+ paths = validate_paths(disallow_paths)
+ filter_links(links, :reject!, paths)
+ end
+
+ links
+ end
+
+ # Validate the paths are suitable for filtering.
+ def validate_paths(paths)
  paths = [paths] unless paths.is_a?(Array)
- paths = paths
- .compact
- .reject(&:empty?)
- .uniq
- .map { |path| Wgit::Url.new(path).to_path }
+ raise 'The provided paths must all be Strings' \
+ unless paths.all? { |path| path.is_a?(String) }

+ Wgit::Utils.process_arr(paths, encode: false)
  raise 'The provided paths cannot be empty' if paths.empty?

- filter_links_by_path(links, filter_method, paths)
+ paths
  end

- # Filters links by selecting or rejecting them based on their path.
- def filter_links_by_path(links, filter_method, paths)
+ # Filters links by selecting/rejecting them based on their path.
+ # Uses File.fnmatch? so that globbing is supported.
+ def filter_links(links, filter_method, paths)
  links.send(filter_method) do |link|
- link_path = link.to_path
- next(false) unless link_path
+ # Turn http://example.com into / meaning index.
+ link = link.to_endpoint == '/' ? '/' : link.omit_base

  match = false
- paths.each do |path|
- match = link_path.start_with?(path)
+ paths.each do |pattern|
+ match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
  break if match
  end
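Because filtering now goes through File.fnmatch? with File::FNM_EXTGLOB, ordinary glob patterns (including brace groups) are matched against the link with its base removed. For instance, with made-up paths:

    File.fnmatch?('wiki/*', 'wiki/Home', File::FNM_EXTGLOB)                   # => true
    File.fnmatch?('wiki/*', 'about', File::FNM_EXTGLOB)                       # => false
    File.fnmatch?('blog/{2019,2020}/*', 'blog/2020/june', File::FNM_EXTGLOB)  # => true

The hunks that follow are against the Document class rather than the Crawler.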
 
@@ -7,41 +7,31 @@ require 'json'
  module Wgit
  # Class primarily modeling a HTML web document, although other MIME types
  # will work e.g. images etc. Also doubles as a search result when
- # loading Documents from the database via Wgit::Database#search.
+ # loading Documents from the database via `Wgit::Database#search`.
  #
  # The initialize method dynamically initializes instance variables from the
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
  # Document class can be easily extended allowing you to pull out the bits of
- # a webpage that are important to you. See Wgit::Document.define_extension.
+ # a webpage that are important to you. See `Wgit::Document.define_extension`.
  class Document
  include Assertable

  # Regex for the allowed var names when defining an extension.
  REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze

- # The HTML elements that make up the visible text on a page.
- # These elements are used to initialize the @text of the Document.
- # See the README.md for how to add to this Array dynamically.
- @text_elements = %i[
- dd div dl dt figcaption figure hr li
- main ol p pre span ul h1 h2 h3 h4 h5
- ]
-
- class << self
- # Class level instance reader method for @text_elements.
- attr_reader :text_elements
- end
+ # The xpath used to extract the visible text on a page.
+ TEXT_ELEMENTS_XPATH = '//*/text()'.freeze

  # The URL of the webpage, an instance of Wgit::Url.
  attr_reader :url

- # The HTML of the webpage, an instance of String.
+ # The content/HTML of the document, an instance of String.
  attr_reader :html

  # The Nokogiri::HTML document object initialized from @html.
  attr_reader :doc

- # The score is only used following a Database#search and records matches.
+ # The score is only used following a `Database#search` and records matches.
  attr_reader :score

  # Initialize takes either two strings (representing the URL and HTML) or an
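The fixed whitelist of text elements (together with the text_elements_xpath builder removed further down) is replaced by one catch-all xpath, so @text is now built from every element's text nodes. A rough illustration of what that xpath selects:

    require 'wgit' # pulls in nokogiri

    html = '<p>Hello <b>world</b></p>'
    Nokogiri::HTML(html).xpath(Wgit::Document::TEXT_ELEMENTS_XPATH).map(&:text)
    # => ["Hello ", "world"]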
@@ -50,44 +40,29 @@ module Wgit
  # pages retrieved from the database.
  #
  # During initialisation, the Document will call any private
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
+ # `init_*_from_html` and `init_*_from_object` methods it can find. See the
  # README.md and Wgit::Document.define_extension method for more details.
  #
- # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
+ # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
  # MongoDB collection object. The Object's :fetch method should support
  # Strings as keys.
- # @param html [String, NilClass] The crawled web page's HTML. This param is
- # only used if url_or_obj is a String representing the web page's URL.
- # Otherwise, the HTML comes from the database object. A html of nil will
- # be defaulted to an empty String.
- def initialize(url_or_obj, html = '', encode_html: true)
+ # @param html [String, NilClass] The crawled web page's content/HTML. This
+ # param is only used if url_or_obj is a String representing the web
+ # page's URL. Otherwise, the HTML comes from the database object. A html
+ # of nil will be defaulted to an empty String.
+ # @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
+ # false if the Document content is an image etc.
+ def initialize(url_or_obj, html = '', encode: true)
  if url_or_obj.is_a?(String)
- init_from_strings(url_or_obj, html, encode_html: encode_html)
+ init_from_strings(url_or_obj, html, encode: encode)
  else
- init_from_object(url_or_obj, encode_html: encode_html)
+ init_from_object(url_or_obj, encode: encode)
  end
  end

  ### Document Class Methods ###

- # Uses Document.text_elements to build an xpath String, used to obtain
- # all of the combined text on a webpage.
- #
- # @return [String] An xpath String to obtain a webpage's text elements.
- def self.text_elements_xpath
- xpath = ''
- return xpath if Wgit::Document.text_elements.empty?
-
- el_xpath = '//%s/text()'
- Wgit::Document.text_elements.each_with_index do |el, i|
- xpath += ' | ' unless i.zero?
- xpath += format(el_xpath, el)
- end
-
- xpath
- end
-
  # Defines an extension, which is a way to serialise HTML elements into
  # instance variables upon Document initialization. See the default
  # extensions defined in 'document_extensions.rb' as examples.
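The same keyword rename applies when constructing a Document directly. A minimal sketch with throwaway HTML:

    # 0.5.1
    # doc = Wgit::Document.new('http://example.com', '<html><p>Hi</p></html>', encode_html: true)

    # 0.6.0
    doc = Wgit::Document.new('http://example.com', '<html><p>Hi</p></html>', encode: true)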
@@ -105,35 +80,36 @@ module Wgit
  # a default will be used. The default value is: `singleton ? nil : []`.
  #
  # @param var [Symbol] The name of the variable to be initialised.
- # @param xpath [String, Object#call] The xpath used to find the element(s)
+ # @param xpath [String, #call] The xpath used to find the element(s)
  # of the webpage. Only used when initializing from HTML.
  #
  # Pass a callable object (proc etc.) if you want the
  # xpath value to be derived on Document initialisation (instead of when
  # the extension is defined). The call method must return a valid xpath
  # String.
- # @param options [Hash] The options to define an extension with. The
+ # @param opts [Hash] The options to define an extension with. The
  # options are only used when intializing from HTML, not the database.
- # @option options [Boolean] :singleton The singleton option determines
+ # @option opts [Boolean] :singleton The singleton option determines
  # whether or not the result(s) should be in an Array. If multiple
  # results are found and singleton is true then the first result will be
  # used. Defaults to true.
- # @option options [Boolean] :text_content_only The text_content_only option
+ # @option opts [Boolean] :text_content_only The text_content_only option
  # if true will use the text content of the Nokogiri result object,
  # otherwise the Nokogiri object itself is returned. Defaults to true.
- # @yield [value, source, type] Yields the value (Object) about to be
- # assigned to the new var, the source of the value (Wgit::Document or DB
- # Object) and the source type (Symbol of either :document or :object).
- #
- # The return value of the block becomes the new var value, unless nil.
- # Return nil if you want to inspect but not change the var value. The
- # block is executed when a Wgit::Document is initialized.
+ # @yieldparam value [Object] The value to be assigned to the new var.
+ # @yieldparam source [Wgit::Document, Object] The source of the value.
+ # @yieldparam type [Symbol] The source type, either :document or (DB)
+ # :object.
+ # @yieldreturn [Object] The return value of the block becomes the new var
+ # value, unless nil. Return nil if you want to inspect but not change the
+ # var value. The block is executed when a Wgit::Document is initialized,
+ # regardless of the source.
  # @raise [StandardError] If the var param isn't valid.
- # @return [Symbol] The given var Symbol.
- def self.define_extension(var, xpath, options = {}, &block)
+ # @return [Symbol] The given var Symbol if successful.
+ def self.define_extension(var, xpath, opts = {}, &block)
  var = var.to_sym
- default_options = { singleton: true, text_content_only: true }
- options = default_options.merge(options)
+ defaults = { singleton: true, text_content_only: true }
+ opts = defaults.merge(opts)

  raise "var must match #{REGEX_EXTENSION_NAME}" unless \
  var =~ REGEX_EXTENSION_NAME
@@ -141,7 +117,7 @@ module Wgit
  # Define the private init_*_from_html method for HTML.
  # Gets the HTML's xpath value and creates a var for it.
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
- result = find_in_html(xpath, options, &block)
+ result = find_in_html(xpath, opts, &block)
  init_var(var, result)
  end
  Document.send :private, func_name
@@ -149,7 +125,7 @@ module Wgit
  # Define the private init_*_from_object method for a Database object.
  # Gets the Object's 'key' value and creates a var for it.
  func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
- result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
+ result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
  init_var(var, result)
  end
  Document.send :private, func_name
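A hedged usage sketch of define_extension with the renamed opts param; the :code extension, xpath and HTML below are invented for the example:

    # Serialise every <code> element's text into the Document, as an Array.
    Wgit::Document.define_extension(:code, '//code', singleton: false, text_content_only: true)

    doc = Wgit::Document.new('http://example.com', '<html><code>puts 1</code></html>')
    doc.code # => roughly ["puts 1"]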
@@ -381,7 +357,7 @@ module Wgit
  # original sentence, which ever is less. The algorithm obviously ensures
  # that the search query is visible somewhere in the sentence.
  #
- # @param query [String, Object#to_s] The value to search the document's
+ # @param query [String, #to_s] The value to search the document's
  # @text for.
  # @param case_sensitive [Boolean] Whether character case must match.
  # @param whole_sentence [Boolean] Whether multiple words should be searched
@@ -401,10 +377,12 @@ module Wgit
  results = {}

  @text.each do |sentence|
+ sentence = sentence.strip
+ next if results[sentence]
+
  hits = sentence.scan(regex).count
  next unless hits.positive?

- sentence.strip!
  index = sentence.index(regex) # Index of first match.
  Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)

@@ -422,7 +400,7 @@ module Wgit
  # functionality. The original text is returned; no other reference to it
  # is kept thereafter.
  #
- # @param query [String, Object#to_s] The value to search the document's
+ # @param query [String, #to_s] The value to search the document's
  # @text for.
  # @param case_sensitive [Boolean] Whether character case must match.
  # @param whole_sentence [Boolean] Whether multiple words should be searched
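The #search change strips each sentence before scoring and skips sentences already seen, so identical lines of text no longer produce duplicate results. Roughly, with throwaway HTML:

    doc = Wgit::Document.new(
      'http://example.com',
      '<html><p>  Ruby is great  </p><p>Ruby is great</p></html>'
    )
    doc.search('Ruby') # 0.6.0: the stripped duplicate sentence is only counted once.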
@@ -499,7 +477,7 @@ module Wgit

  # Returns a value from the obj using the given key via obj#fetch.
  #
- # @param obj [Object#fetch] The object containing the key/value.
+ # @param obj [#fetch] The object containing the key/value.
  # @param key [String] Used to find the value in the obj.
  # @param singleton [Boolean] True if a single value, false otherwise.
  # @yield [value, source] Given the value (String/Object) before it's set as
@@ -527,7 +505,7 @@ module Wgit
  private

  # Initialise the Document from URL and HTML Strings.
- def init_from_strings(url, html, encode_html: true)
+ def init_from_strings(url, html, encode: true)
  assert_types(html, [String, NilClass])

  # We already know url.is_a?(String) so parse into Url unless already so.
@@ -539,7 +517,7 @@ module Wgit
  @doc = init_nokogiri
  @score = 0.0

- Wgit::Utils.process_str(@html, encode: encode_html)
+ Wgit::Utils.process_str(@html, encode: encode)

  # Dynamically run the init_*_from_html methods.
  Document.private_instance_methods(false).each do |method|
@@ -552,7 +530,7 @@ module Wgit

  # Initialise the Document from a Hash like Object containing Strings as
  # keys e.g. database collection object or Hash.
- def init_from_object(obj, encode_html: true)
+ def init_from_object(obj, encode: true)
  assert_respond_to(obj, :fetch)

  @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -560,7 +538,7 @@ module Wgit
  @doc = init_nokogiri
  @score = obj.fetch('score', 0.0)

- Wgit::Utils.process_str(@html, encode: encode_html)
+ Wgit::Utils.process_str(@html, encode: encode)

  # Dynamically run the init_*_from_object methods.
  Document.private_instance_methods(false).each do |method|