wgit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 70e6ec83e53550bcfe180b66248747763314c33738ecd0fabddca65dbb3918b0
4
- data.tar.gz: a1c3d1e8bb6d078731876093cb2beed0ea4da65cb03dff1ead975f714bd3d9b5
3
+ metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
4
+ data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
5
5
  SHA512:
6
- metadata.gz: ab519107506ec0798995cb52f986040da12d1a5c59c0c36f84bf8e09d847fd5ab83b3bd7f47ff95b6e474a35d855b176fdc9d245b1cef356781eadb21a4a84f7
7
- data.tar.gz: '010748005ded444f44812c8b6022d258b60a3485dcef8b78c562012428e3955a7fbfe80f53a570cb9f6b524042388949cf2cf08d6a1b27581f2cfd9b424603b0'
6
+ metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
7
+ data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee
data/lib/wgit/crawler.rb CHANGED
@@ -4,11 +4,13 @@ require_relative 'url'
4
4
  require_relative 'document'
5
5
  require_relative 'utils'
6
6
  require_relative 'assertable'
7
+ require_relative 'response'
7
8
  require 'typhoeus'
8
9
 
9
10
  module Wgit
10
11
  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
11
- # serialising their HTML into Wgit::Document instances.
12
+ # serialising their HTML into Wgit::Document instances. This is the only Wgit
13
+ # class which contains network logic e.g. request/response handling.
12
14
  class Crawler
13
15
  include Assertable
14
16
 
@@ -20,8 +22,11 @@ module Wgit
20
22
  # before raising an error. Set to 0 to disable time outs completely.
21
23
  attr_accessor :time_out
22
24
 
23
- # The Typhoeus::Response of the most recently crawled URL or nil.
24
- # See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
25
+ # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
26
+ # crawling more than just HTML e.g. images etc.
27
+ attr_accessor :encode_html
28
+
29
+ # The Wgit::Response of the most recently crawled URL.
25
30
  attr_reader :last_response
26
31
 
27
32
  # Initializes and returns a Wgit::Crawler instance.
@@ -31,13 +36,18 @@ module Wgit
31
36
  # @param time_out [Integer, Float] The maximum amount of time (in seconds)
32
37
  # a crawl request has to complete before raising an error. Set to 0 to
33
38
  # disable time outs completely.
34
- def initialize(redirect_limit: 5, time_out: 5)
39
+ # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
40
+ # crawled. Set to false if crawling more than just HTML e.g. images etc.
41
+ def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
35
42
  @redirect_limit = redirect_limit
36
43
  @time_out = time_out
44
+ @encode_html = encode_html
37
45
  end
38
46
 
39
47
  # Crawls an entire website's HTML pages by recursively going through
40
- # its internal links. Each crawled Document is yielded to a block.
48
+ # its internal <a> links. Each crawled Document is yielded to a block. Use
49
+ # the allow and disallow paths params to partially and selectively crawl a
50
+ # site.
41
51
  #
42
52
  # Only redirects to the same host are followed. For example, the Url
43
53
  # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -50,20 +60,26 @@ module Wgit
50
60
  # @param url [Wgit::Url] The base URL of the website to be crawled.
51
61
  # It is recommended that this URL be the index page of the site to give a
52
62
  # greater chance of finding all pages within that site/host.
63
+ # @param allow_paths [String, Array<String>] Filters links by selecting
64
+ # them only if their path includes one of allow_paths.
65
+ # @param disallow_paths [String, Array<String>] Filters links by rejecting
66
+ # them if their path includes one of disallow_paths.
53
67
  # @yield [doc] Given each crawled page (Wgit::Document) of the site.
54
68
  # A block is the only way to interact with each crawled Document.
55
69
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
56
70
  # from all of the site's pages or nil if the url could not be
57
71
  # crawled successfully.
58
- def crawl_site(url, &block)
72
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
59
73
  doc = crawl_url(url, &block)
60
74
  return nil if doc.nil?
61
75
 
62
- opts = { follow_external_redirects: false, host: url.to_base }
76
+ crawl_opts = { follow_external_redirects: false, host: url.to_base }
77
+ link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
78
+
63
79
  alt_url = url.end_with?('/') ? url.chop : url + '/'
64
80
  crawled = [url, alt_url]
65
81
  externals = doc.external_links
66
- internals = get_internal_links(doc)
82
+ internals = get_internal_links(doc, link_opts)
67
83
 
68
84
  return doc.external_links.uniq if internals.empty?
69
85
 
@@ -76,12 +92,12 @@ module Wgit
76
92
 
77
93
  links.each do |link|
78
94
  orig_link = link.dup
79
- doc = crawl_url(link, opts, &block)
95
+ doc = crawl_url(link, crawl_opts, &block)
80
96
 
81
97
  crawled.push(orig_link, link) # Push both in case of redirects.
82
98
  next if doc.nil?
83
99
 
84
- internals.concat(get_internal_links(doc))
100
+ internals.concat(get_internal_links(doc, link_opts))
85
101
  externals.concat(doc.external_links)
86
102
  end
87
103
  end
@@ -141,7 +157,7 @@ module Wgit
141
157
  host: host
142
158
  )
143
159
 
144
- doc = Wgit::Document.new(url, html)
160
+ doc = Wgit::Document.new(url, html, encode_html: @encode_html)
145
161
  yield(doc) if block_given?
146
162
 
147
163
  doc.empty? ? nil : doc
@@ -149,7 +165,7 @@ module Wgit
149
165
 
150
166
  protected
151
167
 
152
- # Fetches the url HTML String or nil. Handles any errors that arise
168
+ # Returns the url HTML String or nil. Handles any errors that arise
153
169
  # and sets the @last_response. Errors or any HTTP response that doesn't
154
170
  # return a HTML body will be ignored, returning nil.
155
171
  #
@@ -166,31 +182,33 @@ module Wgit
166
182
  # @return [String, nil] The crawled HTML or nil if the crawl was
167
183
  # unsuccessful.
168
184
  def fetch(url, follow_external_redirects: true, host: nil)
169
- response = nil
170
- crawl_duration = nil
185
+ response = Wgit::Response.new
171
186
 
172
- response = resolve(
187
+ resolve(
173
188
  url,
189
+ response,
174
190
  follow_external_redirects: follow_external_redirects,
175
191
  host: host
176
192
  )
177
- crawl_duration = response.total_time
178
193
 
179
- response.body.empty? ? nil : response.body
194
+ response.body_or_nil
180
195
  rescue StandardError => e
181
- Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
196
+ Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
182
197
 
183
198
  nil
184
199
  ensure
185
- url.crawled = true # Also sets date_crawled underneath.
186
- url.crawl_duration = crawl_duration
187
- @last_response = response
200
+ url.crawled = true # Sets date_crawled underneath.
201
+ url.crawl_duration = response.total_time
202
+
203
+ @last_response = response
188
204
  end
189
205
 
190
- # Resolves the url by handling any redirects. The response object will be
191
- # returned or an error raised.
206
+ # GETs the given url, resolving any redirects. The given response object
207
+ # will be enriched.
192
208
  #
193
- # @param url [Wgit::Url] The URL to resolve.
209
+ # @param url [Wgit::Url] The URL to GET and resolve.
210
+ # @param response [Wgit::Response] The response to enrich. Modifies by
211
+ # reference.
194
212
  # @param follow_external_redirects [Boolean] Whether or not to follow
195
213
  # an external redirect. If false, you must also provide a `host:`
196
214
  # parameter.
@@ -200,91 +218,162 @@ module Wgit
200
218
  # 'http://www.example.com' will only allow redirects for Urls with a
201
219
  # `to_host` value of 'www.example.com'.
202
220
  # @raise [StandardError] If a redirect isn't allowed etc.
203
- # @return [Typhoeus::Response] The HTTP response of the GET request.
204
- def resolve(url, follow_external_redirects: true, host: nil)
205
- response = nil
206
- redirect_count = 0
207
- total_net_time = 0.0
208
-
221
+ def resolve(url, response, follow_external_redirects: true, host: nil)
209
222
  loop do
210
- response = get_response(url)
211
- total_net_time += response.total_time if response.total_time
212
-
213
- # Break unless it's a redirect.
214
- break unless (response.code >= 300) && (response.code < 400)
223
+ get_response(url, response)
224
+ break unless response.redirect?
215
225
 
216
226
  # Handle response 'Location' header.
217
- location = Wgit::Utils.fetch(response.headers, :location, '')
218
- location = Wgit::Url.new(location)
227
+ location = Wgit::Url.new(response.headers.fetch(:location, ''))
219
228
  raise 'Encountered redirect without Location header' if location.empty?
220
229
 
221
230
  yield(url, response, location) if block_given?
222
231
 
223
- # Handle redirect logic.
232
+ # Validate redirect.
224
233
  if !follow_external_redirects && !location.relative?(host: host)
225
234
  raise "External redirect not allowed - Redirected to: \
226
235
  '#{location}', which is outside of host: '#{host}'"
227
236
  end
228
237
 
229
- raise "Too many redirects, exceeded: #{redirect_count}" \
230
- if redirect_count >= @redirect_limit
231
-
232
- redirect_count += 1
238
+ raise "Too many redirects, exceeded: #{@redirect_limit}" \
239
+ if response.redirect_count >= @redirect_limit
233
240
 
234
241
  # Process the location to be crawled next.
235
242
  location = url.to_base.concat(location) if location.relative?
243
+ response.redirections[url.to_s] = location.to_s
236
244
  url.replace(location) # Update the url on redirect.
237
245
  end
238
-
239
- response.options[:redirect_count] = redirect_count
240
- response.options[:total_time] = total_net_time
241
-
242
- response
243
246
  end
244
247
 
245
- # Performs a HTTP GET request and returns the response.
248
+ # Makes a HTTP request and enriches the given Wgit::Response from it.
246
249
  #
247
250
  # @param url [String] The url to GET. Will call url#normalize if possible.
251
+ # @param response [Wgit::Response] The response to enrich. Modifies by
252
+ # reference.
248
253
  # @raise [StandardError] If a response can't be obtained.
249
- # @return [Typhoeus::Response] The HTTP response of the GET request.
250
- def get_response(url)
251
- url = url.normalize if url.respond_to?(:normalize)
254
+ # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
255
+ def get_response(url, response)
256
+ # Perform a HTTP GET request.
257
+ orig_url = url.to_s
258
+ url = url.normalize if url.respond_to?(:normalize)
259
+
260
+ http_response = http_get(url)
261
+
262
+ # Enrich the given Wgit::Response object.
263
+ response.adapter_response = http_response
264
+ response.url = orig_url
265
+ response.status = http_response.code
266
+ response.headers = http_response.headers
267
+ response.body = http_response.body
268
+ response.ip_address = http_response.primary_ip
269
+ response.add_total_time(http_response.total_time)
270
+
271
+ # Log (debug) the request/response details.
272
+ resp_template = '[http] Response: %s (%s bytes in %s seconds)'
273
+ log_status = (response.status || 0)
274
+ log_total_time = response.total_time.truncate(3)
275
+
276
+ Wgit.logger.debug("[http] Request: #{response.url}")
277
+ Wgit.logger.debug(
278
+ format(resp_template, log_status, response.size, log_total_time)
279
+ )
252
280
 
281
+ # Handle a failed response.
282
+ raise "No response (within timeout: #{@time_out} second(s))" \
283
+ if response.failure?
284
+ end
285
+
286
+ # Performs a HTTP GET request and returns the response.
287
+ #
288
+ # @param url [String] The url to GET.
289
+ # @return [Typhoeus::Response] The HTTP response object.
290
+ def http_get(url)
253
291
  opts = {
254
292
  followlocation: false,
255
293
  timeout: @time_out,
256
294
  accept_encoding: 'gzip',
257
295
  headers: {
258
296
  'User-Agent' => "wgit/#{Wgit::VERSION}",
259
- 'Accept' => 'text/html'
297
+ 'Accept' => 'text/html'
260
298
  }
261
299
  }
262
300
 
263
- response = Typhoeus.get(url, opts)
264
-
265
- # Handle response status code.
266
- raise "No response (within timeout: #{@time_out} second(s))" \
267
- if response.code.zero?
268
-
269
- response
301
+ # See https://rubydoc.info/gems/typhoeus for more info.
302
+ Typhoeus.get(url, opts)
270
303
  end
271
304
 
272
305
  # Returns a doc's internal HTML page links in absolute form; used when
273
- # crawling a site. Override this method in a subclass to change how a site
306
+ # crawling a site. Use the allow and disallow paths params to partially
307
+ # and selectively crawl a site.
308
+ #
309
+ # Override this method in a subclass to change how a site
274
310
  # is crawled; not what is extracted from each page (Document extensions
275
- # should be used for this purpose instead).
311
+ # should be used for this purpose instead). Just remember that only HTML
312
+ # files containing <a> links can keep the crawl going beyond the base URL.
276
313
  #
277
314
  # @param doc [Wgit::Document] The document from which to extract its
278
315
  # internal page links.
316
+ # @param allow_paths [String, Array<String>] Filters links by selecting
317
+ # them only if their path includes one of allow_paths.
318
+ # @param disallow_paths [String, Array<String>] Filters links by rejecting
319
+ # them if their path includes one of disallow_paths.
279
320
  # @return [Array<Wgit::Url>] The internal page links from doc.
280
- def get_internal_links(doc)
281
- doc.internal_absolute_links
282
- .map(&:without_anchor) # Because anchors don't change page content.
283
- .uniq
284
- .reject do |link|
321
+ def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
322
+ links = doc
323
+ .internal_absolute_links
324
+ .map(&:omit_fragment) # Because fragments don't alter content.
325
+ .uniq
326
+ .reject do |link|
285
327
  ext = link.to_extension
286
328
  ext ? !%w[htm html].include?(ext.downcase) : false
287
329
  end
330
+
331
+ return links if allow_paths.nil? && disallow_paths.nil?
332
+
333
+ process_paths(links, allow_paths, disallow_paths)
334
+ end
335
+
336
+ private
337
+
338
+ # Validate and filter by the given URL paths.
339
+ def process_paths(links, allow_paths, disallow_paths)
340
+ raise "You can't provide both allow_paths: and disallow_paths: params" \
341
+ if allow_paths && disallow_paths
342
+
343
+ if allow_paths # White list.
344
+ filter_method = :select
345
+ paths = allow_paths
346
+ else # Black list.
347
+ filter_method = :reject
348
+ paths = disallow_paths
349
+ end
350
+
351
+ paths = [paths] unless paths.is_a?(Array)
352
+ paths = paths
353
+ .compact
354
+ .reject(&:empty?)
355
+ .uniq
356
+ .map { |path| Wgit::Url.new(path).to_path }
357
+
358
+ raise 'The provided paths cannot be empty' if paths.empty?
359
+
360
+ filter_links_by_path(links, filter_method, paths)
361
+ end
362
+
363
+ # Filters links by selecting or rejecting them based on their path.
364
+ def filter_links_by_path(links, filter_method, paths)
365
+ links.send(filter_method) do |link|
366
+ link_path = link.to_path
367
+ next(false) unless link_path
368
+
369
+ match = false
370
+ paths.each do |path|
371
+ match = link_path.start_with?(path)
372
+ break if match
373
+ end
374
+
375
+ match
376
+ end
288
377
  end
289
378
 
290
379
  alias crawl crawl_urls
@@ -220,19 +220,20 @@ module Wgit
220
220
  # @param url [Wgit::Url] The Url to search the DB for.
221
221
  # @return [Boolean] True if url exists, otherwise false.
222
222
  def url?(url)
223
- h = { 'url' => url }
224
- @client[:urls].find(h).any?
223
+ assert_type(url, String) # This includes Wgit::Url's.
224
+ hash = { 'url' => url }
225
+ @client[:urls].find(hash).any?
225
226
  end
226
227
 
227
- # Returns whether or not a record with the given doc 'url' field (which is
228
- # unique) exists in the database's 'documents' collection.
228
+ # Returns whether or not a record with the given doc 'url.url' field
229
+ # (which is unique) exists in the database's 'documents' collection.
229
230
  #
230
231
  # @param doc [Wgit::Document] The Document to search the DB for.
231
232
  # @return [Boolean] True if doc exists, otherwise false.
232
233
  def doc?(doc)
233
- url = doc.respond_to?(:url) ? doc.url : doc
234
- h = { 'url' => url }
235
- @client[:documents].find(h).any?
234
+ assert_type(doc, Wgit::Document)
235
+ hash = { 'url.url' => doc.url }
236
+ @client[:documents].find(hash).any?
236
237
  end
237
238
 
238
239
  ### Update Data ###
@@ -309,7 +310,7 @@ module Wgit
309
310
  # @return [Integer] The number of updated records.
310
311
  def update_doc(doc)
311
312
  assert_type(doc, Wgit::Document)
312
- selection = { url: doc.url }
313
+ selection = { 'url.url' => doc.url }
313
314
  doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
314
315
  update = { '$set' => doc_hash }
315
316
  mutate(true, :documents, selection, update)
@@ -26,7 +26,7 @@ module Wgit
26
26
  raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
27
27
 
28
28
  model = doc.to_h(include_html: false, include_score: false)
29
- model['url'] = self.url(doc.url) # Expand Url String into full object.
29
+ model['url'] = url(doc.url) # Expand Url String into full object.
30
30
 
31
31
  Wgit::Utils.remove_non_bson_types(model)
32
32
  end
@@ -36,7 +36,7 @@ module Wgit
36
36
  # @return [Hash] Insertion fields common to all models.
37
37
  def self.common_insert_data
38
38
  {
39
- date_added: Wgit::Utils.time_stamp,
39
+ date_added: Wgit::Utils.time_stamp,
40
40
  date_modified: Wgit::Utils.time_stamp
41
41
  }
42
42
  end
data/lib/wgit/document.rb CHANGED
@@ -5,7 +5,8 @@ require 'nokogiri'
5
5
  require 'json'
6
6
 
7
7
  module Wgit
8
- # Class modeling a HTML web document. Also doubles as a search result when
8
+ # Class primarily modeling a HTML web document, although other MIME types
9
+ # will work e.g. images etc. Also doubles as a search result when
9
10
  # loading Documents from the database via Wgit::Database#search.
10
11
  #
11
12
  # The initialize method dynamically initializes instance variables from the
@@ -60,11 +61,11 @@ module Wgit
60
61
  # only used if url_or_obj is a String representing the web page's URL.
61
62
  # Otherwise, the HTML comes from the database object. A html of nil will
62
63
  # be defaulted to an empty String.
63
- def initialize(url_or_obj, html = '')
64
+ def initialize(url_or_obj, html = '', encode_html: true)
64
65
  if url_or_obj.is_a?(String)
65
- init_from_strings(url_or_obj, html)
66
+ init_from_strings(url_or_obj, html, encode_html: encode_html)
66
67
  else
67
- init_from_object(url_or_obj)
68
+ init_from_object(url_or_obj, encode_html: encode_html)
68
69
  end
69
70
  end
70
71
 
@@ -91,25 +92,28 @@ module Wgit
91
92
  # instance variables upon Document initialization. See the default
92
93
  # extensions defined in 'document_extensions.rb' as examples.
93
94
  #
94
- # Initialises a private instance variable with the xpath or database object
95
- # result(s). When initialising from HTML, a true singleton value will only
96
- # ever return one result otherwise all xpath results are returned in an
95
+ # Note that defined extensions work for both Documents initialized from
96
+ # HTML (via Wgit::Crawler methods) and from database objects.
97
+ # An extension once defined, initializes a private instance variable with
98
+ # the xpath or database object result(s).
99
+ #
100
+ # When initialising from HTML, a singleton value of true will only
101
+ # ever return one result; otherwise all xpath results are returned in an
97
102
  # Array. When initialising from a database object, the value is taken as
98
103
  # is and singleton is only used to define the default empty value.
99
104
  # If a value cannot be found (in either the HTML or database object), then
100
- # a default will be used. The default value is: singleton ? nil : [].
101
- #
102
- # Note that defined extensions work for both documents initialized from
103
- # the WWW (via Wgit::Crawler methods) and from database objects. This
104
- # effectively implements ORM like behavior using this class.
105
+ # a default will be used. The default value is: `singleton ? nil : []`.
105
106
  #
106
107
  # @param var [Symbol] The name of the variable to be initialised.
107
108
  # @param xpath [String, Object#call] The xpath used to find the element(s)
108
- # of the webpage. Pass a callable object (proc etc.) if you want the
109
+ # of the webpage. Only used when initializing from HTML.
110
+ #
111
+ # Pass a callable object (proc etc.) if you want the
109
112
  # xpath value to be derived on Document initialisation (instead of when
110
113
  # the extension is defined). The call method must return a valid xpath
111
114
  # String.
112
- # @param options [Hash] The options to define an extension with.
115
+ # @param options [Hash] The options to define an extension with. The
116
+ # options are only used when initializing from HTML, not the database.
113
117
  # @option options [Boolean] :singleton The singleton option determines
114
118
  # whether or not the result(s) should be in an Array. If multiple
115
119
  # results are found and singleton is true then the first result will be
@@ -117,16 +121,17 @@ module Wgit
117
121
  # @option options [Boolean] :text_content_only The text_content_only option
118
122
  # if true will use the text content of the Nokogiri result object,
119
123
  # otherwise the Nokogiri object itself is returned. Defaults to true.
120
- # @yield [value, source] Yields the value (Object) about to be assigned to
121
- # the new var and the source (Symbol) of the value (either :html or
122
- # :object). The return value of the block becomes the new var value,
123
- # unless nil. Return nil if you want to inspect but not change the var
124
- # value. The block gets executed when a Document is initialized from html
125
- # or an object e.g. database.
124
+ # @yield [value, source, type] Yields the value (Object) about to be
125
+ # assigned to the new var, the source of the value (Wgit::Document or DB
126
+ # Object) and the source type (Symbol of either :document or :object).
127
+ #
128
+ # The return value of the block becomes the new var value, unless nil.
129
+ # Return nil if you want to inspect but not change the var value. The
130
+ # block is executed when a Wgit::Document is initialized.
126
131
  # @raise [StandardError] If the var param isn't valid.
127
- # @return [Symbol] The first half of the newly defined method names e.g.
128
- # if var == "title" then :init_title is returned.
132
+ # @return [Symbol] The given var Symbol.
129
133
  def self.define_extension(var, xpath, options = {}, &block)
134
+ var = var.to_sym
130
135
  default_options = { singleton: true, text_content_only: true }
131
136
  options = default_options.merge(options)
132
137
 
@@ -149,7 +154,7 @@ module Wgit
149
154
  end
150
155
  Document.send :private, func_name
151
156
 
152
- "init_#{var}".to_sym
157
+ var
153
158
  end
154
159
 
155
160
  # Removes the init_* methods created when an extension is defined.
@@ -189,55 +194,48 @@ module Wgit
189
194
  @html[range]
190
195
  end
191
196
 
192
- # Returns the timestamp of when this Document was crawled.
193
- #
194
- # @return [Time] Time of when this Document was crawled.
195
- def date_crawled
196
- @url.date_crawled
197
- end
198
-
199
- # Returns the duration of the crawl for this Document (in seconds).
200
- #
201
- # @return [Float] The duration of the crawl for this Document.
202
- def crawl_duration
203
- @url.crawl_duration
204
- end
205
-
206
197
  # Returns the base URL of this Wgit::Document. The base URL is either the
207
198
  # <base> element's href value or @url (if @base is nil). If @base is
208
199
  # present and relative, then @url.to_base + @base is returned. This method
209
200
  # should be used instead of `doc.url.to_base` etc. when manually building
210
- # absolute links from relative links.
201
+ # absolute links from relative links; or use `link.prefix_base(doc)`.
211
202
  #
212
203
  # Provide the `link:` parameter to get the correct base URL for that type
213
204
  # of link. For example, a link of `#top` would always return @url because
214
205
  # it applies to that page, not a different one. Query strings work in the
215
206
  # same way. Use this parameter if manually concatting Url's e.g.
216
207
  #
217
- # relative_link = Wgit::Url.new '?q=hello'
208
+ # relative_link = Wgit::Url.new('?q=hello')
218
209
  # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
219
210
  #
220
211
  # This is similar to how Wgit::Document#internal_absolute_links works.
221
212
  #
222
213
  # @param link [Wgit::Url, String] The link to obtain the correct base URL
223
- # for.
214
+ # for; must be relative, not absolute.
215
+ # @raise [StandardError] If link is absolute or if a base URL can't be
216
+ # established e.g. the doc @url is relative and <base> is nil.
224
217
  # @return [Wgit::Url] The base URL of this Document e.g.
225
218
  # 'http://example.com/public'.
226
219
  def base_url(link: nil)
220
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
221
+ if @url.relative? && @base.nil?
222
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
223
+ if @url.relative? && @base&.relative?
224
+
227
225
  get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
228
226
 
229
227
  if link
230
228
  link = Wgit::Url.new(link)
231
229
  raise "link must be relative: #{link}" unless link.relative?
232
230
 
233
- if link.is_anchor? || link.is_query?
231
+ if link.is_fragment? || link.is_query?
234
232
  base_url = @base ? get_base.call : @url
235
- return base_url.without_anchor.without_query
233
+ return base_url.omit_fragment.omit_query
236
234
  end
237
235
  end
238
236
 
239
- base_url = @base ? get_base.call : @url.base
240
- base_url.without_anchor.without_query
237
+ base_url = @base ? get_base.call : @url.to_base
238
+ base_url.omit_fragment.omit_query
241
239
  end
242
240
 
243
241
  # Returns a Hash containing this Document's instance vars.
@@ -340,7 +338,7 @@ module Wgit
340
338
 
341
339
  links = @links
342
340
  .select { |link| link.relative?(host: @url.to_base) }
343
- .map(&:without_base)
341
+ .map(&:omit_base)
344
342
  .map do |link| # Map @url.to_host into / as it's a duplicate.
345
343
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
346
344
  end
@@ -354,7 +352,7 @@ module Wgit
354
352
  #
355
353
  # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
356
354
  def internal_absolute_links
357
- internal_links.map { |link| base_url(link: link).concat(link) }
355
+ internal_links.map { |link| link.prefix_base(self) }
358
356
  end
359
357
 
360
358
  # Returns all external links from this Document in absolute form. External
@@ -366,7 +364,7 @@ module Wgit
366
364
 
367
365
  links = @links
368
366
  .reject { |link| link.relative?(host: @url.to_base) }
369
- .map(&:without_trailing_slash)
367
+ .map(&:omit_trailing_slash)
370
368
 
371
369
  Wgit::Utils.process_arr(links)
372
370
  end
@@ -438,7 +436,7 @@ module Wgit
438
436
  orig_text = @text
439
437
  @text = search(
440
438
  query, case_sensitive: case_sensitive,
441
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
439
+ whole_sentence: whole_sentence, sentence_limit: sentence_limit
442
440
  )
443
441
 
444
442
  orig_text
@@ -473,7 +471,7 @@ module Wgit
473
471
  # @yield [value, source] Given the value (String/Object) before it's set as
474
472
  # an instance variable so that you can inspect/alter the value if
475
473
  # desired. Return nil from the block if you don't want to override the
476
- # value. Also given the source (Symbol) which is always :html.
474
+ # value. Also given the source (self) and the type (Symbol), always :document.
477
475
  # @return [String, Object] The value found in the html or the default value
478
476
  # (singleton ? nil : []).
479
477
  def find_in_html(xpath, singleton: true, text_content_only: true)
@@ -492,7 +490,7 @@ module Wgit
492
490
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
493
491
 
494
492
  if block_given?
495
- new_result = yield(result, :html)
493
+ new_result = yield(result, self, :document)
496
494
  result = new_result unless new_result.nil?
497
495
  end
498
496
 
@@ -519,7 +517,7 @@ module Wgit
519
517
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
520
518
 
521
519
  if block_given?
522
- new_result = yield(result, :object)
520
+ new_result = yield(result, obj, :object)
523
521
  result = new_result unless new_result.nil?
524
522
  end
525
523
 
@@ -529,19 +527,19 @@ module Wgit
529
527
  private
530
528
 
531
529
  # Initialise the Document from URL and HTML Strings.
532
- def init_from_strings(url, html)
530
+ def init_from_strings(url, html, encode_html: true)
533
531
  assert_types(html, [String, NilClass])
534
532
 
535
533
  # We already know url.is_a?(String) so parse into Url unless already so.
536
534
  url = Wgit::Url.parse(url)
537
- url.crawled = true unless url.crawled # Avoid overriding date_crawled.
535
+ url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
538
536
 
539
537
  @url = url
540
538
  @html = html || ''
541
539
  @doc = init_nokogiri
542
540
  @score = 0.0
543
541
 
544
- process_url_and_html
542
+ Wgit::Utils.process_str(@html, encode: encode_html)
545
543
 
546
544
  # Dynamically run the init_*_from_html methods.
547
545
  Document.private_instance_methods(false).each do |method|
@@ -554,7 +552,7 @@ module Wgit
554
552
 
555
553
  # Initialise the Document from a Hash like Object containing Strings as
556
554
  # keys e.g. database collection object or Hash.
557
- def init_from_object(obj)
555
+ def init_from_object(obj, encode_html: true)
558
556
  assert_respond_to(obj, :fetch)
559
557
 
560
558
  @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -562,7 +560,7 @@ module Wgit
562
560
  @doc = init_nokogiri
563
561
  @score = obj.fetch('score', 0.0)
564
562
 
565
- process_url_and_html
563
+ Wgit::Utils.process_str(@html, encode: encode_html)
566
564
 
567
565
  # Dynamically run the init_*_from_object methods.
568
566
  Document.private_instance_methods(false).each do |method|
@@ -573,12 +571,6 @@ module Wgit
573
571
  end
574
572
  end
575
573
 
576
- # Ensure the @url and @html Strings are correctly encoded etc.
577
- def process_url_and_html
578
- @url = Wgit::Utils.process_str(@url)
579
- @html = Wgit::Utils.process_str(@html)
580
- end
581
-
582
574
  # Initialises an instance variable and defines a getter method for it.
583
575
  #
584
576
  # @param var [Symbol] The name of the variable to be initialized.
@@ -597,6 +589,7 @@ module Wgit
597
589
  end
598
590
  end
599
591
 
592
+ alias content html
600
593
  alias statistics stats
601
594
  alias internal_urls internal_links
602
595
  alias internal_absolute_urls internal_absolute_links