wgit 0.4.1 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 70e6ec83e53550bcfe180b66248747763314c33738ecd0fabddca65dbb3918b0
- data.tar.gz: a1c3d1e8bb6d078731876093cb2beed0ea4da65cb03dff1ead975f714bd3d9b5
+ metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
+ data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
  SHA512:
- metadata.gz: ab519107506ec0798995cb52f986040da12d1a5c59c0c36f84bf8e09d847fd5ab83b3bd7f47ff95b6e474a35d855b176fdc9d245b1cef356781eadb21a4a84f7
- data.tar.gz: '010748005ded444f44812c8b6022d258b60a3485dcef8b78c562012428e3955a7fbfe80f53a570cb9f6b524042388949cf2cf08d6a1b27581f2cfd9b424603b0'
+ metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
+ data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee
data/lib/wgit/crawler.rb CHANGED
@@ -4,11 +4,13 @@ require_relative 'url'
  require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
+ require_relative 'response'
  require 'typhoeus'

  module Wgit
  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
- # serialising their HTML into Wgit::Document instances.
+ # serialising their HTML into Wgit::Document instances. This is the only Wgit
+ # class which contains network logic e.g. request/response handling.
  class Crawler
  include Assertable

@@ -20,8 +22,11 @@ module Wgit
  # before raising an error. Set to 0 to disable time outs completely.
  attr_accessor :time_out

- # The Typhoeus::Response of the most recently crawled URL or nil.
- # See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
+ # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
+ # crawling more than just HTML e.g. images etc.
+ attr_accessor :encode_html
+
+ # The Wgit::Response of the most recently crawled URL.
  attr_reader :last_response

  # Initializes and returns a Wgit::Crawler instance.
@@ -31,13 +36,18 @@ module Wgit
  # @param time_out [Integer, Float] The maximum amount of time (in seconds)
  # a crawl request has to complete before raising an error. Set to 0 to
  # disable time outs completely.
- def initialize(redirect_limit: 5, time_out: 5)
+ # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
+ # crawled. Set to false if crawling more than just HTML e.g. images etc.
+ def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
  @redirect_limit = redirect_limit
  @time_out = time_out
+ @encode_html = encode_html
  end

  # Crawls an entire website's HTML pages by recursively going through
- # its internal links. Each crawled Document is yielded to a block.
+ # its internal <a> links. Each crawled Document is yielded to a block. Use
+ # the allow and disallow paths params to partially and selectively crawl a
+ # site.
  #
  # Only redirects to the same host are followed. For example, the Url
  # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -50,20 +60,26 @@ module Wgit
  # @param url [Wgit::Url] The base URL of the website to be crawled.
  # It is recommended that this URL be the index page of the site to give a
  # greater chance of finding all pages within that site/host.
+ # @param allow_paths [String, Array<String>] Filters links by selecting
+ # them only if their path includes one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters links by rejecting
+ # them if their path includes one of disallow_paths.
  # @yield [doc] Given each crawled page (Wgit::Document) of the site.
  # A block is the only way to interact with each crawled Document.
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
  # from all of the site's pages or nil if the url could not be
  # crawled successfully.
- def crawl_site(url, &block)
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
  doc = crawl_url(url, &block)
  return nil if doc.nil?

- opts = { follow_external_redirects: false, host: url.to_base }
+ crawl_opts = { follow_external_redirects: false, host: url.to_base }
+ link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+
  alt_url = url.end_with?('/') ? url.chop : url + '/'
  crawled = [url, alt_url]
  externals = doc.external_links
- internals = get_internal_links(doc)
+ internals = get_internal_links(doc, link_opts)

  return doc.external_links.uniq if internals.empty?

@@ -76,12 +92,12 @@ module Wgit

  links.each do |link|
  orig_link = link.dup
- doc = crawl_url(link, opts, &block)
+ doc = crawl_url(link, crawl_opts, &block)

  crawled.push(orig_link, link) # Push both in case of redirects.
  next if doc.nil?

- internals.concat(get_internal_links(doc))
+ internals.concat(get_internal_links(doc, link_opts))
  externals.concat(doc.external_links)
  end
  end
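
For example, the new path filters make a partial site crawl possible. A rough usage sketch (the URL and paths are illustrative; allow_paths: and disallow_paths: cannot be combined):

  require 'wgit'

  crawler = Wgit::Crawler.new
  url     = Wgit::Url.new('http://txti.es')

  # Only follow internal links that match one of the given paths.
  crawler.crawl_site(url, allow_paths: ['about', 'blog']) do |doc|
    puts doc.url
  end

  # Or reject certain paths instead.
  crawler.crawl_site(url, disallow_paths: 'admin') do |doc|
    puts doc.url
  end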
@@ -141,7 +157,7 @@ module Wgit
  host: host
  )

- doc = Wgit::Document.new(url, html)
+ doc = Wgit::Document.new(url, html, encode_html: @encode_html)
  yield(doc) if block_given?

  doc.empty? ? nil : doc
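
With the new encode_html: option and the Wgit::Response reader, a single crawl can be set up and inspected roughly like so (a sketch; the URL is illustrative):

  require 'wgit'

  # Pass encode_html: false when crawling more than just HTML e.g. images.
  crawler = Wgit::Crawler.new(time_out: 10, encode_html: false)
  doc     = crawler.crawl_url(Wgit::Url.new('http://example.com/logo.png'))

  resp = crawler.last_response  # Now a Wgit::Response, not a Typhoeus::Response.
  resp.status                   # => e.g. 200
  resp.total_time               # Accumulated across any redirects.
  resp.redirections             # Hash of url => location redirects followed.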
@@ -149,7 +165,7 @@ module Wgit

  protected

- # Fetches the url HTML String or nil. Handles any errors that arise
+ # Returns the url HTML String or nil. Handles any errors that arise
  # and sets the @last_response. Errors or any HTTP response that doesn't
  # return a HTML body will be ignored, returning nil.
  #
@@ -166,31 +182,33 @@ module Wgit
  # @return [String, nil] The crawled HTML or nil if the crawl was
  # unsuccessful.
  def fetch(url, follow_external_redirects: true, host: nil)
- response = nil
- crawl_duration = nil
+ response = Wgit::Response.new

- response = resolve(
+ resolve(
  url,
+ response,
  follow_external_redirects: follow_external_redirects,
  host: host
  )
- crawl_duration = response.total_time

- response.body.empty? ? nil : response.body
+ response.body_or_nil
  rescue StandardError => e
- Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
+ Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")

  nil
  ensure
- url.crawled = true # Also sets date_crawled underneath.
- url.crawl_duration = crawl_duration
- @last_response = response
+ url.crawled = true # Sets date_crawled underneath.
+ url.crawl_duration = response.total_time
+
+ @last_response = response
  end

- # Resolves the url by handling any redirects. The response object will be
- # returned or an error raised.
+ # GETs the given url, resolving any redirects. The given response object
+ # will be enriched.
  #
- # @param url [Wgit::Url] The URL to resolve.
+ # @param url [Wgit::Url] The URL to GET and resolve.
+ # @param response [Wgit::Response] The response to enrich. Modifies by
+ # reference.
  # @param follow_external_redirects [Boolean] Whether or not to follow
  # an external redirect. If false, you must also provide a `host:`
  # parameter.
@@ -200,91 +218,162 @@ module Wgit
  # 'http://www.example.com' will only allow redirects for Urls with a
  # `to_host` value of 'www.example.com'.
  # @raise [StandardError] If a redirect isn't allowed etc.
- # @return [Typhoeus::Response] The HTTP response of the GET request.
- def resolve(url, follow_external_redirects: true, host: nil)
- response = nil
- redirect_count = 0
- total_net_time = 0.0
-
+ def resolve(url, response, follow_external_redirects: true, host: nil)
  loop do
- response = get_response(url)
- total_net_time += response.total_time if response.total_time
-
- # Break unless it's a redirect.
- break unless (response.code >= 300) && (response.code < 400)
+ get_response(url, response)
+ break unless response.redirect?

  # Handle response 'Location' header.
- location = Wgit::Utils.fetch(response.headers, :location, '')
- location = Wgit::Url.new(location)
+ location = Wgit::Url.new(response.headers.fetch(:location, ''))
  raise 'Encountered redirect without Location header' if location.empty?

  yield(url, response, location) if block_given?

- # Handle redirect logic.
+ # Validate redirect.
  if !follow_external_redirects && !location.relative?(host: host)
  raise "External redirect not allowed - Redirected to: \
  '#{location}', which is outside of host: '#{host}'"
  end

- raise "Too many redirects, exceeded: #{redirect_count}" \
- if redirect_count >= @redirect_limit
-
- redirect_count += 1
+ raise "Too many redirects, exceeded: #{@redirect_limit}" \
+ if response.redirect_count >= @redirect_limit

  # Process the location to be crawled next.
  location = url.to_base.concat(location) if location.relative?
+ response.redirections[url.to_s] = location.to_s
  url.replace(location) # Update the url on redirect.
  end
-
- response.options[:redirect_count] = redirect_count
- response.options[:total_time] = total_net_time
-
- response
  end

- # Performs a HTTP GET request and returns the response.
+ # Makes a HTTP request and enriches the given Wgit::Response from it.
  #
  # @param url [String] The url to GET. Will call url#normalize if possible.
+ # @param response [Wgit::Response] The response to enrich. Modifies by
+ # reference.
  # @raise [StandardError] If a response can't be obtained.
- # @return [Typhoeus::Response] The HTTP response of the GET request.
- def get_response(url)
- url = url.normalize if url.respond_to?(:normalize)
+ # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+ def get_response(url, response)
+ # Perform a HTTP GET request.
+ orig_url = url.to_s
+ url = url.normalize if url.respond_to?(:normalize)
+
+ http_response = http_get(url)
+
+ # Enrich the given Wgit::Response object.
+ response.adapter_response = http_response
+ response.url = orig_url
+ response.status = http_response.code
+ response.headers = http_response.headers
+ response.body = http_response.body
+ response.ip_address = http_response.primary_ip
+ response.add_total_time(http_response.total_time)
+
+ # Log (debug) the request/response details.
+ resp_template = '[http] Response: %s (%s bytes in %s seconds)'
+ log_status = (response.status || 0)
+ log_total_time = response.total_time.truncate(3)
+
+ Wgit.logger.debug("[http] Request: #{response.url}")
+ Wgit.logger.debug(
+ format(resp_template, log_status, response.size, log_total_time)
+ )

+ # Handle a failed response.
+ raise "No response (within timeout: #{@time_out} second(s))" \
+ if response.failure?
+ end
+
+ # Performs a HTTP GET request and returns the response.
+ #
+ # @param url [String] The url to GET.
+ # @return [Typhoeus::Response] The HTTP response object.
+ def http_get(url)
  opts = {
  followlocation: false,
  timeout: @time_out,
  accept_encoding: 'gzip',
  headers: {
  'User-Agent' => "wgit/#{Wgit::VERSION}",
- 'Accept' => 'text/html'
+ 'Accept' => 'text/html'
  }
  }

- response = Typhoeus.get(url, opts)
-
- # Handle response status code.
- raise "No response (within timeout: #{@time_out} second(s))" \
- if response.code.zero?
-
- response
+ # See https://rubydoc.info/gems/typhoeus for more info.
+ Typhoeus.get(url, opts)
  end

  # Returns a doc's internal HTML page links in absolute form; used when
- # crawling a site. Override this method in a subclass to change how a site
+ # crawling a site. Use the allow and disallow paths params to partially
+ # and selectively crawl a site.
+ #
+ # Override this method in a subclass to change how a site
  # is crawled; not what is extracted from each page (Document extensions
- # should be used for this purpose instead).
+ # should be used for this purpose instead). Just remember that only HTML
+ # files containing <a> links can keep the crawl going beyond the base URL.
  #
  # @param doc [Wgit::Document] The document from which to extract it's
  # internal page links.
+ # @param allow_paths [String, Array<String>] Filters links by selecting
+ # them only if their path includes one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters links by rejecting
+ # them if their path includes one of disallow_paths.
  # @return [Array<Wgit::Url>] The internal page links from doc.
- def get_internal_links(doc)
- doc.internal_absolute_links
- .map(&:without_anchor) # Because anchors don't change page content.
- .uniq
- .reject do |link|
+ def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
+ links = doc
+ .internal_absolute_links
+ .map(&:omit_fragment) # Because fragments don't alter content.
+ .uniq
+ .reject do |link|
  ext = link.to_extension
  ext ? !%w[htm html].include?(ext.downcase) : false
  end
+
+ return links if allow_paths.nil? && disallow_paths.nil?
+
+ process_paths(links, allow_paths, disallow_paths)
+ end
+
+ private
+
+ # Validate and filter by the given URL paths.
+ def process_paths(links, allow_paths, disallow_paths)
+ raise "You can't provide both allow_paths: and disallow_paths: params" \
+ if allow_paths && disallow_paths
+
+ if allow_paths # White list.
+ filter_method = :select
+ paths = allow_paths
+ else # Black list.
+ filter_method = :reject
+ paths = disallow_paths
+ end
+
+ paths = [paths] unless paths.is_a?(Array)
+ paths = paths
+ .compact
+ .reject(&:empty?)
+ .uniq
+ .map { |path| Wgit::Url.new(path).to_path }
+
+ raise 'The provided paths cannot be empty' if paths.empty?
+
+ filter_links_by_path(links, filter_method, paths)
+ end
+
+ # Filters links by selecting or rejecting them based on their path.
+ def filter_links_by_path(links, filter_method, paths)
+ links.send(filter_method) do |link|
+ link_path = link.to_path
+ next(false) unless link_path
+
+ match = false
+ paths.each do |path|
+ match = link_path.start_with?(path)
+ break if match
+ end
+
+ match
+ end
  end

  alias crawl crawl_urls
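
Because get_internal_links now accepts the same path filters, the crawl strategy can still be customised from a subclass. A sketch (the class name and path are illustrative):

  require 'wgit'

  # Change how a site is crawled (not what is extracted from each page) by
  # overriding the protected get_internal_links in a subclass.
  class BlogCrawler < Wgit::Crawler
    protected

    # Only follow internal links underneath the blog path.
    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
      super(doc, allow_paths: 'blog')
    end
  end

  BlogCrawler.new.crawl_site(Wgit::Url.new('http://example.com')) do |doc|
    puts doc.url
  end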
@@ -220,19 +220,20 @@ module Wgit
  # @param url [Wgit::Url] The Url to search the DB for.
  # @return [Boolean] True if url exists, otherwise false.
  def url?(url)
- h = { 'url' => url }
- @client[:urls].find(h).any?
+ assert_type(url, String) # This includes Wgit::Url's.
+ hash = { 'url' => url }
+ @client[:urls].find(hash).any?
  end

- # Returns whether or not a record with the given doc 'url' field (which is
- # unique) exists in the database's 'documents' collection.
+ # Returns whether or not a record with the given doc 'url.url' field
+ # (which is unique) exists in the database's 'documents' collection.
  #
  # @param doc [Wgit::Document] The Document to search the DB for.
  # @return [Boolean] True if doc exists, otherwise false.
  def doc?(doc)
- url = doc.respond_to?(:url) ? doc.url : doc
- h = { 'url' => url }
- @client[:documents].find(h).any?
+ assert_type(doc, Wgit::Document)
+ hash = { 'url.url' => doc.url }
+ @client[:documents].find(hash).any?
  end

  ### Update Data ###
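
The stricter type checks and the new 'url.url' selector are used along these lines (a sketch; the database connection setup is omitted and assumed to be configured elsewhere):

  require 'wgit'

  db  = Wgit::Database.new # Assumes connection details are already configured.
  url = Wgit::Url.new('http://example.com')

  db.url?(url) # => true/false - the param must be a String (Wgit::Url included).

  doc = Wgit::Document.new(url, '<p>Hello</p>')
  db.doc?(doc) # => true/false - now matched on the nested 'url.url' field.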
@@ -309,7 +310,7 @@ module Wgit
  # @return [Integer] The number of updated records.
  def update_doc(doc)
  assert_type(doc, Wgit::Document)
- selection = { url: doc.url }
+ selection = { 'url.url' => doc.url }
  doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
  update = { '$set' => doc_hash }
  mutate(true, :documents, selection, update)
@@ -26,7 +26,7 @@ module Wgit
  raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)

  model = doc.to_h(include_html: false, include_score: false)
- model['url'] = self.url(doc.url) # Expand Url String into full object.
+ model['url'] = url(doc.url) # Expand Url String into full object.

  Wgit::Utils.remove_non_bson_types(model)
  end
@@ -36,7 +36,7 @@ module Wgit
  # @return [Hash] Insertion fields common to all models.
  def self.common_insert_data
  {
- date_added: Wgit::Utils.time_stamp,
+ date_added: Wgit::Utils.time_stamp,
  date_modified: Wgit::Utils.time_stamp
  }
  end
data/lib/wgit/document.rb CHANGED
@@ -5,7 +5,8 @@ require 'nokogiri'
  require 'json'

  module Wgit
- # Class modeling a HTML web document. Also doubles as a search result when
+ # Class primarily modeling a HTML web document, although other MIME types
+ # will work e.g. images etc. Also doubles as a search result when
  # loading Documents from the database via Wgit::Database#search.
  #
  # The initialize method dynamically initializes instance variables from the
@@ -60,11 +61,11 @@ module Wgit
  # only used if url_or_obj is a String representing the web page's URL.
  # Otherwise, the HTML comes from the database object. A html of nil will
  # be defaulted to an empty String.
- def initialize(url_or_obj, html = '')
+ def initialize(url_or_obj, html = '', encode_html: true)
  if url_or_obj.is_a?(String)
- init_from_strings(url_or_obj, html)
+ init_from_strings(url_or_obj, html, encode_html: encode_html)
  else
- init_from_object(url_or_obj)
+ init_from_object(url_or_obj, encode_html: encode_html)
  end
  end

@@ -91,25 +92,28 @@ module Wgit
  # instance variables upon Document initialization. See the default
  # extensions defined in 'document_extensions.rb' as examples.
  #
- # Initialises a private instance variable with the xpath or database object
- # result(s). When initialising from HTML, a true singleton value will only
- # ever return one result otherwise all xpath results are returned in an
+ # Note that defined extensions work for both Documents initialized from
+ # HTML (via Wgit::Crawler methods) and from database objects.
+ # An extension once defined, initializes a private instance variable with
+ # the xpath or database object result(s).
+ #
+ # When initialising from HTML, a singleton value of true will only
+ # ever return one result; otherwise all xpath results are returned in an
  # Array. When initialising from a database object, the value is taken as
  # is and singleton is only used to define the default empty value.
  # If a value cannot be found (in either the HTML or database object), then
- # a default will be used. The default value is: singleton ? nil : [].
- #
- # Note that defined extensions work for both documents initialized from
- # the WWW (via Wgit::Crawler methods) and from database objects. This
- # effectively implements ORM like behavior using this class.
+ # a default will be used. The default value is: `singleton ? nil : []`.
  #
  # @param var [Symbol] The name of the variable to be initialised.
  # @param xpath [String, Object#call] The xpath used to find the element(s)
- # of the webpage. Pass a callable object (proc etc.) if you want the
+ # of the webpage. Only used when initializing from HTML.
+ #
+ # Pass a callable object (proc etc.) if you want the
  # xpath value to be derived on Document initialisation (instead of when
  # the extension is defined). The call method must return a valid xpath
  # String.
- # @param options [Hash] The options to define an extension with.
+ # @param options [Hash] The options to define an extension with. The
+ # options are only used when initializing from HTML, not the database.
  # @option options [Boolean] :singleton The singleton option determines
  # whether or not the result(s) should be in an Array. If multiple
  # results are found and singleton is true then the first result will be
@@ -117,16 +121,17 @@ module Wgit
  # @option options [Boolean] :text_content_only The text_content_only option
  # if true will use the text content of the Nokogiri result object,
  # otherwise the Nokogiri object itself is returned. Defaults to true.
- # @yield [value, source] Yields the value (Object) about to be assigned to
- # the new var and the source (Symbol) of the value (either :html or
- # :object). The return value of the block becomes the new var value,
- # unless nil. Return nil if you want to inspect but not change the var
- # value. The block gets executed when a Document is initialized from html
- # or an object e.g. database.
+ # @yield [value, source, type] Yields the value (Object) about to be
+ # assigned to the new var, the source of the value (Wgit::Document or DB
+ # Object) and the source type (Symbol of either :document or :object).
+ #
+ # The return value of the block becomes the new var value, unless nil.
+ # Return nil if you want to inspect but not change the var value. The
+ # block is executed when a Wgit::Document is initialized.
  # @raise [StandardError] If the var param isn't valid.
- # @return [Symbol] The first half of the newly defined method names e.g.
- # if var == "title" then :init_title is returned.
+ # @return [Symbol] The given var Symbol.
  def self.define_extension(var, xpath, options = {}, &block)
+ var = var.to_sym
  default_options = { singleton: true, text_content_only: true }
  options = default_options.merge(options)

@@ -149,7 +154,7 @@ module Wgit
  end
  Document.send :private, func_name

- "init_#{var}".to_sym
+ var
  end

  # Removes the init_* methods created when an extension is defined.
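
Under the new contract the block receives the value, its source object and a source type Symbol, and the method returns the var itself. A sketch (the var name and xpath are illustrative):

  require 'wgit'

  Wgit::Document.define_extension(
    :description,
    '//meta[@name="description"]/@content',
    singleton: true, text_content_only: true
  ) do |value, source, type|
    # source is the Wgit::Document or DB object; type is :document or :object.
    type == :document ? value&.strip : value
  end
  # => :description (previously :init_description was returned)

  doc = Wgit::Document.new(
    'http://example.com',
    '<html><head><meta name="description" content=" Hi there "></head></html>'
  )
  doc.description # => "Hi there"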
@@ -189,55 +194,48 @@ module Wgit
  @html[range]
  end

- # Returns the timestamp of when this Document was crawled.
- #
- # @return [Time] Time of when this Document was crawled.
- def date_crawled
- @url.date_crawled
- end
-
- # Returns the duration of the crawl for this Document (in seconds).
- #
- # @return [Float] The duration of the crawl for this Document.
- def crawl_duration
- @url.crawl_duration
- end
-
  # Returns the base URL of this Wgit::Document. The base URL is either the
  # <base> element's href value or @url (if @base is nil). If @base is
  # present and relative, then @url.to_base + @base is returned. This method
  # should be used instead of `doc.url.to_base` etc. when manually building
- # absolute links from relative links.
+ # absolute links from relative links; or use `link.prefix_base(doc)`.
  #
  # Provide the `link:` parameter to get the correct base URL for that type
  # of link. For example, a link of `#top` would always return @url because
  # it applies to that page, not a different one. Query strings work in the
  # same way. Use this parameter if manually concatting Url's e.g.
  #
- # relative_link = Wgit::Url.new '?q=hello'
+ # relative_link = Wgit::Url.new('?q=hello')
  # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
  #
  # This is similar to how Wgit::Document#internal_absolute_links works.
  #
  # @param link [Wgit::Url, String] The link to obtain the correct base URL
- # for.
+ # for; must be relative, not absolute.
+ # @raise [StandardError] If link is relative or if a base URL can't be
+ # established e.g. the doc @url is relative and <base> is nil.
  # @return [Wgit::Url] The base URL of this Document e.g.
  # 'http://example.com/public'.
  def base_url(link: nil)
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
+ if @url.relative? && @base.nil?
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
+ if @url.relative? && @base&.relative?
+
  get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }

  if link
  link = Wgit::Url.new(link)
  raise "link must be relative: #{link}" unless link.relative?

- if link.is_anchor? || link.is_query?
+ if link.is_fragment? || link.is_query?
  base_url = @base ? get_base.call : @url
- return base_url.without_anchor.without_query
+ return base_url.omit_fragment.omit_query
  end
  end

- base_url = @base ? get_base.call : @url.base
- base_url.without_anchor.without_query
+ base_url = @base ? get_base.call : @url.to_base
+ base_url.omit_fragment.omit_query
  end

  # Returns a Hash containing this Document's instance vars.
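
With the added validation, base URL resolution behaves roughly as follows (a sketch; the URLs are illustrative):

  html = "<html><head><base href='/public'></head><body></body></html>"
  doc  = Wgit::Document.new('http://example.com/home/index.html', html)

  doc.base_url # => "http://example.com/public"

  # A relative @url with no <base> element now raises instead of misbehaving.
  Wgit::Document.new('/home', '<p>Hi</p>').base_url # raises StandardError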
@@ -340,7 +338,7 @@ module Wgit

  links = @links
  .select { |link| link.relative?(host: @url.to_base) }
- .map(&:without_base)
+ .map(&:omit_base)
  .map do |link| # Map @url.to_host into / as it's a duplicate.
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
  end
@@ -354,7 +352,7 @@ module Wgit
  #
  # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
  def internal_absolute_links
- internal_links.map { |link| base_url(link: link).concat(link) }
+ internal_links.map { |link| link.prefix_base(self) }
  end

  # Returns all external links from this Document in absolute form. External
@@ -366,7 +364,7 @@ module Wgit

  links = @links
  .reject { |link| link.relative?(host: @url.to_base) }
- .map(&:without_trailing_slash)
+ .map(&:omit_trailing_slash)

  Wgit::Utils.process_arr(links)
  end
@@ -438,7 +436,7 @@ module Wgit
  orig_text = @text
  @text = search(
  query, case_sensitive: case_sensitive,
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
+ whole_sentence: whole_sentence, sentence_limit: sentence_limit
  )

  orig_text
@@ -473,7 +471,7 @@ module Wgit
  # @yield [value, source] Given the value (String/Object) before it's set as
  # an instance variable so that you can inspect/alter the value if
  # desired. Return nil from the block if you don't want to override the
- # value. Also given the source (Symbol) which is always :html.
+ # value. Also given the source (Symbol) which is always :document.
  # @return [String, Object] The value found in the html or the default value
  # (singleton ? nil : []).
  def find_in_html(xpath, singleton: true, text_content_only: true)
@@ -492,7 +490,7 @@ module Wgit
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)

  if block_given?
- new_result = yield(result, :html)
+ new_result = yield(result, self, :document)
  result = new_result unless new_result.nil?
  end

@@ -519,7 +517,7 @@ module Wgit
  singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)

  if block_given?
- new_result = yield(result, :object)
+ new_result = yield(result, obj, :object)
  result = new_result unless new_result.nil?
  end

@@ -529,19 +527,19 @@ module Wgit
  private

  # Initialise the Document from URL and HTML Strings.
- def init_from_strings(url, html)
+ def init_from_strings(url, html, encode_html: true)
  assert_types(html, [String, NilClass])

  # We already know url.is_a?(String) so parse into Url unless already so.
  url = Wgit::Url.parse(url)
- url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+ url.crawled = true unless url.crawled? # Avoid overriding date_crawled.

  @url = url
  @html = html || ''
  @doc = init_nokogiri
  @score = 0.0

- process_url_and_html
+ Wgit::Utils.process_str(@html, encode: encode_html)

  # Dynamically run the init_*_from_html methods.
  Document.private_instance_methods(false).each do |method|
@@ -554,7 +552,7 @@ module Wgit

  # Initialise the Document from a Hash like Object containing Strings as
  # keys e.g. database collection object or Hash.
- def init_from_object(obj)
+ def init_from_object(obj, encode_html: true)
  assert_respond_to(obj, :fetch)

  @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -562,7 +560,7 @@ module Wgit
  @doc = init_nokogiri
  @score = obj.fetch('score', 0.0)

- process_url_and_html
+ Wgit::Utils.process_str(@html, encode: encode_html)

  # Dynamically run the init_*_from_object methods.
  Document.private_instance_methods(false).each do |method|
@@ -573,12 +571,6 @@ module Wgit
  end
  end

- # Ensure the @url and @html Strings are correctly encoded etc.
- def process_url_and_html
- @url = Wgit::Utils.process_str(@url)
- @html = Wgit::Utils.process_str(@html)
- end
-
  # Initialises an instance variable and defines a getter method for it.
  #
  # @param var [Symbol] The name of the variable to be initialized.
@@ -597,6 +589,7 @@ module Wgit
  end
  end

+ alias content html
  alias statistics stats
  alias internal_urls internal_links
  alias internal_absolute_urls internal_absolute_links
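
The new alias means the raw markup can be read via either name (a tiny sketch):

  doc = Wgit::Document.new('http://example.com', '<p>Hello</p>')
  doc.html    # => "<p>Hello</p>"
  doc.content # => the same String, via the new alias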