wgit 0.10.8 → 0.12.0

data/lib/wgit/crawler.rb CHANGED
@@ -5,7 +5,6 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
- require 'set'
  require 'benchmark'
  require 'typhoeus'
  require 'ferrum'
@@ -55,6 +54,11 @@ module Wgit
  # The value should balance between a good UX and enough JS parse time.
  attr_accessor :parse_javascript_delay

+ # The opts Hash passed directly to the ferrum Chrome browser when
+ # `parse_javascript: true`.
+ # See https://github.com/rubycdp/ferrum for details.
+ attr_accessor :ferrum_opts
+
  # The Wgit::Response of the most recently crawled URL.
  attr_reader :last_response

@@ -70,13 +74,31 @@ module Wgit
  # @param parse_javascript [Boolean] Whether or not to parse the Javascript
  # of the crawled document. Parsing requires Chrome/Chromium to be
  # installed and in $PATH.
+ # @param parse_javascript_delay [Integer] The delay time given to a page's
+ # JS to update the DOM. After the delay, the HTML is crawled.
  def initialize(redirect_limit: 5, timeout: 5, encode: true,
- parse_javascript: false, parse_javascript_delay: 1)
+ parse_javascript: false, parse_javascript_delay: 1,
+ ferrum_opts: {})
+ assert_type(redirect_limit, Integer)
+ assert_type(timeout, [Integer, Float])
+ assert_type(encode, [TrueClass, FalseClass])
+ assert_type(parse_javascript, [TrueClass, FalseClass])
+ assert_type(parse_javascript_delay, Integer)
+ assert_type(ferrum_opts, Hash)
+
  @redirect_limit = redirect_limit
  @timeout = timeout
  @encode = encode
  @parse_javascript = parse_javascript
  @parse_javascript_delay = parse_javascript_delay
+ @ferrum_opts = default_ferrum_opts.merge(ferrum_opts)
+ end
+
+ # Overrides String#inspect to shorten the printed output of a Crawler.
+ #
+ # @return [String] A short textual representation of this Crawler.
+ def inspect
+ "#<Wgit::Crawler timeout=#{@timeout} redirect_limit=#{@redirect_limit} encode=#{@encode} parse_javascript=#{@parse_javascript} parse_javascript_delay=#{@parse_javascript_delay} ferrum_opts=#{@ferrum_opts}>"
  end

  # Crawls an entire website's HTML pages by recursively going through
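For reference, a brief, hedged usage sketch of the new `ferrum_opts:` keyword and `#inspect` override added above; the option values are illustrative, not recommended defaults:

```ruby
require 'wgit'

# Pass custom options through to the Ferrum Chrome browser used when
# parse_javascript: is enabled. The Hash is merged over the crawler's
# default_ferrum_opts (timeout, process_timeout: 10, headless: true).
crawler = Wgit::Crawler.new(
  parse_javascript: true,
  parse_javascript_delay: 2,
  ferrum_opts: { headless: false, process_timeout: 30 }
)

puts crawler.inspect # Shortened summary via the new #inspect override.
```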
@@ -86,8 +108,6 @@ module Wgit
  #
  # Use the allow and disallow paths params to partially and selectively
  # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
- # Note that each path must NOT start with a slash; the only exception being
- # a `/` on its own with no other characters, referring to the index page.
  #
  # Only redirects to the same host are followed. For example, the Url
  # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -104,6 +124,7 @@ module Wgit
  # the crawl. This changes how a site is crawled. Only links pointing to
  # the site domain are allowed. The `:default` is any `<a>` href returning
  # HTML.
+ # @param max_pages [Integer]
  # @param allow_paths [String, Array<String>] Filters the `follow:` links by
  # selecting them if their path `File.fnmatch?` one of allow_paths.
  # @param disallow_paths [String, Array<String>] Filters the `follow` links
@@ -115,40 +136,48 @@ module Wgit
  # from all of the site's pages or nil if the given url could not be
  # crawled successfully.
  def crawl_site(
- url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+ url, follow: :default, max_pages: nil,
+ allow_paths: nil, disallow_paths: nil, &block
  )
  doc = crawl_url(url, &block)
- return nil if doc.nil?
+ return nil if doc.empty?

- link_opts = {
- xpath: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
- alt_url = url.end_with?('/') ? url.chop : url + '/'
+ total_pages = 1
+ limit_reached = max_pages && total_pages >= max_pages
+ link_opts = { xpath: follow, allow_paths:, disallow_paths: }

- crawled = Set.new([url, alt_url])
+ crawled = Set.new(url.redirects_journey)
  externals = Set.new(doc.external_links)
  internals = Set.new(next_internal_links(doc, **link_opts))

  return externals.to_a if internals.empty?

  loop do
- links = internals - crawled
+ if limit_reached
+ Wgit.logger.debug("Crawled and reached the max_pages limit of: #{max_pages}")
+ break
+ end
+
+ links = subtract_links(internals, crawled)
  break if links.empty?

  links.each do |link|
- orig_link = link.dup
+ limit_reached = max_pages && total_pages >= max_pages
+ break if limit_reached
+
  doc = crawl_url(link, follow_redirects: :host, &block)

- crawled += [orig_link, link] # Push both links in case of redirects.
- next if doc.nil?
+ crawled += link.redirects_journey
+ next if doc.empty?

- internals += next_internal_links(doc, **link_opts)
- externals += doc.external_links
+ total_pages += 1
+ internals += next_internal_links(doc, **link_opts)
+ externals += doc.external_links
  end
  end

+ Wgit.logger.debug("Crawled #{total_pages} documents for the site: #{url}")
+
  externals.to_a
  end

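As a rough illustration of the new `max_pages:` keyword (the URL, limit and glob below are placeholders):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://example.com')

# Stop after 50 crawled pages and only follow links whose path matches
# the 'blog/*' glob (allow_paths uses File.fnmatch? as documented above).
externals = crawler.crawl_site(url, max_pages: 50, allow_paths: 'blog/*') do |doc|
  puts doc.url unless doc.empty?
end

# crawl_site returns the external links collected from the crawled pages.
puts externals&.size
```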
@@ -169,7 +198,7 @@ module Wgit
  def crawl_urls(*urls, follow_redirects: true, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = { follow_redirects: follow_redirects }
+ opts = { follow_redirects: }
  doc = nil

  Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
@@ -189,19 +218,19 @@ module Wgit
  # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
  # crawl was successful or not. Therefore, Document#url etc. can be used.
  # Use `doc.empty?` to determine if the page is valid.
- # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
- # crawl was unsuccessful.
+ # @return [Wgit::Document] The crawled HTML Document. Check if the crawl
+ # was successful with doc.empty? (true if unsuccessful).
  def crawl_url(url, follow_redirects: true)
  # A String url isn't allowed because it's passed by value not reference,
  # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
  assert_type(url, Wgit::Url)

- html = fetch(url, follow_redirects: follow_redirects)
+ html = fetch(url, follow_redirects:)
  doc = Wgit::Document.new(url, html, encode: @encode)

  yield(doc) if block_given?

- doc.empty? ? nil : doc
+ doc
  end

  protected
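Because `crawl_url` now always returns a `Wgit::Document`, callers check `Document#empty?` rather than testing for nil; a minimal sketch (the URL is a placeholder):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
doc = crawler.crawl_url(Wgit::Url.new('https://example.com'))

if doc.empty?
  puts 'Crawl failed (no HTML returned)'
else
  puts doc.title # Extracted by the default :title extractor.
end
```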
@@ -226,7 +255,7 @@ module Wgit
  response = Wgit::Response.new
  raise "Invalid url: #{url}" if url.invalid?

- resolve(url, response, follow_redirects: follow_redirects)
+ resolve(url, response, follow_redirects:)
  get_browser_response(url, response) if @parse_javascript

  response.body_or_nil
@@ -238,6 +267,9 @@ module Wgit
  url.crawled = true # Sets date_crawled underneath.
  url.crawl_duration = response.total_time

+ # Don't override previous url.redirects if response is fully resolved.
+ url.redirects = response.redirects unless response.redirects.empty?
+
  @last_response = response
  end

@@ -253,7 +285,7 @@ module Wgit
  # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @raise [StandardError] If a redirect isn't allowed etc.
  def resolve(url, response, follow_redirects: true)
- origin = url.to_url.to_origin # Recorded before any redirects.
+ origin = url.to_origin # Record the origin before any redirects.
  follow_redirects, within = redirect?(follow_redirects)

  loop do
@@ -277,7 +309,7 @@ module Wgit
  if response.redirect_count >= @redirect_limit

  # Process the location to be crawled next.
- location = url.to_origin.concat(location) if location.relative?
+ location = url.to_origin.join(location) if location.relative?
  response.redirections[url.to_s] = location.to_s
  url.replace(location) # Update the url on redirect.
  end
@@ -370,7 +402,7 @@ module Wgit
  # @param url [String] The url to browse to.
  # @return [Ferrum::Browser] The browser response object.
  def browser_get(url)
- @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+ @browser ||= Ferrum::Browser.new(**@ferrum_opts)
  @browser.goto(url)

  # Wait for the page's JS to finish dynamically manipulating the DOM.
@@ -420,6 +452,38 @@ module Wgit

  private

+ # The default opts which are merged with the user's ferrum_opts: and then
+ # passed directly to the ferrum Chrome browser.
+ def default_ferrum_opts
+ {
+ timeout: @timeout,
+ process_timeout: 10,
+ headless: true
+ }
+ end
+
+ # Manually does the following: `links = internals - crawled`.
+ # This is needed due to an apparent bug in Set<Url> (when upgrading from
+ # Ruby v3.0.2 to v3.3.0) causing an infinite crawl loop in #crawl_site.
+ # Run in a shell to test: bundle exec toys test infinite_crawl_loop
+ # TODO: Check in future Ruby versions and remove this method when fixed.
+ def subtract_links(internals, crawled)
+ links = Set.new
+
+ internals.each do |internal_url|
+ already_crawled = false
+
+ crawled.each do |crawled_url|
+ already_crawled = internal_url == crawled_url
+ break if already_crawled
+ end
+
+ links.add(internal_url) unless already_crawled
+ end
+
+ links
+ end
+
  # Returns the next links used to continue crawling a site. The xpath value
  # is used to obtain the links. Any valid URL Strings will be converted into
  # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
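A quick sketch of how the defaults above combine with a user supplied `ferrum_opts:` Hash (the values shown are illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new(timeout: 10, ferrum_opts: { headless: false })

crawler.ferrum_opts
# => { timeout: 10, process_timeout: 10, headless: false }
```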
@@ -431,7 +495,8 @@ module Wgit
  .compact
  end

- if links.any? { |link| link.to_domain != doc.url.to_domain }
+ doc_domain = doc.url.to_domain
+ if links.any? { |link| link.to_domain != doc_domain }
  raise 'The links to follow must be within the site domain'
  end

@@ -458,12 +523,12 @@ module Wgit

  # Validate and filter by the given URL paths.
  def process_paths(links, allow_paths, disallow_paths)
- if allow_paths
+ if allow_paths && !allow_paths.empty?
  paths = validate_paths(allow_paths)
  filter_links(links, :select!, paths)
  end

- if disallow_paths
+ if disallow_paths && !disallow_paths.empty?
  paths = validate_paths(disallow_paths)
  filter_links(links, :reject!, paths)
  end
@@ -477,7 +542,7 @@ module Wgit
  raise 'The provided paths must all be Strings' \
  unless paths.all? { |path| path.is_a?(String) }

- Wgit::Utils.sanitize(paths, encode: false)
+ paths = Wgit::Utils.sanitize(paths, encode: false)
  raise 'The provided paths cannot be empty' if paths.empty?

  paths.map do |path|
@@ -491,7 +556,7 @@ module Wgit
  def filter_links(links, filter_method, paths)
  links.send(filter_method) do |link|
  # Turn http://example.com into / meaning index.
- link = link.to_endpoint.index? ? '/' : link.omit_base
+ link = link.to_endpoint.index? ? '/' : link.omit_base.omit_trailing_slash

  match = false
  paths.each do |pattern|
@@ -532,9 +597,9 @@ module Wgit
  )
  end

- alias crawl crawl_urls
- alias crawl_pages crawl_urls
- alias crawl_page crawl_url
- alias crawl_r crawl_site
+ alias_method :crawl, :crawl_urls
+ alias_method :crawl_pages, :crawl_urls
+ alias_method :crawl_page, :crawl_url
+ alias_method :crawl_r, :crawl_site
  end
  end
data/lib/wgit/database/adapters/in_memory.rb ADDED
@@ -0,0 +1,204 @@
+ require_relative "../../utils"
+ require_relative "../../url"
+ require_relative "../../document"
+ require_relative "../../model"
+ require_relative "../database_adapter"
+
+ module Wgit::Database
+ # Database implementer class for in-memory (RAM) storage. This DB is mainly used
+ # for testing and experimenting with. This DB is thread safe.
+ class InMemory < DatabaseAdapter
+ # Initializes a thread safe InMemory Database instance.
+ #
+ # @param connection_string [String] Not used but needed to adhere to the
+ # DatabaseAdapter interface.
+ def initialize(connection_string = nil)
+ # Inits @urls and @docs vars.
+ initialize_store
+
+ super
+ end
+
+ # Overrides String#inspect to display collection sizes.
+ #
+ # @return [String] A short textual representation of this object.
+ def inspect
+ "#<Wgit::Database::InMemory num_urls=#{@urls.size} \
+ num_docs=#{@docs.size} size=#{size}>"
+ end
+
+ # The Wgit::Url's collection stored as an in-memory Concurrent::Array.
+ def urls(&block)
+ map_urls(@urls, &block)
+ end
+
+ # The Wgit::Document's collection stored as an in-memory Concurrent::Array.
+ def docs(&block)
+ map_documents(@docs, &block)
+ end
+
+ # The raw url Hashes, not mapped into their corresponding Wgit objects.
+ def url_hashes
+ @urls
+ end
+
+ # The raw doc Hashes, not mapped into their corresponding Wgit objects.
+ def doc_hashes
+ @docs
+ end
+
+ # Returns the current size of the in-memory database.
+ # An empty database will return a size of 4 because there are 4 bytes in
+ # two empty arrays (urls and docs collections).
+ #
+ # @return [Integer] The current size of the in-memory DB.
+ def size
+ @urls.to_s.size + @docs.to_s.size
+ end
+
+ # Searches the database's Document#text for the given query. The returned
+ # Documents are sorted for relevance, starting with the most relevant. Each
+ # Document's #score value will be set accordingly.
+ #
+ # @param query [Regexp, #to_s] The regex or text value to search each
+ # document's @text for.
+ # @param case_sensitive [Boolean] Whether character case must match.
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
+ # for separately.
+ # @param limit [Integer] The max number of results to return.
+ # @param skip [Integer] The number of results to skip.
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
+ # DB.
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
+ def search(
+ query, case_sensitive: false, whole_sentence: true,
+ limit: 10, skip: 0, &block
+ )
+ regex = Wgit::Utils.build_search_regex(
+ query, case_sensitive:, whole_sentence:)
+
+ # Search the Wgit::Document's, not the raw Hashes.
+ results = docs.select do |doc|
+ score = 0
+ doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash|
+ score = results_hash.values.sum
+ end
+ next false if score.zero?
+
+ doc.instance_variable_set :@score, score
+ true
+ end
+
+ return [] if results.empty?
+
+ results = results.sort_by { |doc| -doc.score }
+
+ results = results[skip..]
+ return [] unless results
+
+ results = results[0...limit] if limit.positive?
+ results.each(&block) if block_given?
+
+ results
+ end
+
+ # Deletes everything in the urls and documents collections.
+ #
+ # @return [Integer] The number of deleted records.
+ def empty
+ previous_size = @urls.size + @docs.size
+ initialize_store
+
+ previous_size
+ end
+
+ # Returns Url records that haven't yet been crawled.
+ #
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
+ # @param skip [Integer] Skip n amount of Url's.
+ # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
+ def uncrawled_urls(limit: 0, skip: 0, &block)
+ uncrawled = @urls.reject { |url| url["crawled"] }
+ uncrawled = uncrawled[skip..]
+ return [] unless uncrawled
+
+ uncrawled = uncrawled[0...limit] if limit.positive?
+ map_urls(uncrawled, &block)
+ end
+
+ # Inserts or updates the object in the in-memory database.
+ #
+ # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
+ # @return [Boolean] True if inserted, false if updated.
+ def upsert(obj)
+ collection, index, model = get_model_info(obj)
+
+ if index
+ collection[index] = model
+ false
+ else
+ collection << model
+ true
+ end
+ end
+
+ # Bulk upserts the objects in the in-memory database collection.
+ # You cannot mix collection objs types, all must be Urls or Documents.
+ #
+ # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
+ # inserted/updated.
+ # @return [Integer] The total number of newly inserted objects.
+ def bulk_upsert(objs)
+ assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])
+
+ objs.reduce(0) do |inserted, obj|
+ inserted += 1 if upsert(obj)
+ inserted
+ end
+ end
+
+ private
+
+ # Creates a new Concurrent::Array for each collection.
+ def initialize_store
+ @urls = Concurrent::Array.new
+ @docs = Concurrent::Array.new
+ end
+
+ # Get the database's model info (collection type, index, model) for
+ # obj.
+ #
+ # Use like:
+ # ```
+ # collection, index, model = get_model_info(obj)
+ # ```
+ #
+ # Raises an error if obj isn't a Wgit::Url or Wgit::Document.
+ #
+ # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
+ # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
+ # @return [Array<Symbol, Hash>] The collection type, the obj's index (if in
+ # the collection, nil otherwise) and the Wgit::Model of obj.
+ def get_model_info(obj)
+ obj = obj.dup
+
+ case obj
+ when Wgit::Url
+ key = obj.to_s
+ collection = @urls
+ index = @urls.index { |url| url["url"] == key }
+ model = build_model(obj)
+ when Wgit::Document
+ key = obj.url.to_s
+ collection = @docs
+ index = @docs.index { |doc| doc["url"]&.[]("url") == key }
+ model = build_model(obj)
+ else
+ raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
+ end
+
+ [collection, index, model]
+ end
+ end
+ end
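To round off the new adapter, a hedged end-to-end sketch; the URL and query are placeholders and the crawl-then-search flow is illustrative, assuming `require 'wgit'` loads the InMemory adapter:

```ruby
require 'wgit'

db      = Wgit::Database::InMemory.new
crawler = Wgit::Crawler.new

# Store each successfully crawled page in the in-memory collections.
crawler.crawl_url(Wgit::Url.new('https://example.com')) do |doc|
  db.upsert(doc) unless doc.empty?
end

# Results come back sorted by #score, most relevant first.
db.search('example', limit: 5) do |doc|
  puts "#{doc.score} - #{doc.url}"
end

puts db.size # Rough byte size of the urls and docs collections.
```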