arachnid2 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: '0299b63b16c0e19acd87ff8e8b34302f015a8f36ea87855faedbbe3c38d0f080'
- data.tar.gz: 5da1c60fe38252b7699a5ecd2ead756fd01e2e13918ef25923c5b44ab039617c
+ metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+ data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
  SHA512:
- metadata.gz: 6525470d1ec273fb06421f0a092307527786697fa5bb698c8f2fb5f788ef9b6c220bb09204656cec807e7f8816c65684b061afa0808fbc49317abee19a9b8d4b
- data.tar.gz: 8f7bc29d3e5129da2e04c5bc32f44a0025244ba160581c2a073e9b61cc46baa3685f3481e979743556342bf4b2791767eb0c21b42558f62b2443645f592be677
+ metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+ data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
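The updated digests above can be checked against a locally unpacked copy of the gem. The following is a minimal sketch only: the filename `arachnid2-0.4.0.gem` and the unpacking step are assumptions, and note that these digests cover the gem's inner `metadata.gz` and `data.tar.gz` archives, not the outer `.gem` file.

```ruby
require 'digest'

# Assumes the gem has been unpacked first, e.g. `tar -xf arachnid2-0.4.0.gem`,
# which yields metadata.gz, data.tar.gz and checksums.yaml.gz in the current directory.
expected = {
  'metadata.gz' => '6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d',
  'data.tar.gz' => '501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d'
}

expected.each do |file, sha256|
  actual = Digest::SHA256.file(file).hexdigest
  puts "#{file}: #{actual == sha256 ? 'OK' : 'MISMATCH'}"
end
```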
data/README.md CHANGED
@@ -186,6 +186,36 @@ with_watir = true
  Arachnid2.new(url).crawl(opts, with_watir)
  ```
 
+ Arachnid2 has base defaults which you might want to address when
+ employing Watir.
+
+ * First, the default crawl time is 15 seconds.
+ As browser page loads can take this long, you will probably want to
+ set a higher crawl time.
+ * Simply storing the browser is not a great idea, since it will
+ be inaccessible after it is closed. Instead, consider nabbing the
+ HTML, cookies, or whatever content is required during the crawl.
+ * Finally, note that Firefox is the default browser.
+
+
+ ```ruby
+ require 'arachnid2'
+
+ with_watir = true
+ responses = []
+ url = "http://maximumfun.org"
+ max = 60
+ browser = :chrome
+ opts = {time_box: max, browser_type: browser}
+
+ spider = Arachnid2.new(url)
+ spider.crawl(opts, with_watir) do |response|
+ response.body.wait_until(&:present?)
+ responses << response.body.html if response.body.present?
+ end
+
+ ```
+
  #### Options
 
  See the Typhoeus options above &mdash; most apply to Watir as well, with
data/lib/arachnid2.rb CHANGED
@@ -1,5 +1,5 @@
  require "arachnid2/version"
- require "arachnid2/cached_arachnid_responses"
+ require "arachnid2/cached_responses"
  require "arachnid2/exoskeleton"
  require "arachnid2/typhoeus"
  require "arachnid2/watir"
data/lib/arachnid2/cached_arachnid_responses.rb → data/lib/arachnid2/cached_responses.rb RENAMED
@@ -1,6 +1,6 @@
  require 'net/http'
  require 'json'
- module CachedArachnidResponses
+ module CachedResponses
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
  def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
  body = ::JSON.parse(response.body)
  responses_list = Base64.decode64(body['encrypted_response'])
- return Marshal.load responses_list # here we get array of Typhoeus::Response
+ return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
  end
  rescue StandardError
  nil
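For context on the renamed module: `load_data` above fetches a cached crawl from the service named by `ARACHNID_CACHED_SERVICE_ADDRESS` and unpacks the body. Below is a minimal sketch of that unpacking step only, using a hypothetical `payload` variable in place of the real HTTP response; the `encrypted_response` field name and the Base64 + `Marshal` round-trip are taken from the hunk above.

```ruby
require 'json'
require 'base64'

# Hypothetical cached payload, shaped like the service response in the diff:
# a JSON object whose 'encrypted_response' field holds a Base64-encoded,
# Marshal-dumped Array of responses (empty here for the sake of the sketch).
payload = { 'encrypted_response' => Base64.encode64(Marshal.dump([])) }.to_json

body           = JSON.parse(payload)
responses_list = Base64.decode64(body['encrypted_response'])
responses      = Marshal.load(responses_list) # an Array (of Typhoeus::Response in the gem)

puts responses.inspect # => []
```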
data/lib/arachnid2/exoskeleton.rb CHANGED
@@ -10,7 +10,7 @@ class Arachnid2
  end
 
  def process(url, html)
- return false unless Adomain["#{url}"].include? @domain
+ return false unless Adomain["#{url}"]&.include? @domain
 
  extract_hrefs(html)
  end
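The change above switches to the safe navigation operator, so a domain lookup that returns `nil` no longer raises `NoMethodError` inside `process`. A standalone sketch of the same guard, using a hypothetical `domain_of` helper in place of the Adomain gem:

```ruby
# Stand-in for Adomain["..."]: returns the host portion of a URL, or nil
# when the input cannot be parsed into one.
def domain_of(url)
  url[%r{https?://([^/]+)}, 1]
end

target = "example.com"

["https://example.com/page", "not a url"].each do |url|
  # With plain `.include?` the nil case raises; `&.` returns nil instead,
  # so the `unless` guard simply treats the URL as off-domain.
  on_domain = domain_of(url)&.include?(target)
  puts "#{url.inspect} -> #{on_domain.inspect}"
end
```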
data/lib/arachnid2/typhoeus.rb CHANGED
@@ -1,6 +1,6 @@
  class Arachnid2
  class Typhoeus
- include CachedArachnidResponses
+ include CachedResponses
  include Arachnid2::Exoskeleton
 
  def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
  max_concurrency.times do
  q = @global_queue.shift
 
- break if @global_visited.size >= crawl_options[:max_urls] || \
- Time.now > crawl_options[:time_limit] || \
- memory_danger?
-
+ break if time_to_stop?
  @global_visited.insert(q)
 
- request = ::Typhoeus::Request.new(q, request_options)
-
- data = load_data(@url, opts)
- data.each { |response| yield response } and return unless data.nil?
-
- request.on_complete do |response|
- @cached_data.push(response)
- links = process(response.effective_url, response.body)
- next unless links
-
- yield response
-
- vacuum(links, response.effective_url)
- end
+ found_in_cache = use_cache(q, opts, &Proc.new)
+ return if found_in_cache
 
- @hydra.queue(request)
+ request = ::Typhoeus::Request.new(q, request_options)
+ requestable = after_request(request, &Proc.new)
+ @hydra.queue(request) if requestable
  end # max_concurrency.times do
 
  @hydra.run
-
  end # until @global_queue.empty?
- put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
  ensure
  @cookie_file.close! if @cookie_file
  end # def crawl(opts = {})
 
  private
+ def after_request(request)
+ request.on_complete do |response|
+ cacheable = use_response(response, &Proc.new)
+ return unless cacheable
+
+ put_cached_data(response.effective_url, @options, response)
+ end
+
+ true
+ end
+
+ def use_response(response)
+ links = process(response.effective_url, response.body)
+ return unless links
+
+ yield response
+
+ vacuum(links, response.effective_url)
+ true
+ end
+
+ def use_cache(url, options)
+ data = load_data(url, options)
+ use_response(data, &Proc.new) if data
+
+ data
+ end
+
+ def time_to_stop?
+ @global_visited.size >= crawl_options[:max_urls] || \
+ Time.now > crawl_options[:time_limit] || \
+ memory_danger?
+ end
+
  def typhoeus_preflight
  @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
  typhoeus_proxy_options
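The refactored `crawl` above hands the caller's block down to the new private helpers (`use_cache`, `after_request`, `use_response`) via `&Proc.new`. As a reminder of how that pattern behaves, here is a self-contained sketch unrelated to the gem's classes; note that `Proc.new` without an explicit block was deprecated in Ruby 2.7 and removed in 3.0, so this reflects the Rubies current when 0.4.0 shipped.

```ruby
class Crawler
  # Public entry point: the caller's block is not a named parameter here,
  # yet it can still be forwarded to a private helper.
  def crawl
    [1, 2, 3].each do |item|
      # Inside a method, Proc.new with no block captures the block that was
      # passed to that method, so `&Proc.new` re-passes it to the helper.
      handle(item, &Proc.new)
    end
  end

  private

  def handle(item)
    yield item * 10
  end
end

Crawler.new.crawl { |n| puts n } # prints 10, 20, 30
```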
data/lib/arachnid2/version.rb CHANGED
@@ -1,3 +1,3 @@
  class Arachnid2
- VERSION = "0.3.9"
+ VERSION = "0.4.0"
  end
data/lib/arachnid2/watir.rb CHANGED
@@ -19,44 +19,71 @@ class Arachnid2
  q = @global_queue.shift
  links = nil
 
- break if @global_visited.size >= crawl_options[:max_urls]
- break if Time.now > crawl_options[:time_limit]
- break if memory_danger?
+ break if time_to_stop?
 
  @global_visited.insert(q)
 
+ make_request(q, &Proc.new)
+ end # until @global_queue.empty?
+ ensure
+ @browser.close if @browser rescue nil
+ @headless.destroy if @headless rescue nil
+ end
+
+ private
+ def make_request(q)
  begin
- begin
- browser.goto q
- rescue Selenium::WebDriver::Error::UnknownError => e
- # Firefox and Selenium, in their infinite wisdom
- # raise an error when a page cannot be loaded.
- # At the time of writing this, the page at
- # thewirecutter.com/cars/accessories-auto
- # causes such an issue (too many redirects).
- # This error handling moves us on from those pages.
- raise e unless e.message =~ /.*Reached error page.*/i
- next
- end
- links = process(browser.url, browser.body.html) if browser.body.exists?
- next unless links
-
- yield browser
+ links = browse_links(q, &Proc.new)
+ return unless links
 
  vacuum(links, browser.url)
  rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+ "is ignoring an error: " \
+ "#{e.class} - #{e.message}"
+ puts msg
  rescue => e
  raise e if raise_before_retry?(e.class)
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+ "is retrying once after an error: " \
+ "#{e.class} - #{e.message}"
+ puts msg
+ e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
  reset_for_retry
  end
+ end
 
- end # until @global_queue.empty?
- ensure
- @browser.close if @browser rescue nil
- @headless.destroy if @headless rescue nil
- end
+ def browse_links(url)
+ return unless navigate(url)
+
+ yield browser
+
+ process(browser.url, browser.body.html) if browser.body.exists?
+ end
+
+ def navigate(url)
+ begin
+ browser.goto url
+ rescue Selenium::WebDriver::Error::UnknownError => e
+ # Firefox and Selenium, in their infinite wisdom
+ # raise an error when a page cannot be loaded.
+ # At the time of writing this, the page at
+ # thewirecutter.com/cars/accessories-auto
+ # causes such an issue (too many redirects).
+ # This error handling moves us on from those pages.
+ raise e unless e.message =~ /.*Reached error page.*/i
+ return
+ end
+
+ true
+ end
+
+ def time_to_stop?
+ @global_visited.size >= crawl_options[:max_urls] || \
+ Time.now > crawl_options[:time_limit] || \
+ memory_danger?
+ end
 
- private
  def raise_before_retry?(klass)
  @already_retried || \
  "#{klass}".include?("Selenium") || \
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: arachnid2
  version: !ruby/object:Gem::Version
- version: 0.3.9
+ version: 0.4.0
  platform: ruby
  authors:
  - Sam Nissen
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-03-05 00:00:00.000000000 Z
+ date: 2020-07-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -184,7 +184,7 @@ files:
  - bin/console
  - bin/setup
  - lib/arachnid2.rb
- - lib/arachnid2/cached_arachnid_responses.rb
+ - lib/arachnid2/cached_responses.rb
  - lib/arachnid2/exoskeleton.rb
  - lib/arachnid2/typhoeus.rb
  - lib/arachnid2/version.rb
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.0.6
+ rubygems_version: 3.1.2
  signing_key:
  specification_version: 4
  summary: A simple, fast web crawler