arachnid2 0.3.5 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5e25353806a447177f129c56d4c57c38c70223849f2bbd858c932f3f4ec8a4ef
-  data.tar.gz: d2725c9981671ee010692d82b97801ccc00a1f2b28663fb72b23bc08f6be890e
+  metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+  data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
 SHA512:
-  metadata.gz: 52a0b49101ca136ddee4c4ae8e976bd81cc9f3c559df3a94463bee7f42a2e4ce591330e2a587f5285bac98be52723ab518870ac8a8197413df8cd06267892858
-  data.tar.gz: 2514be62a0ae76a2d594f14d5ad8b66a45696bafa455a6347bb04b07ae99e48f322936d0afb6bd9e025c67ac9ce52213519f398a8f5deec54e508d6c4f1b4d84
+  metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+  data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
Gemfile.lock CHANGED
@@ -1,11 +1,11 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.3.5)
+    arachnid2 (0.3.9)
       addressable
       adomain
       bloomfilter-rb
-      nokogiri (>= 1.8.5)
+      nokogiri (>= 1.10.4)
       typhoeus
       watir
       webdriver-user-agent (>= 7.6)
@@ -14,30 +14,30 @@ PATH
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.6.0)
-      public_suffix (>= 2.0.2, < 4.0)
-    adomain (0.1.1)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    adomain (0.2.3)
       addressable (~> 2.5)
+      logger
     bloomfilter-rb (2.1.1)
       redis
-    childprocess (0.9.0)
-      ffi (~> 1.0, >= 1.0.11)
+    childprocess (3.0.0)
     diff-lcs (1.3)
     ethon (0.12.0)
       ffi (>= 1.3.0)
     facets (3.1.0)
-    ffi (1.10.0)
-    json (2.2.0)
+    ffi (1.12.2)
+    json (2.3.0)
+    logger (1.4.2)
     mini_portile2 (2.4.0)
-    net_http_ssl_fix (0.0.10)
-    nokogiri (1.10.1)
+    nokogiri (1.10.9)
       mini_portile2 (~> 2.4.0)
-    os (1.0.0)
+    os (1.0.1)
     psych (3.1.0)
-    public_suffix (3.0.3)
-    rake (10.5.0)
-    redis (4.1.0)
-    regexp_parser (1.3.0)
+    public_suffix (4.0.3)
+    rake (13.0.1)
+    redis (4.1.3)
+    regexp_parser (1.7.0)
     rspec (3.8.0)
       rspec-core (~> 3.8.0)
       rspec-expectations (~> 3.8.0)
@@ -51,10 +51,10 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.8.0)
     rspec-support (3.8.0)
-    rubyzip (1.2.2)
-    selenium-webdriver (3.141.0)
-      childprocess (~> 0.5)
-      rubyzip (~> 1.2, >= 1.2.2)
+    rubyzip (2.2.0)
+    selenium-webdriver (3.142.7)
+      childprocess (>= 0.5, < 4.0)
+      rubyzip (>= 1.2.2)
     typhoeus (1.3.1)
       ethon (>= 0.9.0)
     watir (6.16.5)
@@ -66,11 +66,10 @@ GEM
       os
       psych
       selenium-webdriver (>= 3.4.0)
-    webdrivers (3.6.0)
-      net_http_ssl_fix
+    webdrivers (4.2.0)
       nokogiri (~> 1.6)
-      rubyzip (~> 1.0)
-      selenium-webdriver (~> 3.0)
+      rubyzip (>= 1.3.0)
+      selenium-webdriver (>= 3.0, < 4.0)
 
 PLATFORMS
   ruby
@@ -78,8 +77,8 @@ PLATFORMS
 DEPENDENCIES
   arachnid2!
   bundler (~> 1.16)
-  rake (~> 10.0)
+  rake (>= 12.3.3)
  rspec (~> 3.0)
 
 BUNDLED WITH
-   1.16.5
+   1.17.3
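
Context for the lockfile churn: the new `nokogiri (>= 1.10.4)` and `rake (>= 12.3.3)` floors match the patched versions for CVE-2019-5477 (command injection in Nokogiri's CSS parser) and CVE-2020-8130 (command injection via `Rake::FileList`), so much of this release reads as a security-driven dependency refresh; `net_http_ssl_fix` disappears because `webdrivers` 4.x no longer depends on it.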
data/README.md CHANGED
@@ -186,6 +186,36 @@ with_watir = true
 Arachnid2.new(url).crawl(opts, with_watir)
 ```
 
+Arachnid2 has base defaults which you might want to address when
+employing Watir.
+
+* First, the default crawl time is 15 seconds.
+As browser page loads can take this long, you will probably want to
+set a higher crawl time.
+* Simply storing the browser is not a great idea, since it will
+be inaccessible after it is closed. Instead, consider nabbing the
+HTML, cookies, or whatever content is required during the crawl.
+* Finally, note that Firefox is the default browser.
+
+
+```ruby
+require 'arachnid2'
+
+with_watir = true
+responses = []
+url = "http://maximumfun.org"
+max = 60
+browser = :chrome
+opts = {time_box: max, browser_type: browser}
+
+spider = Arachnid2.new(url)
+spider.crawl(opts, with_watir) do |response|
+  response.body.wait_until(&:present?)
+  responses << response.body.html if response.body.present?
+end
+
+```
+
 #### Options
 
 See the Typhoeus options above &mdash; most apply to Watir as well, with
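
A gloss on the new README example: although the block parameter is named `response`, the Watir path yields the live `Watir::Browser` itself, which is why `wait_until(&:present?)` and `.html` work on `body`. Following the README's advice to capture content during the crawl, a sketch like the following (URL and options reused from the example above, not prescribed by the gem) could collect cookies as well:

```ruby
require 'arachnid2'

cookies = []
spider  = Arachnid2.new("http://maximumfun.org")

spider.crawl({time_box: 60, browser_type: :chrome}, true) do |browser|
  # The yielded object is the live Watir::Browser; grab what you need
  # now, because it is closed (and unusable) once the crawl ends.
  cookies.concat(browser.cookies.to_a)
end
```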
arachnid2.gemspec CHANGED
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
 
   spec.add_development_dependency "bundler", "~> 1.16"
-  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec", "~> 3.0"
 
   spec.add_dependency "webdriver-user-agent", ">= 7.6"
@@ -32,5 +32,5 @@ Gem::Specification.new do |spec|
   spec.add_dependency "bloomfilter-rb"
   spec.add_dependency "adomain"
   spec.add_dependency "addressable"
-  spec.add_dependency "nokogiri", ">= 1.8.5"
+  spec.add_dependency "nokogiri", ">= 1.10.4"
 end
lib/arachnid2.rb CHANGED
@@ -1,5 +1,5 @@
 require "arachnid2/version"
-require "arachnid2/cached_arachnid_responses"
+require "arachnid2/cached_responses"
 require "arachnid2/exoskeleton"
 require "arachnid2/typhoeus"
 require "arachnid2/watir"
lib/arachnid2/cached_arachnid_responses.rb → lib/arachnid2/cached_responses.rb RENAMED
@@ -1,6 +1,6 @@
 require 'net/http'
 require 'json'
-module CachedArachnidResponses
+module CachedResponses
   CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
   def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
       body = ::JSON.parse(response.body)
       responses_list = Base64.decode64(body['encrypted_response'])
-      return Marshal.load responses_list # here we get array of Typhoeus::Response
+      return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
     end
   rescue StandardError
     nil
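
For readers skimming the rename: `CachedResponses` fetches Marshal-dumped, Base64-wrapped response arrays from the service named by `ARACHNID_CACHED_SERVICE_ADDRESS`. A minimal sketch of the payload round-trip that `load_data` implies — plain hashes stand in for `Typhoeus::Response` objects; only the `encrypted_response` field name comes from the code above:

```ruby
require 'base64'
require 'json'

# Stand-ins for cached Typhoeus::Response objects
responses = [{ code: 200, body: "<html>...</html>" }]

# What a cache write would plausibly store...
payload = { 'encrypted_response' => Base64.encode64(Marshal.dump(responses)) }.to_json

# ...and the decode path mirrored from load_data
body     = JSON.parse(payload)
restored = Marshal.load(Base64.decode64(body['encrypted_response']))
restored.first[:code] # => 200

# Caveat: Marshal.load must only ever see trusted data; deserializing
# untrusted input is a well-known code-execution risk.
```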
lib/arachnid2/exoskeleton.rb CHANGED
@@ -10,7 +10,7 @@ class Arachnid2
     end
 
     def process(url, html)
-      return false unless Adomain["#{url}"].include? @domain
+      return false unless Adomain["#{url}"]&.include? @domain
 
       extract_hrefs(html)
     end
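
The one-character change above is a nil guard: `Adomain[...]` returns a domain String when it can parse one, and the safe-navigation operator keeps `process` returning falsey instead of raising when it cannot. A sketch of the two behaviors (the nil-returning input is illustrative, assuming Adomain returns nil for unparseable strings, which this fix implies):

```ruby
require 'adomain'

Adomain["https://www.example.com"] # => a String domain
Adomain["not a parseable url"]     # => nil (illustrative input)

# Before: nil.include?(@domain) raised NoMethodError and aborted the crawl.
# After:  nil&.include?(@domain) evaluates to nil, so
#         `return false unless ...` simply returns false.
nil&.include?("example.com") # => nil
```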
lib/arachnid2/typhoeus.rb CHANGED
@@ -1,6 +1,6 @@
 class Arachnid2
   class Typhoeus
-    include CachedArachnidResponses
+    include CachedResponses
     include Arachnid2::Exoskeleton
 
     def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
         max_concurrency.times do
           q = @global_queue.shift
 
-          break if @global_visited.size >= crawl_options[:max_urls] || \
-            Time.now > crawl_options[:time_limit] || \
-            memory_danger?
-
+          break if time_to_stop?
           @global_visited.insert(q)
 
-          request = ::Typhoeus::Request.new(q, request_options)
-
-          data = load_data(@url, opts)
-          data.each { |response| yield response } and return unless data.nil?
-
-          request.on_complete do |response|
-            @cached_data.push(response)
-            links = process(response.effective_url, response.body)
-            next unless links
-
-            yield response
-
-            vacuum(links, response.effective_url)
-          end
+          found_in_cache = use_cache(q, opts, &Proc.new)
+          return if found_in_cache
 
-          @hydra.queue(request)
+          request = ::Typhoeus::Request.new(q, request_options)
+          requestable = after_request(request, &Proc.new)
+          @hydra.queue(request) if requestable
         end # max_concurrency.times do
 
         @hydra.run
-
       end # until @global_queue.empty?
-      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
     ensure
       @cookie_file.close! if @cookie_file
     end # def crawl(opts = {})
 
     private
+    def after_request(request)
+      request.on_complete do |response|
+        cacheable = use_response(response, &Proc.new)
+        return unless cacheable
+
+        put_cached_data(response.effective_url, @options, response)
+      end
+
+      true
+    end
+
+    def use_response(response)
+      links = process(response.effective_url, response.body)
+      return unless links
+
+      yield response
+
+      vacuum(links, response.effective_url)
+      true
+    end
+
+    def use_cache(url, options)
+      data = load_data(url, options)
+      use_response(data, &Proc.new) if data
+
+      data
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
     def typhoeus_preflight
       @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
       typhoeus_proxy_options
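
The refactor extracts the loop body into `after_request`, `use_response`, `use_cache`, and `time_to_stop?`, and threads the caller's block through them with `&Proc.new`: inside a method, `Proc.new` called without a block captures the block passed to that method. A minimal sketch of the idiom with hypothetical names — note Ruby 2.7 deprecates this form and 3.0 removes it, in favor of an explicit `&block` parameter:

```ruby
def crawl
  # Proc.new with no block grabs crawl's own block,
  # so the helper can yield back to it
  each_page(&Proc.new)
end

def each_page
  %w[page1 page2].each { |page| yield page }
end

crawl { |page| puts page }
# page1
# page2
```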
lib/arachnid2/version.rb CHANGED
@@ -1,3 +1,3 @@
 class Arachnid2
-  VERSION = "0.3.5"
+  VERSION = "0.4.0"
 end
lib/arachnid2/watir.rb CHANGED
@@ -13,46 +13,17 @@ class Arachnid2
     def crawl(opts)
       preflight(opts)
       watir_preflight
+      @already_retried = false
 
       until @global_queue.empty?
-        @already_retried = false
         q = @global_queue.shift
+        links = nil
 
-        break if @global_visited.size >= crawl_options[:max_urls]
-        break if Time.now > crawl_options[:time_limit]
-        break if memory_danger?
+        break if time_to_stop?
 
         @global_visited.insert(q)
 
-        begin
-          begin
-            browser.goto q
-          rescue Selenium::WebDriver::Error::UnknownError => e
-            # Firefox and Selenium, in their infinite wisdom
-            # raise an error when a page cannot be loaded.
-            # At the time of writing this, the page at
-            # thewirecutter.com/cars/accessories-auto
-            # causes such an issue (too many redirects).
-            # This error handling moves us on from those pages.
-            raise e unless e.message =~ /.*Reached error page.*/i
-            next
-          end
-          links = process(browser.url, browser.body.html)
-          next unless links
-
-          yield browser
-
-          vacuum(links, browser.url)
-        rescue => e
-          raise e if @already_retried
-          raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
-          @browser.close if @browser rescue nil
-          @headless.destroy if @headless rescue nil
-          @browser = nil
-          @already_retried = true
-          retry
-        end
-
+        make_request(q, &Proc.new)
      end # until @global_queue.empty?
    ensure
      @browser.close if @browser rescue nil
@@ -60,6 +31,74 @@ class Arachnid2
     end
 
     private
+    def make_request(q)
+      begin
+        links = browse_links(q, &Proc.new)
+        return unless links
+
+        vacuum(links, browser.url)
+      rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is ignoring an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
+      rescue => e
+        raise e if raise_before_retry?(e.class)
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is retrying once after an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
+        e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
+        reset_for_retry
+      end
+    end
+
+    def browse_links(url)
+      return unless navigate(url)
+
+      yield browser
+
+      process(browser.url, browser.body.html) if browser.body.exists?
+    end
+
+    def navigate(url)
+      begin
+        browser.goto url
+      rescue Selenium::WebDriver::Error::UnknownError => e
+        # Firefox and Selenium, in their infinite wisdom
+        # raise an error when a page cannot be loaded.
+        # At the time of writing this, the page at
+        # thewirecutter.com/cars/accessories-auto
+        # causes such an issue (too many redirects).
+        # This error handling moves us on from those pages.
+        raise e unless e.message =~ /.*Reached error page.*/i
+        return
+      end
+
+      true
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
+    def raise_before_retry?(klass)
+      @already_retried || \
+        "#{klass}".include?("Selenium") || \
+        "#{klass}".include?("Watir")
+    end
+
+    def reset_for_retry
+      @browser.close if @browser rescue nil
+      @headless.destroy if @headless rescue nil
+      @driver.quit if @headless rescue nil
+      @driver = nil
+      @browser = nil
+      @already_retried = true
+    end
+
     def browser
       unless @browser
         behead if @make_headless
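
Two details in the new retry path deserve a close read. First, despite the "retrying once" message, `make_request` contains no `retry`: `reset_for_retry` tears the browser down and flags `@already_retried`, and the next queue iteration rebuilds a fresh browser lazily via the memoized `browser` method, so the URL that failed is skipped rather than re-fetched. Note also that the retry condition inverted: 0.3.5 retried only Selenium/Watir errors, while `raise_before_retry?` now re-raises those and retries everything else. Second, `@driver.quit if @headless rescue nil` guards on `@headless` where `if @driver` looks intended, though the trailing `rescue nil` swallows any fallout either way. A simplified sketch of the lazy teardown/rebuild cycle, with names reduced for illustration:

```ruby
class Crawler
  def browser
    # nil after reset_for_retry, so the next access builds a new browser
    @browser ||= Watir::Browser.new(:firefox)
  end

  def reset_for_retry
    @browser.close if @browser rescue nil
    @browser = nil
    @already_retried = true
  end
end
```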
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  version: 0.4.0
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-03-18 00:00:00.000000000 Z
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -28,16 +28,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -156,14 +156,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.8.5
+        version: 1.10.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.8.5
+        version: 1.10.4
 description:
 email:
 - scnissen@gmail.com
@@ -184,7 +184,7 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/cached_arachnid_responses.rb
+- lib/arachnid2/cached_responses.rb
 - lib/arachnid2/exoskeleton.rb
 - lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.7
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A simple, fast web crawler