arachnid2 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +25 -26
- data/README.md +30 -0
- data/arachnid2.gemspec +2 -2
- data/lib/arachnid2.rb +1 -1
- data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} +2 -2
- data/lib/arachnid2/exoskeleton.rb +1 -1
- data/lib/arachnid2/typhoeus.rb +41 -22
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +72 -33
- metadata +10 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+  data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+  data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'

data/Gemfile.lock
CHANGED
@@ -1,11 +1,11 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.3.
+    arachnid2 (0.3.9)
       addressable
       adomain
       bloomfilter-rb
-      nokogiri (>= 1.
+      nokogiri (>= 1.10.4)
       typhoeus
       watir
       webdriver-user-agent (>= 7.6)
@@ -14,30 +14,30 @@ PATH
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
-      public_suffix (>= 2.0.2, <
-    adomain (0.
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    adomain (0.2.3)
       addressable (~> 2.5)
+      logger
     bloomfilter-rb (2.1.1)
       redis
-    childprocess (0.
-      ffi (~> 1.0, >= 1.0.11)
+    childprocess (3.0.0)
     diff-lcs (1.3)
     ethon (0.12.0)
       ffi (>= 1.3.0)
     facets (3.1.0)
-    ffi (1.
-    json (2.
+    ffi (1.12.2)
+    json (2.3.0)
+    logger (1.4.2)
     mini_portile2 (2.4.0)
-
-    nokogiri (1.10.1)
+    nokogiri (1.10.9)
       mini_portile2 (~> 2.4.0)
-    os (1.0.
+    os (1.0.1)
     psych (3.1.0)
-    public_suffix (
-    rake (
-    redis (4.1.
-    regexp_parser (1.
+    public_suffix (4.0.3)
+    rake (13.0.1)
+    redis (4.1.3)
+    regexp_parser (1.7.0)
     rspec (3.8.0)
       rspec-core (~> 3.8.0)
       rspec-expectations (~> 3.8.0)
@@ -51,10 +51,10 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.8.0)
     rspec-support (3.8.0)
-    rubyzip (
-    selenium-webdriver (3.
-      childprocess (
-      rubyzip (
+    rubyzip (2.2.0)
+    selenium-webdriver (3.142.7)
+      childprocess (>= 0.5, < 4.0)
+      rubyzip (>= 1.2.2)
     typhoeus (1.3.1)
       ethon (>= 0.9.0)
     watir (6.16.5)
@@ -66,11 +66,10 @@ GEM
       os
       psych
       selenium-webdriver (>= 3.4.0)
-    webdrivers (
-      net_http_ssl_fix
+    webdrivers (4.2.0)
       nokogiri (~> 1.6)
-      rubyzip (
-      selenium-webdriver (
+      rubyzip (>= 1.3.0)
+      selenium-webdriver (>= 3.0, < 4.0)
 
 PLATFORMS
   ruby
@@ -78,8 +77,8 @@ PLATFORMS
 DEPENDENCIES
   arachnid2!
   bundler (~> 1.16)
-  rake (
+  rake (>= 12.3.3)
   rspec (~> 3.0)
 
 BUNDLED WITH
-   1.
+   1.17.3

data/README.md
CHANGED
@@ -186,6 +186,36 @@ with_watir = true
 Arachnid2.new(url).crawl(opts, with_watir)
 ```
 
+Arachnid2 has base defaults which you might want to address when
+employing Watir.
+
+* First, the default crawl time is 15 seconds.
+As browser page loads can take this long, you will probably want to
+set a higher crawl time.
+* Simply storing the browser is not a great idea, since it will
+be inaccessible after it is closed. Instead, consider nabbing the
+HTML, cookies, or whatever content is required during the crawl.
+* Finally, note that Firefox is the default browser.
+
+
+```ruby
+require 'arachnid2'
+
+with_watir = true
+responses = []
+url = "http://maximumfun.org"
+max = 60
+browser = :chrome
+opts = {time_box: max, browser_type: browser}
+
+spider = Arachnid2.new(url)
+spider.crawl(opts, with_watir) do |response|
+  response.body.wait_until(&:present?)
+  responses << response.body.html if response.body.present?
+end
+
+```
+
 #### Options
 
 See the Typhoeus options above — most apply to Watir as well, with

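The README example added above collects HTML strings during the crawl. As a complement, here is a minimal, hypothetical sketch (not from the gem's README) of its advice to capture cookies rather than the browser object itself, since the browser is closed once the crawl finishes. It assumes the same `crawl(opts, with_watir)` API shown above and Watir's `Browser#cookies`.

```ruby
require 'arachnid2'

# Hypothetical sketch: collect cookies instead of keeping the Watir browser.
cookies = []
opts = { time_box: 60 } # the default crawl time is 15 seconds, per the README
Arachnid2.new("http://maximumfun.org").crawl(opts, true) do |browser|
  cookies.concat(browser.cookies.to_a) # Watir::Cookies#to_a returns cookie hashes
end
```
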
data/arachnid2.gemspec
CHANGED
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
 
   spec.add_development_dependency "bundler", "~> 1.16"
-  spec.add_development_dependency "rake", "
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec", "~> 3.0"
 
   spec.add_dependency "webdriver-user-agent", ">= 7.6"
@@ -32,5 +32,5 @@ Gem::Specification.new do |spec|
   spec.add_dependency "bloomfilter-rb"
   spec.add_dependency "adomain"
   spec.add_dependency "addressable"
-  spec.add_dependency "nokogiri", ">= 1.
+  spec.add_dependency "nokogiri", ">= 1.10.4"
 end

data/lib/arachnid2.rb
CHANGED
data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb}
CHANGED
@@ -1,6 +1,6 @@
 require 'net/http'
 require 'json'
-module
+module CachedResponses
   CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
   def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
       body = ::JSON.parse(response.body)
       responses_list = Base64.decode64(body['encrypted_response'])
-      return Marshal.load responses_list # here we get
+      return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
     end
   rescue StandardError
     nil

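The renamed CachedResponses module reads a cache-service address from ENV['ARACHNID_CACHED_SERVICE_ADDRESS'] and, per the hunk above, decodes cached crawl results from a JSON body whose 'encrypted_response' field holds a Base64 blob of Marshal-dumped Typhoeus responses. A standalone sketch of that decode path follows; the helper name and its input are purely illustrative, not part of the gem.

```ruby
require 'json'
require 'base64'

# Illustrative helper (not gem code): mirrors the decode steps shown in
# CachedResponses#load_data above.
def decode_cached_responses(http_body)
  body = JSON.parse(http_body)
  blob = Base64.decode64(body['encrypted_response'])
  Marshal.load(blob) # an Array of Typhoeus::Response objects, per the diff's comment
rescue StandardError
  nil
end
```
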
data/lib/arachnid2/typhoeus.rb
CHANGED
@@ -1,6 +1,6 @@
 class Arachnid2
   class Typhoeus
-    include
+    include CachedResponses
     include Arachnid2::Exoskeleton
 
     def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
         max_concurrency.times do
           q = @global_queue.shift
 
-          break if
-            Time.now > crawl_options[:time_limit] || \
-            memory_danger?
-
+          break if time_to_stop?
           @global_visited.insert(q)
 
-
-
-          data = load_data(@url, opts)
-          data.each { |response| yield response } and return unless data.nil?
-
-          request.on_complete do |response|
-            @cached_data.push(response)
-            links = process(response.effective_url, response.body)
-            next unless links
-
-            yield response
-
-            vacuum(links, response.effective_url)
-          end
+          found_in_cache = use_cache(q, opts, &Proc.new)
+          return if found_in_cache
 
-
+          request = ::Typhoeus::Request.new(q, request_options)
+          requestable = after_request(request, &Proc.new)
+          @hydra.queue(request) if requestable
         end # max_concurrency.times do
 
         @hydra.run
-
       end # until @global_queue.empty?
-      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
     ensure
       @cookie_file.close! if @cookie_file
     end # def crawl(opts = {})
 
     private
+    def after_request(request)
+      request.on_complete do |response|
+        cacheable = use_response(response, &Proc.new)
+        return unless cacheable
+
+        put_cached_data(response.effective_url, @options, response)
+      end
+
+      true
+    end
+
+    def use_response(response)
+      links = process(response.effective_url, response.body)
+      return unless links
+
+      yield response
+
+      vacuum(links, response.effective_url)
+      true
+    end
+
+    def use_cache(url, options)
+      data = load_data(url, options)
+      use_response(data, &Proc.new) if data
+
+      data
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
     def typhoeus_preflight
       @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
       typhoeus_proxy_options

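The refactored crawl loop forwards the caller's block into the new private helpers with `&Proc.new`: inside a method that was called with a block, `Proc.new` with no block of its own captures that block, so `use_cache`, `after_request`, and `use_response` can `yield` responses on `crawl`'s behalf. Below is a minimal, self-contained illustration of the pattern, not gem code; note that this implicit-block form of `Proc.new` was deprecated in Ruby 2.7 and removed in 3.0, so it suits the Ruby 2.x era of this release.

```ruby
# Minimal sketch of the &Proc.new block-forwarding pattern used in the diff above.
# Under Ruby 3.x you would pass an explicit &block parameter instead.
class Pipeline
  def run
    3.times { |i| handle(i, &Proc.new) } # forwards run's block to handle
  end

  private

  def handle(item)
    yield item # executes the block originally given to run
  end
end

Pipeline.new.run { |i| puts "processed #{i}" }
# => processed 0, processed 1, processed 2
```
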
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2/watir.rb
CHANGED
@@ -13,46 +13,17 @@ class Arachnid2
     def crawl(opts)
       preflight(opts)
       watir_preflight
+      @already_retried = false
 
       until @global_queue.empty?
-        @already_retried = false
         q = @global_queue.shift
+        links = nil
 
-        break if
-        break if Time.now > crawl_options[:time_limit]
-        break if memory_danger?
+        break if time_to_stop?
 
         @global_visited.insert(q)
 
-
-        begin
-          browser.goto q
-        rescue Selenium::WebDriver::Error::UnknownError => e
-          # Firefox and Selenium, in their infinite wisdom
-          # raise an error when a page cannot be loaded.
-          # At the time of writing this, the page at
-          # thewirecutter.com/cars/accessories-auto
-          # causes such an issue (too many redirects).
-          # This error handling moves us on from those pages.
-          raise e unless e.message =~ /.*Reached error page.*/i
-          next
-        end
-        links = process(browser.url, browser.body.html)
-        next unless links
-
-        yield browser
-
-        vacuum(links, browser.url)
-      rescue => e
-        raise e if @already_retried
-        raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
-        @browser.close if @browser rescue nil
-        @headless.destroy if @headless rescue nil
-        @browser = nil
-        @already_retried = true
-        retry
-      end
-
+        make_request(q, &Proc.new)
       end # until @global_queue.empty?
     ensure
       @browser.close if @browser rescue nil
@@ -60,6 +31,74 @@ class Arachnid2
     end
 
     private
+    def make_request(q)
+      begin
+        links = browse_links(q, &Proc.new)
+        return unless links
+
+        vacuum(links, browser.url)
+      rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is ignoring an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
+      rescue => e
+        raise e if raise_before_retry?(e.class)
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is retrying once after an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
+        e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
+        reset_for_retry
+      end
+    end
+
+    def browse_links(url)
+      return unless navigate(url)
+
+      yield browser
+
+      process(browser.url, browser.body.html) if browser.body.exists?
+    end
+
+    def navigate(url)
+      begin
+        browser.goto url
+      rescue Selenium::WebDriver::Error::UnknownError => e
+        # Firefox and Selenium, in their infinite wisdom
+        # raise an error when a page cannot be loaded.
+        # At the time of writing this, the page at
+        # thewirecutter.com/cars/accessories-auto
+        # causes such an issue (too many redirects).
+        # This error handling moves us on from those pages.
+        raise e unless e.message =~ /.*Reached error page.*/i
+        return
+      end
+
+      true
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
+    def raise_before_retry?(klass)
+      @already_retried || \
+        "#{klass}".include?("Selenium") || \
+        "#{klass}".include?("Watir")
+    end
+
+    def reset_for_retry
+      @browser.close if @browser rescue nil
+      @headless.destroy if @headless rescue nil
+      @driver.quit if @headless rescue nil
+      @driver = nil
+      @browser = nil
+      @already_retried = true
+    end
+
     def browser
       unless @browser
         behead if @make_headless

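Both crawlers now stop via `time_to_stop?`, which trips on a visited-URL cap (`crawl_options[:max_urls]`), the existing time limit, or memory pressure. A hedged usage sketch follows; the `:max_urls` option key is an assumption inferred from `crawl_options[:max_urls]` in the diff and is not documented in this release's README, and `:time_box` is taken from the README example above.

```ruby
require 'arachnid2'

opts = {
  time_box: 90,   # seconds for the crawl's time limit, as in the README example
  max_urls: 500   # assumed key behind crawl_options[:max_urls] in time_to_stop?
}

# Watir-based crawl (second argument true), stopping at whichever limit hits first.
Arachnid2.new("http://maximumfun.org").crawl(opts, true) do |browser|
  puts browser.url
end
```
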
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -28,16 +28,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -156,14 +156,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.10.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.10.4
 description:
 email:
 - scnissen@gmail.com
@@ -184,7 +184,7 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/
+- lib/arachnid2/cached_responses.rb
 - lib/arachnid2/exoskeleton.rb
 - lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-rubygems_version: 2.7.7
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A simple, fast web crawler