arachnid2 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -0
- data/lib/arachnid2.rb +1 -1
- data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} +2 -2
- data/lib/arachnid2/exoskeleton.rb +1 -1
- data/lib/arachnid2/typhoeus.rb +41 -22
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +52 -25
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+  data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+  data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
data/README.md
CHANGED
@@ -186,6 +186,36 @@ with_watir = true
 Arachnid2.new(url).crawl(opts, with_watir)
 ```
 
+Arachnid2 has base defaults which you might want to address when
+employing Watir.
+
+* First, the default crawl time is 15 seconds.
+As browser page loads can take this long, you will probably want to
+set a higher crawl time.
+* Simply storing the browser is not a great idea, since it will
+be inaccessible after it is closed. Instead, consider nabbing the
+HTML, cookies, or whatever content is required during the crawl.
+* Finally, note that Firefox is the default browser.
+
+
+```ruby
+require 'arachnid2'
+
+with_watir = true
+responses = []
+url = "http://maximumfun.org"
+max = 60
+browser = :chrome
+opts = {time_box: max, browser_type: browser}
+
+spider = Arachnid2.new(url)
+spider.crawl(opts, with_watir) do |response|
+  response.body.wait_until(&:present?)
+  responses << response.body.html if response.body.present?
+end
+
+```
+
 #### Options
 
 See the Typhoeus options above — most apply to Watir as well, with
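The README's new advice about nabbing content during the crawl matters because the new Watir implementation closes the browser (and destroys the headless session) in an `ensure` block once crawling finishes. A minimal sketch of capturing both HTML and cookies inside the block; the URL and option values are illustrative:

```ruby
require 'arachnid2'

htmls   = []
cookies = []
opts    = { time_box: 60, browser_type: :firefox }

Arachnid2.new("http://maximumfun.org").crawl(opts, true) do |browser|
  # The yielded object is the Watir browser; copy out whatever you need
  # now, because it is no longer accessible after the crawl ends.
  htmls << browser.body.html if browser.body.exists?
  cookies.concat(browser.cookies.to_a)
end
```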
data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb}
CHANGED
@@ -1,6 +1,6 @@
 require 'net/http'
 require 'json'
-module CachedArachnidResponses
+module CachedResponses
   CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
   def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
       body = ::JSON.parse(response.body)
       responses_list = Base64.decode64(body['encrypted_response'])
-      return Marshal.load responses_list # here we get
+      return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
     end
   rescue StandardError
     nil
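For reference, the renamed module only does anything when a cache service address is present in the environment; `CACHE_SERVICE_URL` is read from `ENV` (and frozen) when the library is loaded. A minimal sketch of opting in, with a hypothetical service address:

```ruby
# Hypothetical cache service address; any endpoint implementing the gem's
# cache API would go here. Set it before the require, because
# CACHE_SERVICE_URL is captured from ENV when the file is loaded.
ENV['ARACHNID_CACHED_SERVICE_ADDRESS'] = 'http://cache.example.com'

require 'arachnid2'

responses = []
opts = { time_box: 30 }

Arachnid2.new("http://maximumfun.org").crawl(opts) do |response|
  responses << response.body
end
```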
data/lib/arachnid2/typhoeus.rb
CHANGED
@@ -1,6 +1,6 @@
 class Arachnid2
   class Typhoeus
-    include CachedArachnidResponses
+    include CachedResponses
     include Arachnid2::Exoskeleton
 
     def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
         max_concurrency.times do
           q = @global_queue.shift
 
-          break if
-            Time.now > crawl_options[:time_limit] || \
-            memory_danger?
-
+          break if time_to_stop?
           @global_visited.insert(q)
 
-
-
-          data = load_data(@url, opts)
-          data.each { |response| yield response } and return unless data.nil?
-
-          request.on_complete do |response|
-            @cached_data.push(response)
-            links = process(response.effective_url, response.body)
-            next unless links
-
-            yield response
-
-            vacuum(links, response.effective_url)
-          end
+          found_in_cache = use_cache(q, opts, &Proc.new)
+          return if found_in_cache
 
-
+          request = ::Typhoeus::Request.new(q, request_options)
+          requestable = after_request(request, &Proc.new)
+          @hydra.queue(request) if requestable
         end # max_concurrency.times do
 
         @hydra.run
-
       end # until @global_queue.empty?
-      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
     ensure
       @cookie_file.close! if @cookie_file
     end # def crawl(opts = {})
 
     private
+    def after_request(request)
+      request.on_complete do |response|
+        cacheable = use_response(response, &Proc.new)
+        return unless cacheable
+
+        put_cached_data(response.effective_url, @options, response)
+      end
+
+      true
+    end
+
+    def use_response(response)
+      links = process(response.effective_url, response.body)
+      return unless links
+
+      yield response
+
+      vacuum(links, response.effective_url)
+      true
+    end
+
+    def use_cache(url, options)
+      data = load_data(url, options)
+      use_response(data, &Proc.new) if data
+
+      data
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
     def typhoeus_preflight
       @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
       typhoeus_proxy_options
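Both crawlers now stop when a URL budget is exhausted, not just on the time limit or memory pressure. A minimal sketch of driving that from crawl options; `:max_urls` is an assumed option name inferred from `crawl_options[:max_urls]` in the new `time_to_stop?`, so treat the key as illustrative:

```ruby
require 'arachnid2'

# Illustrative values: :time_box is documented in the README, while
# :max_urls is the assumed key backing the new URL-count stop condition.
opts = { time_box: 30, max_urls: 100 }

responses = []
Arachnid2.new("http://maximumfun.org").crawl(opts) do |response|
  responses << response.body
end
```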
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2/watir.rb
CHANGED
@@ -19,44 +19,71 @@ class Arachnid2
         q = @global_queue.shift
         links = nil
 
-        break if
-        break if Time.now > crawl_options[:time_limit]
-        break if memory_danger?
+        break if time_to_stop?
 
         @global_visited.insert(q)
 
+        make_request(q, &Proc.new)
+      end # until @global_queue.empty?
+    ensure
+      @browser.close if @browser rescue nil
+      @headless.destroy if @headless rescue nil
+    end
+
+    private
+    def make_request(q)
       begin
-
-
-      rescue Selenium::WebDriver::Error::UnknownError => e
-        # Firefox and Selenium, in their infinite wisdom
-        # raise an error when a page cannot be loaded.
-        # At the time of writing this, the page at
-        # thewirecutter.com/cars/accessories-auto
-        # causes such an issue (too many redirects).
-        # This error handling moves us on from those pages.
-        raise e unless e.message =~ /.*Reached error page.*/i
-        next
-      end
-      links = process(browser.url, browser.body.html) if browser.body.exists?
-      next unless links
-
-      yield browser
+        links = browse_links(q, &Proc.new)
+        return unless links
 
         vacuum(links, browser.url)
       rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is ignoring an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
       rescue => e
         raise e if raise_before_retry?(e.class)
+        msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+              "is retrying once after an error: " \
+              "#{e.class} - #{e.message}"
+        puts msg
+        e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
         reset_for_retry
       end
+    end
 
-
-
-
-
-
+    def browse_links(url)
+      return unless navigate(url)
+
+      yield browser
+
+      process(browser.url, browser.body.html) if browser.body.exists?
+    end
+
+    def navigate(url)
+      begin
+        browser.goto url
+      rescue Selenium::WebDriver::Error::UnknownError => e
+        # Firefox and Selenium, in their infinite wisdom
+        # raise an error when a page cannot be loaded.
+        # At the time of writing this, the page at
+        # thewirecutter.com/cars/accessories-auto
+        # causes such an issue (too many redirects).
+        # This error handling moves us on from those pages.
+        raise e unless e.message =~ /.*Reached error page.*/i
+        return
+      end
+
+      true
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
 
-    private
     def raise_before_retry?(klass)
       @already_retried || \
       "#{klass}".include?("Selenium") || \
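A behavioural note that falls out of this refactor: `make_request` logs and skips window and read-timeout errors, retries other errors once via `reset_for_retry`, and re-raises Selenium errors or anything that fails again after the retry. A hedged sketch of guarding a Watir crawl accordingly; the rescue class and handling shown are the caller's choice, not something the gem prescribes:

```ruby
require 'arachnid2'

begin
  Arachnid2.new("http://maximumfun.org").crawl({ time_box: 60 }, true) do |browser|
    # inspect the yielded Watir browser here
  end
rescue Selenium::WebDriver::Error::WebDriverError => e
  # Selenium errors, and errors that recur after the single retry,
  # still reach the caller.
  warn "crawl aborted: #{e.class} - #{e.message}"
end
```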
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.3.9
+  version: 0.4.0
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -184,7 +184,7 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/cached_arachnid_responses.rb
+- lib/arachnid2/cached_responses.rb
 - lib/arachnid2/exoskeleton.rb
 - lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A simple, fast web crawler