arachnid2 0.3.9 → 0.4.0
- checksums.yaml +4 -4
- data/README.md +30 -0
- data/lib/arachnid2.rb +1 -1
- data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} +2 -2
- data/lib/arachnid2/exoskeleton.rb +1 -1
- data/lib/arachnid2/typhoeus.rb +41 -22
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +52 -25
- metadata +4 -4
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+  data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+  data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
data/README.md CHANGED

@@ -186,6 +186,36 @@ with_watir = true
 Arachnid2.new(url).crawl(opts, with_watir)
 ```
 
+Arachnid2 has base defaults which you might want to address when
+employing Watir.
+
+* First, the default crawl time is 15 seconds.
+  As browser page loads can take this long, you will probably want to
+  set a higher crawl time.
+* Simply storing the browser is not a great idea, since it will
+  be inaccessible after it is closed. Instead, consider nabbing the
+  HTML, cookies, or whatever content is required during the crawl.
+* Finally, note that Firefox is the default browser.
+
+```ruby
+require 'arachnid2'
+
+with_watir = true
+responses = []
+url = "http://maximumfun.org"
+max = 60
+browser = :chrome
+opts = {time_box: max, browser_type: browser}
+
+spider = Arachnid2.new(url)
+spider.crawl(opts, with_watir) do |response|
+  response.body.wait_until(&:present?)
+  responses << response.body.html if response.body.present?
+end
+
+```
+
 #### Options
 
 See the Typhoeus options above — most apply to Watir as well, with
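Since the Watir browser is torn down when the crawl ends, anything you need must be captured inside the block, as the new README text advises. Here is a hedged variation on the snippet above that collects cookies as well as HTML; `browser.cookies.to_a` is standard Watir, and the rest follows the README example:

```ruby
require 'arachnid2'

cookies = []
opts = { time_box: 60, browser_type: :chrome }

# In Watir mode the block receives the live browser; grab whatever you
# need now, because the browser is closed once the crawl finishes.
Arachnid2.new("http://maximumfun.org").crawl(opts, true) do |browser|
  browser.body.wait_until(&:present?)
  cookies.concat(browser.cookies.to_a) # array of cookie hashes
end
```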
data/lib/arachnid2.rb CHANGED

data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} CHANGED

@@ -1,6 +1,6 @@
 require 'net/http'
 require 'json'
-module CachedArachnidResponses
+module CachedResponses
   CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
   def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
       body = ::JSON.parse(response.body)
       responses_list = Base64.decode64(body['encrypted_response'])
-      return Marshal.load responses_list # here we get
+      return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
     end
   rescue StandardError
     nil
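The comment completed above implies a round trip worth spelling out: the cache service returns JSON whose `encrypted_response` field is a Base64-encoded `Marshal` dump of an Array of `Typhoeus::Response`s. A minimal sketch of that cycle, using plain strings as stand-ins for response objects:

```ruby
require 'base64'

# Stand-ins for the Array of Typhoeus::Response objects the gem caches.
responses = ["<html>one</html>", "<html>two</html>"]

# What the cache service stores: a Marshal dump, Base64-encoded.
payload = Base64.encode64(Marshal.dump(responses))

# What load_data does on the way back: decode, then Marshal.load.
# Marshal.load is unsafe on untrusted input, so it should only ever
# see payloads from a trusted cache service.
restored = Marshal.load(Base64.decode64(payload))

restored == responses # => true
```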
data/lib/arachnid2/typhoeus.rb CHANGED

@@ -1,6 +1,6 @@
 class Arachnid2
   class Typhoeus
-    include CachedArachnidResponses
+    include CachedResponses
     include Arachnid2::Exoskeleton
 
     def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
         max_concurrency.times do
           q = @global_queue.shift
 
-          break if
-            Time.now > crawl_options[:time_limit] || \
-            memory_danger?
-
+          break if time_to_stop?
           @global_visited.insert(q)
 
-
-
-          data = load_data(@url, opts)
-          data.each { |response| yield response } and return unless data.nil?
-
-          request.on_complete do |response|
-            @cached_data.push(response)
-            links = process(response.effective_url, response.body)
-            next unless links
-
-            yield response
-
-            vacuum(links, response.effective_url)
-          end
+          found_in_cache = use_cache(q, opts, &Proc.new)
+          return if found_in_cache
 
-
+          request = ::Typhoeus::Request.new(q, request_options)
+          requestable = after_request(request, &Proc.new)
+          @hydra.queue(request) if requestable
         end # max_concurrency.times do
 
         @hydra.run
-
       end # until @global_queue.empty?
-      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
     ensure
       @cookie_file.close! if @cookie_file
     end # def crawl(opts = {})
 
     private
+    def after_request(request)
+      request.on_complete do |response|
+        cacheable = use_response(response, &Proc.new)
+        return unless cacheable
+
+        put_cached_data(response.effective_url, @options, response)
+      end
+
+      true
+    end
+
+    def use_response(response)
+      links = process(response.effective_url, response.body)
+      return unless links
+
+      yield response
+
+      vacuum(links, response.effective_url)
+      true
+    end
+
+    def use_cache(url, options)
+      data = load_data(url, options)
+      use_response(data, &Proc.new) if data
+
+      data
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
     def typhoeus_preflight
       @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
       typhoeus_proxy_options
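A note on the `&Proc.new` calls threaded through this refactor: inside a method that was itself called with a block, a bare `Proc.new` captures that block, letting `crawl` forward its caller's block into `use_cache`, `after_request`, and `use_response` without naming it. A self-contained sketch of the idiom; note that block-capturing `Proc.new` was deprecated in Ruby 2.7 and removed in 3.0, where an explicit `&block` parameter is required instead:

```ruby
# crawl forwards the block it was called with down to a helper,
# exactly as the refactored methods above do (works on Ruby <= 2.7).
def crawl
  %w[a b c].each { |page| handle(page, &Proc.new) }
end

# The forwarded block arrives as this method's own block.
def handle(page)
  yield "response for #{page}"
end

crawl { |r| puts r }
# response for a
# response for b
# response for c
```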
data/lib/arachnid2/version.rb CHANGED
data/lib/arachnid2/watir.rb CHANGED

@@ -19,44 +19,71 @@ class Arachnid2
       q = @global_queue.shift
       links = nil
 
-      break if
-      break if Time.now > crawl_options[:time_limit]
-      break if memory_danger?
+      break if time_to_stop?
 
       @global_visited.insert(q)
 
+      make_request(q, &Proc.new)
+    end # until @global_queue.empty?
+  ensure
+    @browser.close if @browser rescue nil
+    @headless.destroy if @headless rescue nil
+  end
+
+  private
+  def make_request(q)
     begin
-
-
-    rescue Selenium::WebDriver::Error::UnknownError => e
-      # Firefox and Selenium, in their infinite wisdom
-      # raise an error when a page cannot be loaded.
-      # At the time of writing this, the page at
-      # thewirecutter.com/cars/accessories-auto
-      # causes such an issue (too many redirects).
-      # This error handling moves us on from those pages.
-      raise e unless e.message =~ /.*Reached error page.*/i
-      next
-    end
-    links = process(browser.url, browser.body.html) if browser.body.exists?
-    next unless links
-
-    yield browser
+      links = browse_links(q, &Proc.new)
+      return unless links
 
       vacuum(links, browser.url)
     rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+      msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+            "is ignoring an error: " \
+            "#{e.class} - #{e.message}"
+      puts msg
    rescue => e
      raise e if raise_before_retry?(e.class)
+      msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+            "is retrying once after an error: " \
+            "#{e.class} - #{e.message}"
+      puts msg
+      e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
       reset_for_retry
     end
+  end
 
-
-
-
-
-
+  def browse_links(url)
+    return unless navigate(url)
+
+    yield browser
+
+    process(browser.url, browser.body.html) if browser.body.exists?
+  end
+
+  def navigate(url)
+    begin
+      browser.goto url
+    rescue Selenium::WebDriver::Error::UnknownError => e
+      # Firefox and Selenium, in their infinite wisdom
+      # raise an error when a page cannot be loaded.
+      # At the time of writing this, the page at
+      # thewirecutter.com/cars/accessories-auto
+      # causes such an issue (too many redirects).
+      # This error handling moves us on from those pages.
+      raise e unless e.message =~ /.*Reached error page.*/i
+      return
+    end
+
+    true
+  end
+
+  def time_to_stop?
+    @global_visited.size >= crawl_options[:max_urls] || \
+      Time.now > crawl_options[:time_limit] || \
+      memory_danger?
+  end
 
-  private
   def raise_before_retry?(klass)
     @already_retried || \
       "#{klass}".include?("Selenium") || \
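Both new `time_to_stop?` implementations cut the crawl off once `crawl_options[:max_urls]` is reached, alongside the existing time and memory checks. A sketch of exercising them: `time_box` is the documented option behind `crawl_options[:time_limit]`, while the `:max_urls` option key is an assumption by analogy, since this diff only shows the `crawl_options` read:

```ruby
require 'arachnid2'

opts = {
  time_box: 30,  # documented: seconds before crawl_options[:time_limit] trips
  max_urls: 50   # assumed key feeding the new crawl_options[:max_urls] check
}

# Typhoeus mode yields Typhoeus::Response objects, per the diff above.
Arachnid2.new("http://maximumfun.org").crawl(opts) do |response|
  puts response.effective_url
end
```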
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -184,7 +184,7 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/
+- lib/arachnid2/cached_responses.rb
 - lib/arachnid2/exoskeleton.rb
 - lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A simple, fast web crawler