arachnid2 0.3.6 → 0.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +21 -21
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +20 -7
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1df2624ea1b73392b92e3fcdc229e0816f2aab2dcac07e160749b29fdd0149e6
|
4
|
+
data.tar.gz: fc26566bd947099e80b028e0523cb7bbe469a4304ceea4d69d262dd33ad4e410
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af5a70e62c2194566c855edaebe805201d3cca0d612e32cbd15dde06cd49869538fcbb4af2ad14b5ed0c9f45b968ce39085b36b0faf74eac115f0a7ba5f92cc
|
7
|
+
data.tar.gz: bc250f87e6376fa05e1756490a5a8a7693f3cdd88df1c536e227a9a26b57a734f299a8c6cbcfa7f2a6ad16721c7c6ef1f4022ce7822b4cd5ca17f883b57a1fdd
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
arachnid2 (0.3.
|
4
|
+
arachnid2 (0.3.7)
|
5
5
|
addressable
|
6
6
|
adomain
|
7
7
|
bloomfilter-rb
|
8
|
-
nokogiri (>= 1.
|
8
|
+
nokogiri (>= 1.10.4)
|
9
9
|
typhoeus
|
10
10
|
watir
|
11
11
|
webdriver-user-agent (>= 7.6)
|
@@ -14,30 +14,31 @@ PATH
|
|
14
14
|
GEM
|
15
15
|
remote: https://rubygems.org/
|
16
16
|
specs:
|
17
|
-
addressable (2.
|
18
|
-
public_suffix (>= 2.0.2, <
|
19
|
-
adomain (0.
|
17
|
+
addressable (2.7.0)
|
18
|
+
public_suffix (>= 2.0.2, < 5.0)
|
19
|
+
adomain (0.2.3)
|
20
20
|
addressable (~> 2.5)
|
21
|
+
logger
|
21
22
|
bloomfilter-rb (2.1.1)
|
22
23
|
redis
|
23
|
-
childprocess (0.
|
24
|
-
|
24
|
+
childprocess (1.0.1)
|
25
|
+
rake (< 13.0)
|
25
26
|
diff-lcs (1.3)
|
26
27
|
ethon (0.12.0)
|
27
28
|
ffi (>= 1.3.0)
|
28
29
|
facets (3.1.0)
|
29
|
-
ffi (1.
|
30
|
+
ffi (1.11.1)
|
30
31
|
json (2.2.0)
|
32
|
+
logger (1.4.1)
|
31
33
|
mini_portile2 (2.4.0)
|
32
|
-
|
33
|
-
nokogiri (1.10.4)
|
34
|
+
nokogiri (1.10.7)
|
34
35
|
mini_portile2 (~> 2.4.0)
|
35
|
-
os (1.0.
|
36
|
+
os (1.0.1)
|
36
37
|
psych (3.1.0)
|
37
|
-
public_suffix (
|
38
|
+
public_suffix (4.0.1)
|
38
39
|
rake (10.5.0)
|
39
|
-
redis (4.1.
|
40
|
-
regexp_parser (1.
|
40
|
+
redis (4.1.2)
|
41
|
+
regexp_parser (1.6.0)
|
41
42
|
rspec (3.8.0)
|
42
43
|
rspec-core (~> 3.8.0)
|
43
44
|
rspec-expectations (~> 3.8.0)
|
@@ -51,9 +52,9 @@ GEM
|
|
51
52
|
diff-lcs (>= 1.2.0, < 2.0)
|
52
53
|
rspec-support (~> 3.8.0)
|
53
54
|
rspec-support (3.8.0)
|
54
|
-
rubyzip (1.2.
|
55
|
-
selenium-webdriver (3.
|
56
|
-
childprocess (
|
55
|
+
rubyzip (1.2.3)
|
56
|
+
selenium-webdriver (3.142.3)
|
57
|
+
childprocess (>= 0.5, < 2.0)
|
57
58
|
rubyzip (~> 1.2, >= 1.2.2)
|
58
59
|
typhoeus (1.3.1)
|
59
60
|
ethon (>= 0.9.0)
|
@@ -66,11 +67,10 @@ GEM
|
|
66
67
|
os
|
67
68
|
psych
|
68
69
|
selenium-webdriver (>= 3.4.0)
|
69
|
-
webdrivers (
|
70
|
-
net_http_ssl_fix
|
70
|
+
webdrivers (4.1.0)
|
71
71
|
nokogiri (~> 1.6)
|
72
72
|
rubyzip (~> 1.0)
|
73
|
-
selenium-webdriver (
|
73
|
+
selenium-webdriver (>= 3.0, < 4.0)
|
74
74
|
|
75
75
|
PLATFORMS
|
76
76
|
ruby
|
@@ -82,4 +82,4 @@ DEPENDENCIES
|
|
82
82
|
rspec (~> 3.0)
|
83
83
|
|
84
84
|
BUNDLED WITH
|
85
|
-
1.
|
85
|
+
1.17.3
|
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2/watir.rb
CHANGED
@@ -17,6 +17,7 @@ class Arachnid2
|
|
17
17
|
until @global_queue.empty?
|
18
18
|
@already_retried = false
|
19
19
|
q = @global_queue.shift
|
20
|
+
links = nil
|
20
21
|
|
21
22
|
break if @global_visited.size >= crawl_options[:max_urls]
|
22
23
|
break if Time.now > crawl_options[:time_limit]
|
@@ -37,19 +38,18 @@ class Arachnid2
|
|
37
38
|
raise e unless e.message =~ /.*Reached error page.*/i
|
38
39
|
next
|
39
40
|
end
|
40
|
-
links = process(browser.url, browser.body.html)
|
41
|
+
links = process(browser.url, browser.body.html) if browser.body.exists?
|
41
42
|
next unless links
|
42
43
|
|
43
44
|
yield browser
|
44
45
|
|
45
46
|
vacuum(links, browser.url)
|
46
47
|
rescue => e
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
@already_retried = true
|
48
|
+
next if e.class == Net::ReadTimeout
|
49
|
+
|
50
|
+
raise e if raise_before_retry?(e.class)
|
51
|
+
|
52
|
+
reset_for_retry
|
53
53
|
retry
|
54
54
|
end
|
55
55
|
|
@@ -60,6 +60,19 @@ class Arachnid2
|
|
60
60
|
end
|
61
61
|
|
62
62
|
private
|
63
|
+
def raise_before_retry?(klass)
|
64
|
+
@already_retried || \
|
65
|
+
"#{klass}".include?("Selenium") || \
|
66
|
+
"#{klass}".include?("Watir")
|
67
|
+
end
|
68
|
+
|
69
|
+
def reset_for_retry
|
70
|
+
@browser.close if @browser rescue nil
|
71
|
+
@headless.destroy if @headless rescue nil
|
72
|
+
@browser = nil
|
73
|
+
@already_retried = true
|
74
|
+
end
|
75
|
+
|
63
76
|
def browser
|
64
77
|
unless @browser
|
65
78
|
behead if @make_headless
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6
|
4
|
+
version: 0.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0'
|
210
210
|
requirements: []
|
211
|
-
|
212
|
-
rubygems_version: 2.7.10
|
211
|
+
rubygems_version: 3.0.6
|
213
212
|
signing_key:
|
214
213
|
specification_version: 4
|
215
214
|
summary: A simple, fast web crawler
|