arachnid2 0.3.6 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +21 -21
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +20 -7
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1df2624ea1b73392b92e3fcdc229e0816f2aab2dcac07e160749b29fdd0149e6
|
4
|
+
data.tar.gz: fc26566bd947099e80b028e0523cb7bbe469a4304ceea4d69d262dd33ad4e410
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af5a70e62c2194566c855edaebe805201d3cca0d612e32cbd15dde06cd49869538fcbb4af2ad14b5ed0c9f45b968ce39085b36b0faf74eac115f0a7ba5f92cc
|
7
|
+
data.tar.gz: bc250f87e6376fa05e1756490a5a8a7693f3cdd88df1c536e227a9a26b57a734f299a8c6cbcfa7f2a6ad16721c7c6ef1f4022ce7822b4cd5ca17f883b57a1fdd
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
arachnid2 (0.3.6)
|
4
|
+
arachnid2 (0.3.7)
|
5
5
|
addressable
|
6
6
|
adomain
|
7
7
|
bloomfilter-rb
|
8
|
-
nokogiri (>= 1.
|
8
|
+
nokogiri (>= 1.10.4)
|
9
9
|
typhoeus
|
10
10
|
watir
|
11
11
|
webdriver-user-agent (>= 7.6)
|
@@ -14,30 +14,31 @@ PATH
|
|
14
14
|
GEM
|
15
15
|
remote: https://rubygems.org/
|
16
16
|
specs:
|
17
|
-
addressable (2.
|
18
|
-
public_suffix (>= 2.0.2, <
|
19
|
-
adomain (0.
|
17
|
+
addressable (2.7.0)
|
18
|
+
public_suffix (>= 2.0.2, < 5.0)
|
19
|
+
adomain (0.2.3)
|
20
20
|
addressable (~> 2.5)
|
21
|
+
logger
|
21
22
|
bloomfilter-rb (2.1.1)
|
22
23
|
redis
|
23
|
-
childprocess (0.
|
24
|
-
|
24
|
+
childprocess (1.0.1)
|
25
|
+
rake (< 13.0)
|
25
26
|
diff-lcs (1.3)
|
26
27
|
ethon (0.12.0)
|
27
28
|
ffi (>= 1.3.0)
|
28
29
|
facets (3.1.0)
|
29
|
-
ffi (1.
|
30
|
+
ffi (1.11.1)
|
30
31
|
json (2.2.0)
|
32
|
+
logger (1.4.1)
|
31
33
|
mini_portile2 (2.4.0)
|
32
|
-
|
33
|
-
nokogiri (1.10.4)
|
34
|
+
nokogiri (1.10.7)
|
34
35
|
mini_portile2 (~> 2.4.0)
|
35
|
-
os (1.0.
|
36
|
+
os (1.0.1)
|
36
37
|
psych (3.1.0)
|
37
|
-
public_suffix (
|
38
|
+
public_suffix (4.0.1)
|
38
39
|
rake (10.5.0)
|
39
|
-
redis (4.1.
|
40
|
-
regexp_parser (1.
|
40
|
+
redis (4.1.2)
|
41
|
+
regexp_parser (1.6.0)
|
41
42
|
rspec (3.8.0)
|
42
43
|
rspec-core (~> 3.8.0)
|
43
44
|
rspec-expectations (~> 3.8.0)
|
@@ -51,9 +52,9 @@ GEM
|
|
51
52
|
diff-lcs (>= 1.2.0, < 2.0)
|
52
53
|
rspec-support (~> 3.8.0)
|
53
54
|
rspec-support (3.8.0)
|
54
|
-
rubyzip (1.2.
|
55
|
-
selenium-webdriver (3.
|
56
|
-
childprocess (
|
55
|
+
rubyzip (1.2.3)
|
56
|
+
selenium-webdriver (3.142.3)
|
57
|
+
childprocess (>= 0.5, < 2.0)
|
57
58
|
rubyzip (~> 1.2, >= 1.2.2)
|
58
59
|
typhoeus (1.3.1)
|
59
60
|
ethon (>= 0.9.0)
|
@@ -66,11 +67,10 @@ GEM
|
|
66
67
|
os
|
67
68
|
psych
|
68
69
|
selenium-webdriver (>= 3.4.0)
|
69
|
-
webdrivers (
|
70
|
-
net_http_ssl_fix
|
70
|
+
webdrivers (4.1.0)
|
71
71
|
nokogiri (~> 1.6)
|
72
72
|
rubyzip (~> 1.0)
|
73
|
-
selenium-webdriver (
|
73
|
+
selenium-webdriver (>= 3.0, < 4.0)
|
74
74
|
|
75
75
|
PLATFORMS
|
76
76
|
ruby
|
@@ -82,4 +82,4 @@ DEPENDENCIES
|
|
82
82
|
rspec (~> 3.0)
|
83
83
|
|
84
84
|
BUNDLED WITH
|
85
|
-
1.
|
85
|
+
1.17.3
|
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2/watir.rb
CHANGED
@@ -17,6 +17,7 @@ class Arachnid2
|
|
17
17
|
until @global_queue.empty?
|
18
18
|
@already_retried = false
|
19
19
|
q = @global_queue.shift
|
20
|
+
links = nil
|
20
21
|
|
21
22
|
break if @global_visited.size >= crawl_options[:max_urls]
|
22
23
|
break if Time.now > crawl_options[:time_limit]
|
@@ -37,19 +38,18 @@ class Arachnid2
|
|
37
38
|
raise e unless e.message =~ /.*Reached error page.*/i
|
38
39
|
next
|
39
40
|
end
|
40
|
-
links = process(browser.url, browser.body.html)
|
41
|
+
links = process(browser.url, browser.body.html) if browser.body.exists?
|
41
42
|
next unless links
|
42
43
|
|
43
44
|
yield browser
|
44
45
|
|
45
46
|
vacuum(links, browser.url)
|
46
47
|
rescue => e
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
@already_retried = true
|
48
|
+
next if e.class == Net::ReadTimeout
|
49
|
+
|
50
|
+
raise e if raise_before_retry?(e.class)
|
51
|
+
|
52
|
+
reset_for_retry
|
53
53
|
retry
|
54
54
|
end
|
55
55
|
|
@@ -60,6 +60,19 @@ class Arachnid2
|
|
60
60
|
end
|
61
61
|
|
62
62
|
private
|
63
|
+
def raise_before_retry?(klass)
|
64
|
+
@already_retried || \
|
65
|
+
"#{klass}".include?("Selenium") || \
|
66
|
+
"#{klass}".include?("Watir")
|
67
|
+
end
|
68
|
+
|
69
|
+
def reset_for_retry
|
70
|
+
@browser.close if @browser rescue nil
|
71
|
+
@headless.destroy if @headless rescue nil
|
72
|
+
@browser = nil
|
73
|
+
@already_retried = true
|
74
|
+
end
|
75
|
+
|
63
76
|
def browser
|
64
77
|
unless @browser
|
65
78
|
behead if @make_headless
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6
|
4
|
+
version: 0.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0'
|
210
210
|
requirements: []
|
211
|
-
|
212
|
-
rubygems_version: 2.7.10
|
211
|
+
rubygems_version: 3.0.6
|
213
212
|
signing_key:
|
214
213
|
specification_version: 4
|
215
214
|
summary: A simple, fast web crawler
|