arachnid2 0.3.6 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f21f9cc6f752a43952d694abe4419d8c5d2ba4f3e5154543dab9d39e415753f
4
- data.tar.gz: c29401a47c0291cb5087e237b72431167d3d5f59210a49fb9d73a8ef399fbb36
3
+ metadata.gz: 1df2624ea1b73392b92e3fcdc229e0816f2aab2dcac07e160749b29fdd0149e6
4
+ data.tar.gz: fc26566bd947099e80b028e0523cb7bbe469a4304ceea4d69d262dd33ad4e410
5
5
  SHA512:
6
- metadata.gz: f6114133ad6c4b26bebfa762c676f5c4b544c55e3fb0802c3f5571b7d7ae2f4235218871f75e2bf284bdd1ebb5b64e992d83e291fd6c900038fc68bb6afe54a3
7
- data.tar.gz: b543fb52387b4188c8738f8dbc44f0be5aec764ead8a53c2babf1e4d51c8d778c45ead9389478942afe6d16f3629fe7a81d6f4d965217079528ec7f968e2f89f
6
+ metadata.gz: 2af5a70e62c2194566c855edaebe805201d3cca0d612e32cbd15dde06cd49869538fcbb4af2ad14b5ed0c9f45b968ce39085b36b0faf74eac115f0a7ba5f92cc
7
+ data.tar.gz: bc250f87e6376fa05e1756490a5a8a7693f3cdd88df1c536e227a9a26b57a734f299a8c6cbcfa7f2a6ad16721c7c6ef1f4022ce7822b4cd5ca17f883b57a1fdd
@@ -1,11 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- arachnid2 (0.3.5)
4
+ arachnid2 (0.3.7)
5
5
  addressable
6
6
  adomain
7
7
  bloomfilter-rb
8
- nokogiri (>= 1.8.5)
8
+ nokogiri (>= 1.10.4)
9
9
  typhoeus
10
10
  watir
11
11
  webdriver-user-agent (>= 7.6)
@@ -14,30 +14,31 @@ PATH
14
14
  GEM
15
15
  remote: https://rubygems.org/
16
16
  specs:
17
- addressable (2.6.0)
18
- public_suffix (>= 2.0.2, < 4.0)
19
- adomain (0.1.1)
17
+ addressable (2.7.0)
18
+ public_suffix (>= 2.0.2, < 5.0)
19
+ adomain (0.2.3)
20
20
  addressable (~> 2.5)
21
+ logger
21
22
  bloomfilter-rb (2.1.1)
22
23
  redis
23
- childprocess (0.9.0)
24
- ffi (~> 1.0, >= 1.0.11)
24
+ childprocess (1.0.1)
25
+ rake (< 13.0)
25
26
  diff-lcs (1.3)
26
27
  ethon (0.12.0)
27
28
  ffi (>= 1.3.0)
28
29
  facets (3.1.0)
29
- ffi (1.10.0)
30
+ ffi (1.11.1)
30
31
  json (2.2.0)
32
+ logger (1.4.1)
31
33
  mini_portile2 (2.4.0)
32
- net_http_ssl_fix (0.0.10)
33
- nokogiri (1.10.4)
34
+ nokogiri (1.10.7)
34
35
  mini_portile2 (~> 2.4.0)
35
- os (1.0.0)
36
+ os (1.0.1)
36
37
  psych (3.1.0)
37
- public_suffix (3.0.3)
38
+ public_suffix (4.0.1)
38
39
  rake (10.5.0)
39
- redis (4.1.0)
40
- regexp_parser (1.3.0)
40
+ redis (4.1.2)
41
+ regexp_parser (1.6.0)
41
42
  rspec (3.8.0)
42
43
  rspec-core (~> 3.8.0)
43
44
  rspec-expectations (~> 3.8.0)
@@ -51,9 +52,9 @@ GEM
51
52
  diff-lcs (>= 1.2.0, < 2.0)
52
53
  rspec-support (~> 3.8.0)
53
54
  rspec-support (3.8.0)
54
- rubyzip (1.2.2)
55
- selenium-webdriver (3.141.0)
56
- childprocess (~> 0.5)
55
+ rubyzip (1.2.3)
56
+ selenium-webdriver (3.142.3)
57
+ childprocess (>= 0.5, < 2.0)
57
58
  rubyzip (~> 1.2, >= 1.2.2)
58
59
  typhoeus (1.3.1)
59
60
  ethon (>= 0.9.0)
@@ -66,11 +67,10 @@ GEM
66
67
  os
67
68
  psych
68
69
  selenium-webdriver (>= 3.4.0)
69
- webdrivers (3.6.0)
70
- net_http_ssl_fix
70
+ webdrivers (4.1.0)
71
71
  nokogiri (~> 1.6)
72
72
  rubyzip (~> 1.0)
73
- selenium-webdriver (~> 3.0)
73
+ selenium-webdriver (>= 3.0, < 4.0)
74
74
 
75
75
  PLATFORMS
76
76
  ruby
@@ -82,4 +82,4 @@ DEPENDENCIES
82
82
  rspec (~> 3.0)
83
83
 
84
84
  BUNDLED WITH
85
- 1.16.6
85
+ 1.17.3
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.3.6"
2
+ VERSION = "0.3.7"
3
3
  end
@@ -17,6 +17,7 @@ class Arachnid2
17
17
  until @global_queue.empty?
18
18
  @already_retried = false
19
19
  q = @global_queue.shift
20
+ links = nil
20
21
 
21
22
  break if @global_visited.size >= crawl_options[:max_urls]
22
23
  break if Time.now > crawl_options[:time_limit]
@@ -37,19 +38,18 @@ class Arachnid2
37
38
  raise e unless e.message =~ /.*Reached error page.*/i
38
39
  next
39
40
  end
40
- links = process(browser.url, browser.body.html)
41
+ links = process(browser.url, browser.body.html) if browser.body.exists?
41
42
  next unless links
42
43
 
43
44
  yield browser
44
45
 
45
46
  vacuum(links, browser.url)
46
47
  rescue => e
47
- raise e if @already_retried
48
- raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
49
- @browser.close if @browser rescue nil
50
- @headless.destroy if @headless rescue nil
51
- @browser = nil
52
- @already_retried = true
48
+ next if e.class == Net::ReadTimeout
49
+
50
+ raise e if raise_before_retry?(e.class)
51
+
52
+ reset_for_retry
53
53
  retry
54
54
  end
55
55
 
@@ -60,6 +60,19 @@ class Arachnid2
60
60
  end
61
61
 
62
62
  private
63
+ def raise_before_retry?(klass)
64
+ @already_retried || \
65
+ "#{klass}".include?("Selenium") || \
66
+ "#{klass}".include?("Watir")
67
+ end
68
+
69
+ def reset_for_retry
70
+ @browser.close if @browser rescue nil
71
+ @headless.destroy if @headless rescue nil
72
+ @browser = nil
73
+ @already_retried = true
74
+ end
75
+
63
76
  def browser
64
77
  unless @browser
65
78
  behead if @make_headless
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-05 00:00:00.000000000 Z
11
+ date: 2020-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubyforge_project:
212
- rubygems_version: 2.7.10
211
+ rubygems_version: 3.0.6
213
212
  signing_key:
214
213
  specification_version: 4
215
214
  summary: A simple, fast web crawler