arachnid2 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f21f9cc6f752a43952d694abe4419d8c5d2ba4f3e5154543dab9d39e415753f
4
- data.tar.gz: c29401a47c0291cb5087e237b72431167d3d5f59210a49fb9d73a8ef399fbb36
3
+ metadata.gz: 1df2624ea1b73392b92e3fcdc229e0816f2aab2dcac07e160749b29fdd0149e6
4
+ data.tar.gz: fc26566bd947099e80b028e0523cb7bbe469a4304ceea4d69d262dd33ad4e410
5
5
  SHA512:
6
- metadata.gz: f6114133ad6c4b26bebfa762c676f5c4b544c55e3fb0802c3f5571b7d7ae2f4235218871f75e2bf284bdd1ebb5b64e992d83e291fd6c900038fc68bb6afe54a3
7
- data.tar.gz: b543fb52387b4188c8738f8dbc44f0be5aec764ead8a53c2babf1e4d51c8d778c45ead9389478942afe6d16f3629fe7a81d6f4d965217079528ec7f968e2f89f
6
+ metadata.gz: 2af5a70e62c2194566c855edaebe805201d3cca0d612e32cbd15dde06cd49869538fcbb4af2ad14b5ed0c9f45b968ce39085b36b0faf74eac115f0a7ba5f92cc
7
+ data.tar.gz: bc250f87e6376fa05e1756490a5a8a7693f3cdd88df1c536e227a9a26b57a734f299a8c6cbcfa7f2a6ad16721c7c6ef1f4022ce7822b4cd5ca17f883b57a1fdd
@@ -1,11 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- arachnid2 (0.3.5)
4
+ arachnid2 (0.3.7)
5
5
  addressable
6
6
  adomain
7
7
  bloomfilter-rb
8
- nokogiri (>= 1.8.5)
8
+ nokogiri (>= 1.10.4)
9
9
  typhoeus
10
10
  watir
11
11
  webdriver-user-agent (>= 7.6)
@@ -14,30 +14,31 @@ PATH
14
14
  GEM
15
15
  remote: https://rubygems.org/
16
16
  specs:
17
- addressable (2.6.0)
18
- public_suffix (>= 2.0.2, < 4.0)
19
- adomain (0.1.1)
17
+ addressable (2.7.0)
18
+ public_suffix (>= 2.0.2, < 5.0)
19
+ adomain (0.2.3)
20
20
  addressable (~> 2.5)
21
+ logger
21
22
  bloomfilter-rb (2.1.1)
22
23
  redis
23
- childprocess (0.9.0)
24
- ffi (~> 1.0, >= 1.0.11)
24
+ childprocess (1.0.1)
25
+ rake (< 13.0)
25
26
  diff-lcs (1.3)
26
27
  ethon (0.12.0)
27
28
  ffi (>= 1.3.0)
28
29
  facets (3.1.0)
29
- ffi (1.10.0)
30
+ ffi (1.11.1)
30
31
  json (2.2.0)
32
+ logger (1.4.1)
31
33
  mini_portile2 (2.4.0)
32
- net_http_ssl_fix (0.0.10)
33
- nokogiri (1.10.4)
34
+ nokogiri (1.10.7)
34
35
  mini_portile2 (~> 2.4.0)
35
- os (1.0.0)
36
+ os (1.0.1)
36
37
  psych (3.1.0)
37
- public_suffix (3.0.3)
38
+ public_suffix (4.0.1)
38
39
  rake (10.5.0)
39
- redis (4.1.0)
40
- regexp_parser (1.3.0)
40
+ redis (4.1.2)
41
+ regexp_parser (1.6.0)
41
42
  rspec (3.8.0)
42
43
  rspec-core (~> 3.8.0)
43
44
  rspec-expectations (~> 3.8.0)
@@ -51,9 +52,9 @@ GEM
51
52
  diff-lcs (>= 1.2.0, < 2.0)
52
53
  rspec-support (~> 3.8.0)
53
54
  rspec-support (3.8.0)
54
- rubyzip (1.2.2)
55
- selenium-webdriver (3.141.0)
56
- childprocess (~> 0.5)
55
+ rubyzip (1.2.3)
56
+ selenium-webdriver (3.142.3)
57
+ childprocess (>= 0.5, < 2.0)
57
58
  rubyzip (~> 1.2, >= 1.2.2)
58
59
  typhoeus (1.3.1)
59
60
  ethon (>= 0.9.0)
@@ -66,11 +67,10 @@ GEM
66
67
  os
67
68
  psych
68
69
  selenium-webdriver (>= 3.4.0)
69
- webdrivers (3.6.0)
70
- net_http_ssl_fix
70
+ webdrivers (4.1.0)
71
71
  nokogiri (~> 1.6)
72
72
  rubyzip (~> 1.0)
73
- selenium-webdriver (~> 3.0)
73
+ selenium-webdriver (>= 3.0, < 4.0)
74
74
 
75
75
  PLATFORMS
76
76
  ruby
@@ -82,4 +82,4 @@ DEPENDENCIES
82
82
  rspec (~> 3.0)
83
83
 
84
84
  BUNDLED WITH
85
- 1.16.6
85
+ 1.17.3
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.3.6"
2
+ VERSION = "0.3.7"
3
3
  end
@@ -17,6 +17,7 @@ class Arachnid2
17
17
  until @global_queue.empty?
18
18
  @already_retried = false
19
19
  q = @global_queue.shift
20
+ links = nil
20
21
 
21
22
  break if @global_visited.size >= crawl_options[:max_urls]
22
23
  break if Time.now > crawl_options[:time_limit]
@@ -37,19 +38,18 @@ class Arachnid2
37
38
  raise e unless e.message =~ /.*Reached error page.*/i
38
39
  next
39
40
  end
40
- links = process(browser.url, browser.body.html)
41
+ links = process(browser.url, browser.body.html) if browser.body.exists?
41
42
  next unless links
42
43
 
43
44
  yield browser
44
45
 
45
46
  vacuum(links, browser.url)
46
47
  rescue => e
47
- raise e if @already_retried
48
- raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
49
- @browser.close if @browser rescue nil
50
- @headless.destroy if @headless rescue nil
51
- @browser = nil
52
- @already_retried = true
48
+ next if e.class == Net::ReadTimeout
49
+
50
+ raise e if raise_before_retry?(e.class)
51
+
52
+ reset_for_retry
53
53
  retry
54
54
  end
55
55
 
@@ -60,6 +60,19 @@ class Arachnid2
60
60
  end
61
61
 
62
62
  private
63
+ def raise_before_retry?(klass)
64
+ @already_retried || \
65
+ "#{klass}".include?("Selenium") || \
66
+ "#{klass}".include?("Watir")
67
+ end
68
+
69
+ def reset_for_retry
70
+ @browser.close if @browser rescue nil
71
+ @headless.destroy if @headless rescue nil
72
+ @browser = nil
73
+ @already_retried = true
74
+ end
75
+
63
76
  def browser
64
77
  unless @browser
65
78
  behead if @make_headless
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-05 00:00:00.000000000 Z
11
+ date: 2020-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubyforge_project:
212
- rubygems_version: 2.7.10
211
+ rubygems_version: 3.0.6
213
212
  signing_key:
214
213
  specification_version: 4
215
214
  summary: A simple, fast web crawler