arachnid2 0.3.5 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e25353806a447177f129c56d4c57c38c70223849f2bbd858c932f3f4ec8a4ef
4
- data.tar.gz: d2725c9981671ee010692d82b97801ccc00a1f2b28663fb72b23bc08f6be890e
3
+ metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
4
+ data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
5
5
  SHA512:
6
- metadata.gz: 52a0b49101ca136ddee4c4ae8e976bd81cc9f3c559df3a94463bee7f42a2e4ce591330e2a587f5285bac98be52723ab518870ac8a8197413df8cd06267892858
7
- data.tar.gz: 2514be62a0ae76a2d594f14d5ad8b66a45696bafa455a6347bb04b07ae99e48f322936d0afb6bd9e025c67ac9ce52213519f398a8f5deec54e508d6c4f1b4d84
6
+ metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
7
+ data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
@@ -1,11 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- arachnid2 (0.3.5)
4
+ arachnid2 (0.3.9)
5
5
  addressable
6
6
  adomain
7
7
  bloomfilter-rb
8
- nokogiri (>= 1.8.5)
8
+ nokogiri (>= 1.10.4)
9
9
  typhoeus
10
10
  watir
11
11
  webdriver-user-agent (>= 7.6)
@@ -14,30 +14,30 @@ PATH
14
14
  GEM
15
15
  remote: https://rubygems.org/
16
16
  specs:
17
- addressable (2.6.0)
18
- public_suffix (>= 2.0.2, < 4.0)
19
- adomain (0.1.1)
17
+ addressable (2.7.0)
18
+ public_suffix (>= 2.0.2, < 5.0)
19
+ adomain (0.2.3)
20
20
  addressable (~> 2.5)
21
+ logger
21
22
  bloomfilter-rb (2.1.1)
22
23
  redis
23
- childprocess (0.9.0)
24
- ffi (~> 1.0, >= 1.0.11)
24
+ childprocess (3.0.0)
25
25
  diff-lcs (1.3)
26
26
  ethon (0.12.0)
27
27
  ffi (>= 1.3.0)
28
28
  facets (3.1.0)
29
- ffi (1.10.0)
30
- json (2.2.0)
29
+ ffi (1.12.2)
30
+ json (2.3.0)
31
+ logger (1.4.2)
31
32
  mini_portile2 (2.4.0)
32
- net_http_ssl_fix (0.0.10)
33
- nokogiri (1.10.1)
33
+ nokogiri (1.10.9)
34
34
  mini_portile2 (~> 2.4.0)
35
- os (1.0.0)
35
+ os (1.0.1)
36
36
  psych (3.1.0)
37
- public_suffix (3.0.3)
38
- rake (10.5.0)
39
- redis (4.1.0)
40
- regexp_parser (1.3.0)
37
+ public_suffix (4.0.3)
38
+ rake (13.0.1)
39
+ redis (4.1.3)
40
+ regexp_parser (1.7.0)
41
41
  rspec (3.8.0)
42
42
  rspec-core (~> 3.8.0)
43
43
  rspec-expectations (~> 3.8.0)
@@ -51,10 +51,10 @@ GEM
51
51
  diff-lcs (>= 1.2.0, < 2.0)
52
52
  rspec-support (~> 3.8.0)
53
53
  rspec-support (3.8.0)
54
- rubyzip (1.2.2)
55
- selenium-webdriver (3.141.0)
56
- childprocess (~> 0.5)
57
- rubyzip (~> 1.2, >= 1.2.2)
54
+ rubyzip (2.2.0)
55
+ selenium-webdriver (3.142.7)
56
+ childprocess (>= 0.5, < 4.0)
57
+ rubyzip (>= 1.2.2)
58
58
  typhoeus (1.3.1)
59
59
  ethon (>= 0.9.0)
60
60
  watir (6.16.5)
@@ -66,11 +66,10 @@ GEM
66
66
  os
67
67
  psych
68
68
  selenium-webdriver (>= 3.4.0)
69
- webdrivers (3.6.0)
70
- net_http_ssl_fix
69
+ webdrivers (4.2.0)
71
70
  nokogiri (~> 1.6)
72
- rubyzip (~> 1.0)
73
- selenium-webdriver (~> 3.0)
71
+ rubyzip (>= 1.3.0)
72
+ selenium-webdriver (>= 3.0, < 4.0)
74
73
 
75
74
  PLATFORMS
76
75
  ruby
@@ -78,8 +77,8 @@ PLATFORMS
78
77
  DEPENDENCIES
79
78
  arachnid2!
80
79
  bundler (~> 1.16)
81
- rake (~> 10.0)
80
+ rake (>= 12.3.3)
82
81
  rspec (~> 3.0)
83
82
 
84
83
  BUNDLED WITH
85
- 1.16.5
84
+ 1.17.3
data/README.md CHANGED
@@ -186,6 +186,36 @@ with_watir = true
186
186
  Arachnid2.new(url).crawl(opts, with_watir)
187
187
  ```
188
188
 
189
+ Arachnid2 has base defaults which you might want to address when
190
+ employing Watir.
191
+
192
+ * First, the default crawl time is 15 seconds.
193
+ As browser page loads can take this long, you will probably want to
194
+ set a higher crawl time.
195
+ * Simply storing the browser is not a great idea, since it will
196
+ be inaccessible after it is closed. Instead, consider nabbing the
197
+ HTML, cookies, or whatever content is required during the crawl.
198
+ * Finally, note that Firefox is the default browser.
199
+
200
+
201
+ ```ruby
202
+ require 'arachnid2'
203
+
204
+ with_watir = true
205
+ responses = []
206
+ url = "http://maximumfun.org"
207
+ max = 60
208
+ browser = :chrome
209
+ opts = {time_box: max, browser_type: browser}
210
+
211
+ spider = Arachnid2.new(url)
212
+ spider.crawl(opts, with_watir) do |response|
213
+ response.body.wait_until(&:present?)
214
+ responses << response.body.html if response.body.present?
215
+ end
216
+
217
+ ```
218
+
189
219
  #### Options
190
220
 
191
221
  See the Typhoeus options above &mdash; most apply to Watir as well, with
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
22
22
  spec.require_paths = ["lib"]
23
23
 
24
24
  spec.add_development_dependency "bundler", "~> 1.16"
25
- spec.add_development_dependency "rake", "~> 10.0"
25
+ spec.add_development_dependency "rake", ">= 12.3.3"
26
26
  spec.add_development_dependency "rspec", "~> 3.0"
27
27
 
28
28
  spec.add_dependency "webdriver-user-agent", ">= 7.6"
@@ -32,5 +32,5 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "bloomfilter-rb"
33
33
  spec.add_dependency "adomain"
34
34
  spec.add_dependency "addressable"
35
- spec.add_dependency "nokogiri", ">= 1.8.5"
35
+ spec.add_dependency "nokogiri", ">= 1.10.4"
36
36
  end
@@ -1,5 +1,5 @@
1
1
  require "arachnid2/version"
2
- require "arachnid2/cached_arachnid_responses"
2
+ require "arachnid2/cached_responses"
3
3
  require "arachnid2/exoskeleton"
4
4
  require "arachnid2/typhoeus"
5
5
  require "arachnid2/watir"
@@ -1,6 +1,6 @@
1
1
  require 'net/http'
2
2
  require 'json'
3
- module CachedArachnidResponses
3
+ module CachedResponses
4
4
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
5
5
 
6
6
  def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
15
15
 
16
16
  body = ::JSON.parse(response.body)
17
17
  responses_list = Base64.decode64(body['encrypted_response'])
18
- return Marshal.load responses_list # here we get array of Typhoeus::Response
18
+ return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
19
19
  end
20
20
  rescue StandardError
21
21
  nil
@@ -10,7 +10,7 @@ class Arachnid2
10
10
  end
11
11
 
12
12
  def process(url, html)
13
- return false unless Adomain["#{url}"].include? @domain
13
+ return false unless Adomain["#{url}"]&.include? @domain
14
14
 
15
15
  extract_hrefs(html)
16
16
  end
@@ -1,6 +1,6 @@
1
1
  class Arachnid2
2
2
  class Typhoeus
3
- include CachedArachnidResponses
3
+ include CachedResponses
4
4
  include Arachnid2::Exoskeleton
5
5
 
6
6
  def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
17
17
  max_concurrency.times do
18
18
  q = @global_queue.shift
19
19
 
20
- break if @global_visited.size >= crawl_options[:max_urls] || \
21
- Time.now > crawl_options[:time_limit] || \
22
- memory_danger?
23
-
20
+ break if time_to_stop?
24
21
  @global_visited.insert(q)
25
22
 
26
- request = ::Typhoeus::Request.new(q, request_options)
27
-
28
- data = load_data(@url, opts)
29
- data.each { |response| yield response } and return unless data.nil?
30
-
31
- request.on_complete do |response|
32
- @cached_data.push(response)
33
- links = process(response.effective_url, response.body)
34
- next unless links
35
-
36
- yield response
37
-
38
- vacuum(links, response.effective_url)
39
- end
23
+ found_in_cache = use_cache(q, opts, &Proc.new)
24
+ return if found_in_cache
40
25
 
41
- @hydra.queue(request)
26
+ request = ::Typhoeus::Request.new(q, request_options)
27
+ requestable = after_request(request, &Proc.new)
28
+ @hydra.queue(request) if requestable
42
29
  end # max_concurrency.times do
43
30
 
44
31
  @hydra.run
45
-
46
32
  end # until @global_queue.empty?
47
- put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
48
33
  ensure
49
34
  @cookie_file.close! if @cookie_file
50
35
  end # def crawl(opts = {})
51
36
 
52
37
  private
38
+ def after_request(request)
39
+ request.on_complete do |response|
40
+ cacheable = use_response(response, &Proc.new)
41
+ return unless cacheable
42
+
43
+ put_cached_data(response.effective_url, @options, response)
44
+ end
45
+
46
+ true
47
+ end
48
+
49
+ def use_response(response)
50
+ links = process(response.effective_url, response.body)
51
+ return unless links
52
+
53
+ yield response
54
+
55
+ vacuum(links, response.effective_url)
56
+ true
57
+ end
58
+
59
+ def use_cache(url, options)
60
+ data = load_data(url, options)
61
+ use_response(data, &Proc.new) if data
62
+
63
+ data
64
+ end
65
+
66
+ def time_to_stop?
67
+ @global_visited.size >= crawl_options[:max_urls] || \
68
+ Time.now > crawl_options[:time_limit] || \
69
+ memory_danger?
70
+ end
71
+
53
72
  def typhoeus_preflight
54
73
  @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
55
74
  typhoeus_proxy_options
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.3.5"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -13,46 +13,17 @@ class Arachnid2
13
13
  def crawl(opts)
14
14
  preflight(opts)
15
15
  watir_preflight
16
+ @already_retried = false
16
17
 
17
18
  until @global_queue.empty?
18
- @already_retried = false
19
19
  q = @global_queue.shift
20
+ links = nil
20
21
 
21
- break if @global_visited.size >= crawl_options[:max_urls]
22
- break if Time.now > crawl_options[:time_limit]
23
- break if memory_danger?
22
+ break if time_to_stop?
24
23
 
25
24
  @global_visited.insert(q)
26
25
 
27
- begin
28
- begin
29
- browser.goto q
30
- rescue Selenium::WebDriver::Error::UnknownError => e
31
- # Firefox and Selenium, in their infinite wisdom
32
- # raise an error when a page cannot be loaded.
33
- # At the time of writing this, the page at
34
- # thewirecutter.com/cars/accessories-auto
35
- # causes such an issue (too many redirects).
36
- # This error handling moves us on from those pages.
37
- raise e unless e.message =~ /.*Reached error page.*/i
38
- next
39
- end
40
- links = process(browser.url, browser.body.html)
41
- next unless links
42
-
43
- yield browser
44
-
45
- vacuum(links, browser.url)
46
- rescue => e
47
- raise e if @already_retried
48
- raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
49
- @browser.close if @browser rescue nil
50
- @headless.destroy if @headless rescue nil
51
- @browser = nil
52
- @already_retried = true
53
- retry
54
- end
55
-
26
+ make_request(q, &Proc.new)
56
27
  end # until @global_queue.empty?
57
28
  ensure
58
29
  @browser.close if @browser rescue nil
@@ -60,6 +31,74 @@ class Arachnid2
60
31
  end
61
32
 
62
33
  private
34
+ def make_request(q)
35
+ begin
36
+ links = browse_links(q, &Proc.new)
37
+ return unless links
38
+
39
+ vacuum(links, browser.url)
40
+ rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
41
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
42
+ "is ignoring an error: " \
43
+ "#{e.class} - #{e.message}"
44
+ puts msg
45
+ rescue => e
46
+ raise e if raise_before_retry?(e.class)
47
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
48
+ "is retrying once after an error: " \
49
+ "#{e.class} - #{e.message}"
50
+ puts msg
51
+ e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
52
+ reset_for_retry
53
+ end
54
+ end
55
+
56
+ def browse_links(url)
57
+ return unless navigate(url)
58
+
59
+ yield browser
60
+
61
+ process(browser.url, browser.body.html) if browser.body.exists?
62
+ end
63
+
64
+ def navigate(url)
65
+ begin
66
+ browser.goto url
67
+ rescue Selenium::WebDriver::Error::UnknownError => e
68
+ # Firefox and Selenium, in their infinite wisdom
69
+ # raise an error when a page cannot be loaded.
70
+ # At the time of writing this, the page at
71
+ # thewirecutter.com/cars/accessories-auto
72
+ # causes such an issue (too many redirects).
73
+ # This error handling moves us on from those pages.
74
+ raise e unless e.message =~ /.*Reached error page.*/i
75
+ return
76
+ end
77
+
78
+ true
79
+ end
80
+
81
+ def time_to_stop?
82
+ @global_visited.size >= crawl_options[:max_urls] || \
83
+ Time.now > crawl_options[:time_limit] || \
84
+ memory_danger?
85
+ end
86
+
87
+ def raise_before_retry?(klass)
88
+ @already_retried || \
89
+ "#{klass}".include?("Selenium") || \
90
+ "#{klass}".include?("Watir")
91
+ end
92
+
93
+ def reset_for_retry
94
+ @browser.close if @browser rescue nil
95
+ @headless.destroy if @headless rescue nil
96
+ @driver.quit if @headless rescue nil
97
+ @driver = nil
98
+ @browser = nil
99
+ @already_retried = true
100
+ end
101
+
63
102
  def browser
64
103
  unless @browser
65
104
  behead if @make_headless
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-18 00:00:00.000000000 Z
11
+ date: 2020-07-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -28,16 +28,16 @@ dependencies:
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -156,14 +156,14 @@ dependencies:
156
156
  requirements:
157
157
  - - ">="
158
158
  - !ruby/object:Gem::Version
159
- version: 1.8.5
159
+ version: 1.10.4
160
160
  type: :runtime
161
161
  prerelease: false
162
162
  version_requirements: !ruby/object:Gem::Requirement
163
163
  requirements:
164
164
  - - ">="
165
165
  - !ruby/object:Gem::Version
166
- version: 1.8.5
166
+ version: 1.10.4
167
167
  description:
168
168
  email:
169
169
  - scnissen@gmail.com
@@ -184,7 +184,7 @@ files:
184
184
  - bin/console
185
185
  - bin/setup
186
186
  - lib/arachnid2.rb
187
- - lib/arachnid2/cached_arachnid_responses.rb
187
+ - lib/arachnid2/cached_responses.rb
188
188
  - lib/arachnid2/exoskeleton.rb
189
189
  - lib/arachnid2/typhoeus.rb
190
190
  - lib/arachnid2/version.rb
@@ -208,8 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubyforge_project:
212
- rubygems_version: 2.7.7
211
+ rubygems_version: 3.1.2
213
212
  signing_key:
214
213
  specification_version: 4
215
214
  summary: A simple, fast web crawler