arachnid2 0.3.9 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: '0299b63b16c0e19acd87ff8e8b34302f015a8f36ea87855faedbbe3c38d0f080'
- data.tar.gz: 5da1c60fe38252b7699a5ecd2ead756fd01e2e13918ef25923c5b44ab039617c
+ metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+ data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
  SHA512:
- metadata.gz: 6525470d1ec273fb06421f0a092307527786697fa5bb698c8f2fb5f788ef9b6c220bb09204656cec807e7f8816c65684b061afa0808fbc49317abee19a9b8d4b
- data.tar.gz: 8f7bc29d3e5129da2e04c5bc32f44a0025244ba160581c2a073e9b61cc46baa3685f3481e979743556342bf4b2791767eb0c21b42558f62b2443645f592be677
+ metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+ data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
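For reference, the checksums above are digests of the two archives bundled inside the `.gem` file. A minimal Ruby sketch of reproducing the SHA256 values locally, assuming the 0.4.0 gem has already been fetched into the working directory (e.g. with `gem fetch arachnid2 -v 0.4.0`):

```ruby
require 'digest'
require 'rubygems/package'

# A .gem file is a tar archive; metadata.gz and data.tar.gz are the two
# entries whose digests appear in checksums.yaml.
File.open("arachnid2-0.4.0.gem", "rb") do |file|
  Gem::Package::TarReader.new(file).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end
```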
data/README.md CHANGED
@@ -186,6 +186,36 @@ with_watir = true
  Arachnid2.new(url).crawl(opts, with_watir)
  ```
 
+ Arachnid2 has base defaults which you might want to address when
+ employing Watir.
+
+ * First, the default crawl time is 15 seconds.
+   As browser page loads can take this long, you will probably want to
+   set a higher crawl time.
+ * Simply storing the browser is not a great idea, since it will
+   be inaccessible after it is closed. Instead, consider nabbing the
+   HTML, cookies, or whatever content is required during the crawl.
+ * Finally, note that Firefox is the default browser.
+
+ ```ruby
+ require 'arachnid2'
+
+ with_watir = true
+ responses = []
+ url = "http://maximumfun.org"
+ max = 60
+ browser = :chrome
+ opts = {time_box: max, browser_type: browser}
+
+ spider = Arachnid2.new(url)
+ spider.crawl(opts, with_watir) do |response|
+   response.body.wait_until(&:present?)
+   responses << response.body.html if response.body.present?
+ end
+ ```
+
  #### Options
 
  See the Typhoeus options above &mdash; most apply to Watir as well, with
lib/arachnid2.rb CHANGED
@@ -1,5 +1,5 @@
  require "arachnid2/version"
- require "arachnid2/cached_arachnid_responses"
+ require "arachnid2/cached_responses"
  require "arachnid2/exoskeleton"
  require "arachnid2/typhoeus"
  require "arachnid2/watir"
lib/arachnid2/cached_arachnid_responses.rb → lib/arachnid2/cached_responses.rb CHANGED
@@ -1,6 +1,6 @@
  require 'net/http'
  require 'json'
- module CachedArachnidResponses
+ module CachedResponses
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
  def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
  body = ::JSON.parse(response.body)
  responses_list = Base64.decode64(body['encrypted_response'])
- return Marshal.load responses_list # here we get array of Typhoeus::Response
+ return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
  end
  rescue StandardError
  nil
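The decoding in `load_data` implies the cache service's storage format: a JSON body whose `encrypted_response` field carries a Base64-encoded `Marshal` dump of the responses array. A minimal sketch of that round-trip, with hypothetical helper names and plain hashes standing in for `Typhoeus::Response` objects:

```ruby
require 'base64'
require 'json'

# Hypothetical helpers mirroring the payload shape load_data expects.
def encode_responses(responses)
  { 'encrypted_response' => Base64.encode64(Marshal.dump(responses)) }.to_json
end

def decode_responses(json_body)
  body = JSON.parse(json_body)
  Marshal.load(Base64.decode64(body['encrypted_response']))
end

payload = encode_responses([{ effective_url: "http://example.com", body: "<html/>" }])
decode_responses(payload)
# => [{:effective_url=>"http://example.com", :body=>"<html/>"}]
```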
lib/arachnid2/exoskeleton.rb CHANGED
@@ -10,7 +10,7 @@ class Arachnid2
  end
 
  def process(url, html)
- return false unless Adomain["#{url}"].include? @domain
+ return false unless Adomain["#{url}"]&.include? @domain
 
  extract_hrefs(html)
  end
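The added safe navigation (`&.`) matters because `Adomain[...]` can return `nil` when no domain can be extracted from the string, which previously raised `NoMethodError` inside `process`. A minimal sketch of the difference, assuming a `nil` return for the second input:

```ruby
require 'adomain'

domain = "maximumfun.org"

# The old call chained .include? directly, so a nil domain raised
# NoMethodError; with &. the expression simply evaluates to nil and
# process returns false for that URL.
Adomain["https://www.maximumfun.org/shows"]&.include?(domain) # => true
Adomain["mailto:contact@example.com"]&.include?(domain)       # => nil (assumed)
```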
lib/arachnid2/typhoeus.rb CHANGED
@@ -1,6 +1,6 @@
  class Arachnid2
  class Typhoeus
- include CachedArachnidResponses
+ include CachedResponses
  include Arachnid2::Exoskeleton
 
  def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
  max_concurrency.times do
  q = @global_queue.shift
 
- break if @global_visited.size >= crawl_options[:max_urls] || \
- Time.now > crawl_options[:time_limit] || \
- memory_danger?
-
+ break if time_to_stop?
  @global_visited.insert(q)
 
- request = ::Typhoeus::Request.new(q, request_options)
-
- data = load_data(@url, opts)
- data.each { |response| yield response } and return unless data.nil?
-
- request.on_complete do |response|
- @cached_data.push(response)
- links = process(response.effective_url, response.body)
- next unless links
-
- yield response
-
- vacuum(links, response.effective_url)
- end
+ found_in_cache = use_cache(q, opts, &Proc.new)
+ return if found_in_cache
 
- @hydra.queue(request)
+ request = ::Typhoeus::Request.new(q, request_options)
+ requestable = after_request(request, &Proc.new)
+ @hydra.queue(request) if requestable
  end # max_concurrency.times do
 
  @hydra.run
-
  end # until @global_queue.empty?
- put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
  ensure
  @cookie_file.close! if @cookie_file
  end # def crawl(opts = {})
 
  private
+ def after_request(request)
+ request.on_complete do |response|
+ cacheable = use_response(response, &Proc.new)
+ return unless cacheable
+
+ put_cached_data(response.effective_url, @options, response)
+ end
+
+ true
+ end
+
+ def use_response(response)
+ links = process(response.effective_url, response.body)
+ return unless links
+
+ yield response
+
+ vacuum(links, response.effective_url)
+ true
+ end
+
+ def use_cache(url, options)
+ data = load_data(url, options)
+ use_response(data, &Proc.new) if data
+
+ data
+ end
+
+ def time_to_stop?
+ @global_visited.size >= crawl_options[:max_urls] || \
+ Time.now > crawl_options[:time_limit] || \
+ memory_danger?
+ end
+
  def typhoeus_preflight
  @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
  typhoeus_proxy_options
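The refactored crawl wires the response cache in through `use_cache` and `put_cached_data`, with the service address read from `ENV['ARACHNID_CACHED_SERVICE_ADDRESS']` when the gem is loaded. A minimal usage sketch, assuming a locally running cache service (the address is a placeholder) and the `time_box`/`max_urls` options described in the README:

```ruby
# The cache service address is read at load time, so set it before the
# require. The URL below is a placeholder.
ENV['ARACHNID_CACHED_SERVICE_ADDRESS'] = "http://localhost:9292"

require 'arachnid2'

responses = []
spider = Arachnid2.new("http://maximumfun.org")

# Default (Typhoeus) crawl; cached responses, when available, are yielded
# to the same block as freshly fetched ones.
spider.crawl(time_box: 30, max_urls: 50) do |response|
  responses << response.body
end
```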
lib/arachnid2/version.rb CHANGED
@@ -1,3 +1,3 @@
  class Arachnid2
- VERSION = "0.3.9"
+ VERSION = "0.4.0"
  end
lib/arachnid2/watir.rb CHANGED
@@ -19,44 +19,71 @@ class Arachnid2
  q = @global_queue.shift
  links = nil
 
- break if @global_visited.size >= crawl_options[:max_urls]
- break if Time.now > crawl_options[:time_limit]
- break if memory_danger?
+ break if time_to_stop?
 
  @global_visited.insert(q)
 
+ make_request(q, &Proc.new)
+ end # until @global_queue.empty?
+ ensure
+ @browser.close if @browser rescue nil
+ @headless.destroy if @headless rescue nil
+ end
+
+ private
+ def make_request(q)
  begin
- begin
- browser.goto q
- rescue Selenium::WebDriver::Error::UnknownError => e
- # Firefox and Selenium, in their infinite wisdom
- # raise an error when a page cannot be loaded.
- # At the time of writing this, the page at
- # thewirecutter.com/cars/accessories-auto
- # causes such an issue (too many redirects).
- # This error handling moves us on from those pages.
- raise e unless e.message =~ /.*Reached error page.*/i
- next
- end
- links = process(browser.url, browser.body.html) if browser.body.exists?
- next unless links
-
- yield browser
+ links = browse_links(q, &Proc.new)
+ return unless links
 
  vacuum(links, browser.url)
  rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+ "is ignoring an error: " \
+ "#{e.class} - #{e.message}"
+ puts msg
  rescue => e
  raise e if raise_before_retry?(e.class)
+ msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+ "is retrying once after an error: " \
+ "#{e.class} - #{e.message}"
+ puts msg
+ e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
  reset_for_retry
  end
+ end
 
- end # until @global_queue.empty?
- ensure
- @browser.close if @browser rescue nil
- @headless.destroy if @headless rescue nil
- end
+ def browse_links(url)
+ return unless navigate(url)
+
+ yield browser
+
+ process(browser.url, browser.body.html) if browser.body.exists?
+ end
+
+ def navigate(url)
+ begin
+ browser.goto url
+ rescue Selenium::WebDriver::Error::UnknownError => e
+ # Firefox and Selenium, in their infinite wisdom
+ # raise an error when a page cannot be loaded.
+ # At the time of writing this, the page at
+ # thewirecutter.com/cars/accessories-auto
+ # causes such an issue (too many redirects).
+ # This error handling moves us on from those pages.
+ raise e unless e.message =~ /.*Reached error page.*/i
+ return
+ end
+
+ true
+ end
+
+ def time_to_stop?
+ @global_visited.size >= crawl_options[:max_urls] || \
+ Time.now > crawl_options[:time_limit] || \
+ memory_danger?
+ end
 
- private
  def raise_before_retry?(klass)
  @already_retried || \
  "#{klass}".include?("Selenium") || \
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: arachnid2
  version: !ruby/object:Gem::Version
- version: 0.3.9
+ version: 0.4.0
  platform: ruby
  authors:
  - Sam Nissen
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-03-05 00:00:00.000000000 Z
+ date: 2020-07-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -184,7 +184,7 @@ files:
  - bin/console
  - bin/setup
  - lib/arachnid2.rb
- - lib/arachnid2/cached_arachnid_responses.rb
+ - lib/arachnid2/cached_responses.rb
  - lib/arachnid2/exoskeleton.rb
  - lib/arachnid2/typhoeus.rb
  - lib/arachnid2/version.rb
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.0.6
+ rubygems_version: 3.1.2
  signing_key:
  specification_version: 4
  summary: A simple, fast web crawler