aranha 0.15.2 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cf75a29a21e24f2e85a425b8b9291c058a46eef895a8100992514dd03138b065
4
- data.tar.gz: d52cc8cf889e0d9174f13771e3c9ac15971a1f673dc8e4e15085aab3755e169b
3
+ metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
4
+ data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
5
5
  SHA512:
6
- metadata.gz: 4844ac22fc5d9b3a16e7b03d8da6e5e5fc82a95dd705a0bbbb11c029f206fdc6fb762d1fd2cd71a2c8f869084597ad3f8d65e88f79f95bf37dfcd02a67cc59c6
7
- data.tar.gz: c768ba132ef0b9afbf9630e62aeb4759ab5bdb09e8d6af2658c59b19360c2a9c11dd3c1abd49431111fa25273c895ede8eb2536d7f63ebd607a1965c9e9a0813
6
+ metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
7
+ data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ class AddressProcessor
7
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
8
+ CORE_EXCEPTIONS = [::SocketError].freeze
9
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
10
+ HTTPCLIENT_EXCEPTIONS = [
11
+ ::HTTPClient::BadResponseError,
12
+ ::HTTPClient::ConnectTimeoutError,
13
+ ::HTTPClient::ReceiveTimeoutError
14
+ ].freeze
15
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
16
+
17
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
18
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
19
+
20
+ class << self
21
+ def rescuable_error?(error)
22
+ return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
23
+
24
+ error.cause.present? ? network_error?(error.cause) : false
25
+ end
26
+ end
27
+
28
+ enable_simple_cache
29
+ common_constructor :address
30
+
31
+ def successful?
32
+ error.blank?
33
+ end
34
+
35
+ def rescuable_error?
36
+ self.class.rescuable_error?(error)
37
+ end
38
+
39
+ private
40
+
41
+ def error_uncached
42
+ address.process
43
+ nil
44
+ rescue ::StandardError => e
45
+ e
46
+ end
47
+ end
48
+ end
@@ -7,19 +7,6 @@ require 'aranha/manager'
7
7
 
8
8
  module Aranha
9
9
  class Processor
10
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
11
- CORE_EXCEPTIONS = [::SocketError].freeze
12
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
13
- HTTPCLIENT_EXCEPTIONS = [
14
- ::HTTPClient::BadResponseError,
15
- ::HTTPClient::ConnectTimeoutError,
16
- ::HTTPClient::ReceiveTimeoutError
17
- ].freeze
18
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
19
-
20
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
21
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
22
-
23
10
  DEFAULT_MAX_TRIES = 3
24
11
 
25
12
  attr_reader :manager
@@ -58,20 +45,20 @@ module Aranha
58
45
  def process_address(address)
59
46
  manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
60
47
  " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
61
- begin
62
- address.process
63
- @failed.delete(address.id)
64
- rescue StandardError => e
65
- process_exception(address, e)
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
66
53
  end
67
54
  end
68
55
 
69
- def process_exception(address, exception)
70
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
71
58
 
72
- @failed[address.id] ||= 0
73
- @failed[address.id] += 1
74
- manager.log_warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
75
62
  end
76
63
 
77
64
  def next_address
@@ -82,12 +69,6 @@ module Aranha
82
69
  ::Aranha::Manager.default.unprocessed_addresses
83
70
  end
84
71
 
85
- def network_exception?(exception)
86
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
87
-
88
- exception.cause.present? ? network_exception?(exception.cause) : false
89
- end
90
-
91
72
  def not_try_ids
92
73
  @failed.select { |_k, v| v > @try }.map { |k, _v| k }
93
74
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.15.2'
4
+ VERSION = '0.16.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.2
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-28 00:00:00.000000000 Z
11
+ date: 2021-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aranha-parsers
@@ -96,6 +96,7 @@ files:
96
96
  - MIT-LICENSE
97
97
  - README.rdoc
98
98
  - lib/aranha.rb
99
+ - lib/aranha/address_processor.rb
99
100
  - lib/aranha/default_processor.rb
100
101
  - lib/aranha/manager.rb
101
102
  - lib/aranha/processor.rb
@@ -119,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
120
  - !ruby/object:Gem::Version
120
121
  version: '0'
121
122
  requirements: []
122
- rubygems_version: 3.0.9
123
+ rubygems_version: 3.1.6
123
124
  signing_key:
124
125
  specification_version: 4
125
126
  summary: Ruby utilities for web crawling.