aranha 0.15.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
|
4
|
+
data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
|
7
|
+
data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -7,19 +7,6 @@ require 'aranha/manager'
|
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
class Processor
|
10
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
11
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
12
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
13
|
-
HTTPCLIENT_EXCEPTIONS = [
|
14
|
-
::HTTPClient::BadResponseError,
|
15
|
-
::HTTPClient::ConnectTimeoutError,
|
16
|
-
::HTTPClient::ReceiveTimeoutError
|
17
|
-
].freeze
|
18
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
19
|
-
|
20
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
21
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
22
|
-
|
23
10
|
DEFAULT_MAX_TRIES = 3
|
24
11
|
|
25
12
|
attr_reader :manager
|
@@ -58,20 +45,20 @@ module Aranha
|
|
58
45
|
def process_address(address)
|
59
46
|
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
47
|
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
61
|
-
|
62
|
-
|
63
|
-
@failed.delete(address.id)
|
64
|
-
|
65
|
-
process_exception(
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
66
53
|
end
|
67
54
|
end
|
68
55
|
|
69
|
-
def process_exception(
|
70
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
71
58
|
|
72
|
-
@failed[address.id] ||= 0
|
73
|
-
@failed[address.id] += 1
|
74
|
-
manager.log_warn(
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
75
62
|
end
|
76
63
|
|
77
64
|
def next_address
|
@@ -82,12 +69,6 @@ module Aranha
|
|
82
69
|
::Aranha::Manager.default.unprocessed_addresses
|
83
70
|
end
|
84
71
|
|
85
|
-
def network_exception?(exception)
|
86
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
87
|
-
|
88
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
89
|
-
end
|
90
|
-
|
91
72
|
def not_try_ids
|
92
73
|
@failed.select { |_k, v| v > @try }.map { |k, _v| k }
|
93
74
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: aranha-parsers
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- MIT-LICENSE
|
97
97
|
- README.rdoc
|
98
98
|
- lib/aranha.rb
|
99
|
+
- lib/aranha/address_processor.rb
|
99
100
|
- lib/aranha/default_processor.rb
|
100
101
|
- lib/aranha/manager.rb
|
101
102
|
- lib/aranha/processor.rb
|
@@ -119,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
120
|
- !ruby/object:Gem::Version
|
120
121
|
version: '0'
|
121
122
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
+
rubygems_version: 3.1.6
|
123
124
|
signing_key:
|
124
125
|
specification_version: 4
|
125
126
|
summary: Ruby utilities for web crawling.
|