aranha 0.15.2 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
|
4
|
+
data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
|
7
|
+
data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -7,19 +7,6 @@ require 'aranha/manager'
|
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
class Processor
|
10
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
11
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
12
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
13
|
-
HTTPCLIENT_EXCEPTIONS = [
|
14
|
-
::HTTPClient::BadResponseError,
|
15
|
-
::HTTPClient::ConnectTimeoutError,
|
16
|
-
::HTTPClient::ReceiveTimeoutError
|
17
|
-
].freeze
|
18
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
19
|
-
|
20
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
21
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
22
|
-
|
23
10
|
DEFAULT_MAX_TRIES = 3
|
24
11
|
|
25
12
|
attr_reader :manager
|
@@ -58,20 +45,20 @@ module Aranha
|
|
58
45
|
def process_address(address)
|
59
46
|
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
47
|
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
61
|
-
|
62
|
-
|
63
|
-
@failed.delete(address.id)
|
64
|
-
|
65
|
-
process_exception(
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
66
53
|
end
|
67
54
|
end
|
68
55
|
|
69
|
-
def process_exception(
|
70
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
71
58
|
|
72
|
-
@failed[address.id] ||= 0
|
73
|
-
@failed[address.id] += 1
|
74
|
-
manager.log_warn(
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
75
62
|
end
|
76
63
|
|
77
64
|
def next_address
|
@@ -82,12 +69,6 @@ module Aranha
|
|
82
69
|
::Aranha::Manager.default.unprocessed_addresses
|
83
70
|
end
|
84
71
|
|
85
|
-
def network_exception?(exception)
|
86
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
87
|
-
|
88
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
89
|
-
end
|
90
|
-
|
91
72
|
def not_try_ids
|
92
73
|
@failed.select { |_k, v| v > @try }.map { |k, _v| k }
|
93
74
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: aranha-parsers
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- MIT-LICENSE
|
97
97
|
- README.rdoc
|
98
98
|
- lib/aranha.rb
|
99
|
+
- lib/aranha/address_processor.rb
|
99
100
|
- lib/aranha/default_processor.rb
|
100
101
|
- lib/aranha/manager.rb
|
101
102
|
- lib/aranha/processor.rb
|
@@ -119,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
120
|
- !ruby/object:Gem::Version
|
120
121
|
version: '0'
|
121
122
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
+
rubygems_version: 3.1.6
|
123
124
|
signing_key:
|
124
125
|
specification_version: 4
|
125
126
|
summary: Ruby utilities for web crawling.
|