aranha 0.15.0 → 0.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +3 -7
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +14 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33a45e4db83f3d2743b53386005803da35188866dad28e15313a56b747efd4d1
|
4
|
+
data.tar.gz: 77ac8f2bcd922652fe62999f43d148a6557fb66d62e9edb21ba51d00ad5cb6d5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9424bbcd6540d5627052427cdbc746db9f36470e35e3c6deed3b2668160de22b2163e3f0a8b5ef03c13681bb05bb42b20c1a7348b71e616facdd82bb464e1fc
|
7
|
+
data.tar.gz: 54800ff8cc979371da63d2cea2543951f08f3f9a6423d0b20de7a9399cdc19338775f087d9a6791bcd3a0ec27d0500983fd47056f4438d9603e1fac29879a7cb
|
data/lib/aranha/address_processor.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/aranha/default_processor.rb
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
class DefaultProcessor
|
7
|
-
attr_reader :source_uri, :extra_data
|
8
|
-
|
9
8
|
class << self
|
10
9
|
def sanitize_uri(uri)
|
11
10
|
return uri if uri.is_a?(Hash)
|
@@ -15,17 +14,14 @@ module Aranha
|
|
15
14
|
end
|
16
15
|
end
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
@extra_data = extra_data
|
17
|
+
common_constructor :source_uri, :extra_data do
|
18
|
+
self.source_uri = self.class.sanitize_uri(source_uri)
|
21
19
|
end
|
22
20
|
|
23
21
|
def process
|
24
22
|
raise 'Implement method process'
|
25
23
|
end
|
26
24
|
|
27
|
-
protected
|
28
|
-
|
29
25
|
def target_uri
|
30
26
|
source_uri
|
31
27
|
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -7,19 +7,6 @@ require 'aranha/manager'
|
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
class Processor
|
10
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
11
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
12
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
13
|
-
HTTPCLIENT_EXCEPTIONS = [
|
14
|
-
::HTTPClient::BadResponseError,
|
15
|
-
::HTTPClient::ConnectTimeoutError,
|
16
|
-
::HTTPClient::ReceiveTimeoutError
|
17
|
-
].freeze
|
18
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
19
|
-
|
20
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
21
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
22
|
-
|
23
10
|
DEFAULT_MAX_TRIES = 3
|
24
11
|
|
25
12
|
attr_reader :manager
|
@@ -58,20 +45,20 @@ module Aranha
|
|
58
45
|
def process_address(address)
|
59
46
|
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
47
|
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
61
|
-
|
62
|
-
|
63
|
-
@failed.delete(address.id)
|
64
|
-
|
65
|
-
process_exception(
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
66
53
|
end
|
67
54
|
end
|
68
55
|
|
69
|
-
def process_exception(
|
70
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
71
58
|
|
72
|
-
@failed[address.id] ||= 0
|
73
|
-
@failed[address.id] += 1
|
74
|
-
manager.log_warn(
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
75
62
|
end
|
76
63
|
|
77
64
|
def next_address
|
@@ -82,12 +69,6 @@ module Aranha
|
|
82
69
|
::Aranha::Manager.default.unprocessed_addresses
|
83
70
|
end
|
84
71
|
|
85
|
-
def network_exception?(exception)
|
86
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
87
|
-
|
88
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
89
|
-
end
|
90
|
-
|
91
72
|
def not_try_ids
|
92
73
|
@failed.select { |_k, v| v > @try }.map { |k, _v| k }
|
93
74
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.0
|
4
|
+
version: 0.16.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: aranha-parsers
|
@@ -50,14 +50,14 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '0.
|
53
|
+
version: '0.72'
|
54
54
|
type: :runtime
|
55
55
|
prerelease: false
|
56
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
version: '0.
|
60
|
+
version: '0.72'
|
61
61
|
- !ruby/object:Gem::Dependency
|
62
62
|
name: httpclient
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,14 +78,20 @@ dependencies:
|
|
78
78
|
requirements:
|
79
79
|
- - "~>"
|
80
80
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
81
|
+
version: '0.3'
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 0.3.1
|
82
85
|
type: :development
|
83
86
|
prerelease: false
|
84
87
|
version_requirements: !ruby/object:Gem::Requirement
|
85
88
|
requirements:
|
86
89
|
- - "~>"
|
87
90
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
91
|
+
version: '0.3'
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: 0.3.1
|
89
95
|
description:
|
90
96
|
email:
|
91
97
|
- eduardobogoni@gmail.com
|
@@ -96,6 +102,7 @@ files:
|
|
96
102
|
- MIT-LICENSE
|
97
103
|
- README.rdoc
|
98
104
|
- lib/aranha.rb
|
105
|
+
- lib/aranha/address_processor.rb
|
99
106
|
- lib/aranha/default_processor.rb
|
100
107
|
- lib/aranha/manager.rb
|
101
108
|
- lib/aranha/processor.rb
|
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
126
|
- !ruby/object:Gem::Version
|
120
127
|
version: '0'
|
121
128
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
129
|
+
rubygems_version: 3.1.6
|
123
130
|
signing_key:
|
124
131
|
specification_version: 4
|
125
132
|
summary: Ruby utilities for web crawling.
|