aranha 0.15.2 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +7 -1
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +14 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31001c1c5af252c6cb791aa0a362177859b9f8ff647fe3150df6f6515d81904a
|
4
|
+
data.tar.gz: eb574bcb2047e5cf2de295ea437d9e70d5cb680f6341e853da7097b7b2e181b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d9ec9528940576d0502b37ead25eb749118211ca5d15cf3d17cb23273d789309c7507e2f74d8053e17c5709766c0b87dabe6d64180df4a43ce060470f556029d
|
7
|
+
data.tar.gz: d5ec7d1b7432ecbfbc8e10864214aee4ce9b256b5e359b3ac11f4411b2279e44b20369c17c67cdc08fe9fda52225a1a408ec4977adc0825cdd28da5bcda8ed26
|
data/lib/aranha/address_processor.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/aranha/default_processor.rb
CHANGED
@@ -18,6 +18,8 @@ module Aranha
|
|
18
18
|
self.source_uri = self.class.sanitize_uri(source_uri)
|
19
19
|
end
|
20
20
|
|
21
|
+
delegate :source_address, to: :parser
|
22
|
+
|
21
23
|
def process
|
22
24
|
raise 'Implement method process'
|
23
25
|
end
|
@@ -27,7 +29,11 @@ module Aranha
|
|
27
29
|
end
|
28
30
|
|
29
31
|
def data
|
30
|
-
@data ||=
|
32
|
+
@data ||= parser.data
|
33
|
+
end
|
34
|
+
|
35
|
+
def parser
|
36
|
+
@parser ||= parser_class.new(target_uri)
|
31
37
|
end
|
32
38
|
|
33
39
|
def parser_class
|
data/lib/aranha/processor.rb
CHANGED
@@ -7,19 +7,6 @@ require 'aranha/manager'
|
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
class Processor
|
10
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
11
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
12
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
13
|
-
HTTPCLIENT_EXCEPTIONS = [
|
14
|
-
::HTTPClient::BadResponseError,
|
15
|
-
::HTTPClient::ConnectTimeoutError,
|
16
|
-
::HTTPClient::ReceiveTimeoutError
|
17
|
-
].freeze
|
18
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
19
|
-
|
20
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
21
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
22
|
-
|
23
10
|
DEFAULT_MAX_TRIES = 3
|
24
11
|
|
25
12
|
attr_reader :manager
|
@@ -58,20 +45,20 @@ module Aranha
|
|
58
45
|
def process_address(address)
|
59
46
|
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
47
|
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
61
|
-
|
62
|
-
|
63
|
-
@failed.delete(address.id)
|
64
|
-
|
65
|
-
process_exception(
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
66
53
|
end
|
67
54
|
end
|
68
55
|
|
69
|
-
def process_exception(
|
70
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
71
58
|
|
72
|
-
@failed[address.id] ||= 0
|
73
|
-
@failed[address.id] += 1
|
74
|
-
manager.log_warn(
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
75
62
|
end
|
76
63
|
|
77
64
|
def next_address
|
@@ -82,12 +69,6 @@ module Aranha
|
|
82
69
|
::Aranha::Manager.default.unprocessed_addresses
|
83
70
|
end
|
84
71
|
|
85
|
-
def network_exception?(exception)
|
86
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
87
|
-
|
88
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
89
|
-
end
|
90
|
-
|
91
72
|
def not_try_ids
|
92
73
|
@failed.select { |_k, v| v > @try }.map { |k, _v| k }
|
93
74
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: aranha-parsers
|
@@ -50,14 +50,14 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '0.
|
53
|
+
version: '0.72'
|
54
54
|
type: :runtime
|
55
55
|
prerelease: false
|
56
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
version: '0.
|
60
|
+
version: '0.72'
|
61
61
|
- !ruby/object:Gem::Dependency
|
62
62
|
name: httpclient
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,14 +78,20 @@ dependencies:
|
|
78
78
|
requirements:
|
79
79
|
- - "~>"
|
80
80
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
81
|
+
version: '0.3'
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 0.3.1
|
82
85
|
type: :development
|
83
86
|
prerelease: false
|
84
87
|
version_requirements: !ruby/object:Gem::Requirement
|
85
88
|
requirements:
|
86
89
|
- - "~>"
|
87
90
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
91
|
+
version: '0.3'
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: 0.3.1
|
89
95
|
description:
|
90
96
|
email:
|
91
97
|
- eduardobogoni@gmail.com
|
@@ -96,6 +102,7 @@ files:
|
|
96
102
|
- MIT-LICENSE
|
97
103
|
- README.rdoc
|
98
104
|
- lib/aranha.rb
|
105
|
+
- lib/aranha/address_processor.rb
|
99
106
|
- lib/aranha/default_processor.rb
|
100
107
|
- lib/aranha/manager.rb
|
101
108
|
- lib/aranha/processor.rb
|
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
126
|
- !ruby/object:Gem::Version
|
120
127
|
version: '0'
|
121
128
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
129
|
+
rubygems_version: 3.1.6
|
123
130
|
signing_key:
|
124
131
|
specification_version: 4
|
125
132
|
summary: Ruby utilities for web crawling.
|