aranha 0.15.2 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +7 -1
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +14 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31001c1c5af252c6cb791aa0a362177859b9f8ff647fe3150df6f6515d81904a
|
4
|
+
data.tar.gz: eb574bcb2047e5cf2de295ea437d9e70d5cb680f6341e853da7097b7b2e181b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d9ec9528940576d0502b37ead25eb749118211ca5d15cf3d17cb23273d789309c7507e2f74d8053e17c5709766c0b87dabe6d64180df4a43ce060470f556029d
|
7
|
+
data.tar.gz: d5ec7d1b7432ecbfbc8e10864214aee4ce9b256b5e359b3ac11f4411b2279e44b20369c17c67cdc08fe9fda52225a1a408ec4977adc0825cdd28da5bcda8ed26
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
  # Wraps a single address, runs its +process+ method and keeps the resulting
  # error (if any) so callers can ask whether the run succeeded and, when it
  # did not, whether the failure belongs to the known "rescuable" set.
  class AddressProcessor
    # Exception classes grouped by origin; together they form the set of
    # failures that {.rescuable_error?} accepts.
    ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
    CORE_EXCEPTIONS = [::SocketError].freeze
    ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
    HTTPCLIENT_EXCEPTIONS = [
      ::HTTPClient::BadResponseError,
      ::HTTPClient::ConnectTimeoutError,
      ::HTTPClient::ReceiveTimeoutError
    ].freeze
    NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze

    NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
                         HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS

    class << self
      # Tells whether +error+, or any exception in its +cause+ chain, is an
      # instance of one of the NETWORK_EXCEPTIONS classes.
      #
      # @param error [Exception, nil] the error to classify.
      # @return [Boolean] +false+ for a blank error or when nothing in the
      #   chain matches.
      def rescuable_error?(error)
        # A blank error ends the walk; this also covers the end of the cause
        # chain and the "no error at all" case of a successful run.
        return false if error.blank?
        return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }

        # The previous implementation called the undefined +network_error?+
        # here, raising NoMethodError for any wrapped, non-matching error.
        rescuable_error?(error.cause)
      end
    end

    # NOTE(review): both macros come from eac_ruby_utils. Presumably
    # +enable_simple_cache+ exposes a memoized +error+ backed by
    # +error_uncached+, and +common_constructor+ defines +initialize(address)+
    # plus an +address+ reader - confirm against that gem's documentation.
    enable_simple_cache
    common_constructor :address

    # @return [Boolean] +true+ when processing the address produced no error.
    def successful?
      error.blank?
    end

    # @return [Boolean] whether the recorded error is in the rescuable set
    #   (see {.rescuable_error?}).
    def rescuable_error?
      self.class.rescuable_error?(error)
    end

    private

    # Processes the address and converts a raised StandardError into a return
    # value instead of letting it propagate.
    #
    # @return [StandardError, nil] the exception raised by +address.process+,
    #   or +nil+ when it completed without raising.
    def error_uncached
      address.process
      nil
    rescue ::StandardError => e
      e
    end
  end
end
|
@@ -18,6 +18,8 @@ module Aranha
|
|
18
18
|
self.source_uri = self.class.sanitize_uri(source_uri)
|
19
19
|
end
|
20
20
|
|
21
|
+
delegate :source_address, to: :parser
|
22
|
+
|
21
23
|
def process
|
22
24
|
raise 'Implement method process'
|
23
25
|
end
|
@@ -27,7 +29,11 @@ module Aranha
|
|
27
29
|
end
|
28
30
|
|
29
31
|
def data
|
30
|
-
@data ||=
|
32
|
+
@data ||= parser.data
|
33
|
+
end
|
34
|
+
|
35
|
+
def parser
|
36
|
+
@parser ||= parser_class.new(target_uri)
|
31
37
|
end
|
32
38
|
|
33
39
|
def parser_class
|
data/lib/aranha/processor.rb
CHANGED
@@ -7,19 +7,6 @@ require 'aranha/manager'
|
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
class Processor
|
10
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
11
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
12
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
13
|
-
HTTPCLIENT_EXCEPTIONS = [
|
14
|
-
::HTTPClient::BadResponseError,
|
15
|
-
::HTTPClient::ConnectTimeoutError,
|
16
|
-
::HTTPClient::ReceiveTimeoutError
|
17
|
-
].freeze
|
18
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
19
|
-
|
20
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
21
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
22
|
-
|
23
10
|
DEFAULT_MAX_TRIES = 3
|
24
11
|
|
25
12
|
attr_reader :manager
|
@@ -58,20 +45,20 @@ module Aranha
|
|
58
45
|
def process_address(address)
|
59
46
|
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
47
|
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
61
|
-
|
62
|
-
|
63
|
-
@failed.delete(address.id)
|
64
|
-
|
65
|
-
process_exception(
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
66
53
|
end
|
67
54
|
end
|
68
55
|
|
69
|
-
def process_exception(
|
70
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
71
58
|
|
72
|
-
@failed[address.id] ||= 0
|
73
|
-
@failed[address.id] += 1
|
74
|
-
manager.log_warn(
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
75
62
|
end
|
76
63
|
|
77
64
|
def next_address
|
@@ -82,12 +69,6 @@ module Aranha
|
|
82
69
|
::Aranha::Manager.default.unprocessed_addresses
|
83
70
|
end
|
84
71
|
|
85
|
-
def network_exception?(exception)
|
86
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
87
|
-
|
88
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
89
|
-
end
|
90
|
-
|
91
72
|
def not_try_ids
|
92
73
|
@failed.select { |_k, v| v > @try }.map { |k, _v| k }
|
93
74
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: aranha-parsers
|
@@ -50,14 +50,14 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '0.
|
53
|
+
version: '0.72'
|
54
54
|
type: :runtime
|
55
55
|
prerelease: false
|
56
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
version: '0.
|
60
|
+
version: '0.72'
|
61
61
|
- !ruby/object:Gem::Dependency
|
62
62
|
name: httpclient
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,14 +78,20 @@ dependencies:
|
|
78
78
|
requirements:
|
79
79
|
- - "~>"
|
80
80
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
81
|
+
version: '0.3'
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 0.3.1
|
82
85
|
type: :development
|
83
86
|
prerelease: false
|
84
87
|
version_requirements: !ruby/object:Gem::Requirement
|
85
88
|
requirements:
|
86
89
|
- - "~>"
|
87
90
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
91
|
+
version: '0.3'
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: 0.3.1
|
89
95
|
description:
|
90
96
|
email:
|
91
97
|
- eduardobogoni@gmail.com
|
@@ -96,6 +102,7 @@ files:
|
|
96
102
|
- MIT-LICENSE
|
97
103
|
- README.rdoc
|
98
104
|
- lib/aranha.rb
|
105
|
+
- lib/aranha/address_processor.rb
|
99
106
|
- lib/aranha/default_processor.rb
|
100
107
|
- lib/aranha/manager.rb
|
101
108
|
- lib/aranha/processor.rb
|
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
126
|
- !ruby/object:Gem::Version
|
120
127
|
version: '0'
|
121
128
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
129
|
+
rubygems_version: 3.1.6
|
123
130
|
signing_key:
|
124
131
|
specification_version: 4
|
125
132
|
summary: Ruby utilities for web crawling.
|