aranha 0.15.0 → 0.16.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5fac0411750b6def655452009d3d091801905c696158499d8b4b26f99ccc426
4
- data.tar.gz: cb8cd648b8603cfd1f578ba92e5017e37d6e1e19427b176d33e097e413c17baf
3
+ metadata.gz: 33a45e4db83f3d2743b53386005803da35188866dad28e15313a56b747efd4d1
4
+ data.tar.gz: 77ac8f2bcd922652fe62999f43d148a6557fb66d62e9edb21ba51d00ad5cb6d5
5
5
  SHA512:
6
- metadata.gz: c4864a35aa117b9bb00d544c013ae6caaf356b3d674657d5b1d5a3371f644b949b4fa32b6ec97d5f44fa58f65ea71db0499f18208ed885103a4f93d891d41309
7
- data.tar.gz: c20f17bb1c3d04ce9b7678d2bec181c74dcf3b930202f9c838f5046cd68b326eb660ad490c4ad4c92647f60474eefc705e6e2734a62f3b42541760d689ab9686
6
+ metadata.gz: c9424bbcd6540d5627052427cdbc746db9f36470e35e3c6deed3b2668160de22b2163e3f0a8b5ef03c13681bb05bb42b20c1a7348b71e616facdd82bb464e1fc
7
+ data.tar.gz: 54800ff8cc979371da63d2cea2543951f08f3f9a6423d0b20de7a9399cdc19338775f087d9a6791bcd3a0ec27d0500983fd47056f4438d9603e1fac29879a7cb
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
module Aranha
  # Runs +process+ on a single address and captures any StandardError raised,
  # so callers can inspect the outcome instead of rescuing it themselves.
  #
  # NOTE(review): +address+ is presumably exposed by +common_constructor+ and
  # +error+ by +enable_simple_cache+ (memoizing +error_uncached+); both macros
  # come from eac_ruby_utils - confirm against that gem.
  class AddressProcessor
    ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
    CORE_EXCEPTIONS = [::SocketError].freeze
    ERRNO_EXCEPTIONS = [::Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
    HTTPCLIENT_EXCEPTIONS = [
      ::HTTPClient::BadResponseError,
      ::HTTPClient::ConnectTimeoutError,
      ::HTTPClient::ReceiveTimeoutError
    ].freeze
    NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze

    # Every exception class that marks an error as rescuable.
    NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
                         HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS

    class << self
      # Tells whether +error+, or any exception in its +cause+ chain, is an
      # instance of one of NETWORK_EXCEPTIONS.
      #
      # @param error [Exception, nil] the error to classify
      # @return [Boolean] false for nil or when nothing in the chain matches
      def rescuable_error?(error)
        # A nil error (successful run, or the end of the cause chain) is never rescuable.
        return false if error.nil?
        return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }

        # Walk the cause chain with this same method; the previous code called
        # an undefined +network_error?+ here and raised NoMethodError.
        rescuable_error?(error.cause)
      end
    end

    enable_simple_cache
    common_constructor :address

    # @return [Boolean] true when no error was captured while processing
    def successful?
      error.blank?
    end

    # @return [Boolean] whether the captured error is classified as rescuable
    #   by the class-level check
    def rescuable_error?
      self.class.rescuable_error?(error)
    end

    private

    # Calls +address.process+ and converts a raised StandardError into a value.
    #
    # @return [StandardError, nil] the raised error, or nil on success
    def error_uncached
      address.process
      nil
    rescue ::StandardError => e
      e
    end
  end
end
@@ -1,11 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  class DefaultProcessor
7
- attr_reader :source_uri, :extra_data
8
-
9
8
  class << self
10
9
  def sanitize_uri(uri)
11
10
  return uri if uri.is_a?(Hash)
@@ -15,17 +14,14 @@ module Aranha
15
14
  end
16
15
  end
17
16
 
18
- def initialize(source_uri, extra_data)
19
- @source_uri = self.class.sanitize_uri(source_uri)
20
- @extra_data = extra_data
17
+ common_constructor :source_uri, :extra_data do
18
+ self.source_uri = self.class.sanitize_uri(source_uri)
21
19
  end
22
20
 
23
21
  def process
24
22
  raise 'Implement method process'
25
23
  end
26
24
 
27
- protected
28
-
29
25
  def target_uri
30
26
  source_uri
31
27
  end
@@ -7,19 +7,6 @@ require 'aranha/manager'
7
7
 
8
8
  module Aranha
9
9
  class Processor
10
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
11
- CORE_EXCEPTIONS = [::SocketError].freeze
12
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
13
- HTTPCLIENT_EXCEPTIONS = [
14
- ::HTTPClient::BadResponseError,
15
- ::HTTPClient::ConnectTimeoutError,
16
- ::HTTPClient::ReceiveTimeoutError
17
- ].freeze
18
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
19
-
20
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
21
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
22
-
23
10
  DEFAULT_MAX_TRIES = 3
24
11
 
25
12
  attr_reader :manager
@@ -58,20 +45,20 @@ module Aranha
58
45
  def process_address(address)
59
46
  manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
60
47
  " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
61
- begin
62
- address.process
63
- @failed.delete(address.id)
64
- rescue StandardError => e
65
- process_exception(address, e)
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
66
53
  end
67
54
  end
68
55
 
69
- def process_exception(address, exception)
70
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
71
58
 
72
- @failed[address.id] ||= 0
73
- @failed[address.id] += 1
74
- manager.log_warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
75
62
  end
76
63
 
77
64
  def next_address
@@ -82,12 +69,6 @@ module Aranha
82
69
  ::Aranha::Manager.default.unprocessed_addresses
83
70
  end
84
71
 
85
- def network_exception?(exception)
86
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
87
-
88
- exception.cause.present? ? network_exception?(exception.cause) : false
89
- end
90
-
91
72
  def not_try_ids
92
73
  @failed.select { |_k, v| v > @try }.map { |k, _v| k }
93
74
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.15.0'
4
+ VERSION = '0.16.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.0
4
+ version: 0.16.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-28 00:00:00.000000000 Z
11
+ date: 2021-08-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aranha-parsers
@@ -50,14 +50,14 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0.52'
53
+ version: '0.72'
54
54
  type: :runtime
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '0.52'
60
+ version: '0.72'
61
61
  - !ruby/object:Gem::Dependency
62
62
  name: httpclient
63
63
  requirement: !ruby/object:Gem::Requirement
@@ -78,14 +78,20 @@ dependencies:
78
78
  requirements:
79
79
  - - "~>"
80
80
  - !ruby/object:Gem::Version
81
- version: '0.1'
81
+ version: '0.3'
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 0.3.1
82
85
  type: :development
83
86
  prerelease: false
84
87
  version_requirements: !ruby/object:Gem::Requirement
85
88
  requirements:
86
89
  - - "~>"
87
90
  - !ruby/object:Gem::Version
88
- version: '0.1'
91
+ version: '0.3'
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 0.3.1
89
95
  description:
90
96
  email:
91
97
  - eduardobogoni@gmail.com
@@ -96,6 +102,7 @@ files:
96
102
  - MIT-LICENSE
97
103
  - README.rdoc
98
104
  - lib/aranha.rb
105
+ - lib/aranha/address_processor.rb
99
106
  - lib/aranha/default_processor.rb
100
107
  - lib/aranha/manager.rb
101
108
  - lib/aranha/processor.rb
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
126
  - !ruby/object:Gem::Version
120
127
  version: '0'
121
128
  requirements: []
122
- rubygems_version: 3.0.8
129
+ rubygems_version: 3.1.6
123
130
  signing_key:
124
131
  specification_version: 4
125
132
  summary: Ruby utilities for web crawling.