aranha 0.15.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cf75a29a21e24f2e85a425b8b9291c058a46eef895a8100992514dd03138b065
4
- data.tar.gz: d52cc8cf889e0d9174f13771e3c9ac15971a1f673dc8e4e15085aab3755e169b
3
+ metadata.gz: 31001c1c5af252c6cb791aa0a362177859b9f8ff647fe3150df6f6515d81904a
4
+ data.tar.gz: eb574bcb2047e5cf2de295ea437d9e70d5cb680f6341e853da7097b7b2e181b8
5
5
  SHA512:
6
- metadata.gz: 4844ac22fc5d9b3a16e7b03d8da6e5e5fc82a95dd705a0bbbb11c029f206fdc6fb762d1fd2cd71a2c8f869084597ad3f8d65e88f79f95bf37dfcd02a67cc59c6
7
- data.tar.gz: c768ba132ef0b9afbf9630e62aeb4759ab5bdb09e8d6af2658c59b19360c2a9c11dd3c1abd49431111fa25273c895ede8eb2536d7f63ebd607a1965c9e9a0813
6
+ metadata.gz: d9ec9528940576d0502b37ead25eb749118211ca5d15cf3d17cb23273d789309c7507e2f74d8053e17c5709766c0b87dabe6d64180df4a43ce060470f556029d
7
+ data.tar.gz: d5ec7d1b7432ecbfbc8e10864214aee4ce9b256b5e359b3ac11f4411b2279e44b20369c17c67cdc08fe9fda52225a1a408ec4977adc0825cdd28da5bcda8ed26
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ class AddressProcessor
7
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
8
+ CORE_EXCEPTIONS = [::SocketError].freeze
9
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
10
+ HTTPCLIENT_EXCEPTIONS = [
11
+ ::HTTPClient::BadResponseError,
12
+ ::HTTPClient::ConnectTimeoutError,
13
+ ::HTTPClient::ReceiveTimeoutError
14
+ ].freeze
15
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
16
+
17
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
18
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
19
+
20
+ class << self
21
+ def rescuable_error?(error)
22
+ return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
23
+
24
+ error.cause.present? ? network_error?(error.cause) : false
25
+ end
26
+ end
27
+
28
+ enable_simple_cache
29
+ common_constructor :address
30
+
31
+ def successful?
32
+ error.blank?
33
+ end
34
+
35
+ def rescuable_error?
36
+ self.class.rescuable_error?(error)
37
+ end
38
+
39
+ private
40
+
41
+ def error_uncached
42
+ address.process
43
+ nil
44
+ rescue ::StandardError => e
45
+ e
46
+ end
47
+ end
48
+ end
@@ -18,6 +18,8 @@ module Aranha
18
18
  self.source_uri = self.class.sanitize_uri(source_uri)
19
19
  end
20
20
 
21
+ delegate :source_address, to: :parser
22
+
21
23
  def process
22
24
  raise 'Implement method process'
23
25
  end
@@ -27,7 +29,11 @@ module Aranha
27
29
  end
28
30
 
29
31
  def data
30
- @data ||= parser_class.new(target_uri).data
32
+ @data ||= parser.data
33
+ end
34
+
35
+ def parser
36
+ @parser ||= parser_class.new(target_uri)
31
37
  end
32
38
 
33
39
  def parser_class
@@ -7,19 +7,6 @@ require 'aranha/manager'
7
7
 
8
8
  module Aranha
9
9
  class Processor
10
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
11
- CORE_EXCEPTIONS = [::SocketError].freeze
12
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
13
- HTTPCLIENT_EXCEPTIONS = [
14
- ::HTTPClient::BadResponseError,
15
- ::HTTPClient::ConnectTimeoutError,
16
- ::HTTPClient::ReceiveTimeoutError
17
- ].freeze
18
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
19
-
20
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
21
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
22
-
23
10
  DEFAULT_MAX_TRIES = 3
24
11
 
25
12
  attr_reader :manager
@@ -58,20 +45,20 @@ module Aranha
58
45
  def process_address(address)
59
46
  manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
60
47
  " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
61
- begin
62
- address.process
63
- @failed.delete(address.id)
64
- rescue StandardError => e
65
- process_exception(address, e)
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
66
53
  end
67
54
  end
68
55
 
69
- def process_exception(address, exception)
70
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
71
58
 
72
- @failed[address.id] ||= 0
73
- @failed[address.id] += 1
74
- manager.log_warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
75
62
  end
76
63
 
77
64
  def next_address
@@ -82,12 +69,6 @@ module Aranha
82
69
  ::Aranha::Manager.default.unprocessed_addresses
83
70
  end
84
71
 
85
- def network_exception?(exception)
86
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
87
-
88
- exception.cause.present? ? network_exception?(exception.cause) : false
89
- end
90
-
91
72
  def not_try_ids
92
73
  @failed.select { |_k, v| v > @try }.map { |k, _v| k }
93
74
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.15.2'
4
+ VERSION = '0.17.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.2
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-28 00:00:00.000000000 Z
11
+ date: 2022-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aranha-parsers
@@ -50,14 +50,14 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0.52'
53
+ version: '0.72'
54
54
  type: :runtime
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '0.52'
60
+ version: '0.72'
61
61
  - !ruby/object:Gem::Dependency
62
62
  name: httpclient
63
63
  requirement: !ruby/object:Gem::Requirement
@@ -78,14 +78,20 @@ dependencies:
78
78
  requirements:
79
79
  - - "~>"
80
80
  - !ruby/object:Gem::Version
81
- version: '0.2'
81
+ version: '0.3'
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 0.3.1
82
85
  type: :development
83
86
  prerelease: false
84
87
  version_requirements: !ruby/object:Gem::Requirement
85
88
  requirements:
86
89
  - - "~>"
87
90
  - !ruby/object:Gem::Version
88
- version: '0.2'
91
+ version: '0.3'
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 0.3.1
89
95
  description:
90
96
  email:
91
97
  - eduardobogoni@gmail.com
@@ -96,6 +102,7 @@ files:
96
102
  - MIT-LICENSE
97
103
  - README.rdoc
98
104
  - lib/aranha.rb
105
+ - lib/aranha/address_processor.rb
99
106
  - lib/aranha/default_processor.rb
100
107
  - lib/aranha/manager.rb
101
108
  - lib/aranha/processor.rb
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
126
  - !ruby/object:Gem::Version
120
127
  version: '0'
121
128
  requirements: []
122
- rubygems_version: 3.0.9
129
+ rubygems_version: 3.1.6
123
130
  signing_key:
124
131
  specification_version: 4
125
132
  summary: Ruby utilities for web crawling.