aranha 0.15.2 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cf75a29a21e24f2e85a425b8b9291c058a46eef895a8100992514dd03138b065
4
- data.tar.gz: d52cc8cf889e0d9174f13771e3c9ac15971a1f673dc8e4e15085aab3755e169b
3
+ metadata.gz: 31001c1c5af252c6cb791aa0a362177859b9f8ff647fe3150df6f6515d81904a
4
+ data.tar.gz: eb574bcb2047e5cf2de295ea437d9e70d5cb680f6341e853da7097b7b2e181b8
5
5
  SHA512:
6
- metadata.gz: 4844ac22fc5d9b3a16e7b03d8da6e5e5fc82a95dd705a0bbbb11c029f206fdc6fb762d1fd2cd71a2c8f869084597ad3f8d65e88f79f95bf37dfcd02a67cc59c6
7
- data.tar.gz: c768ba132ef0b9afbf9630e62aeb4759ab5bdb09e8d6af2658c59b19360c2a9c11dd3c1abd49431111fa25273c895ede8eb2536d7f63ebd607a1965c9e9a0813
6
+ metadata.gz: d9ec9528940576d0502b37ead25eb749118211ca5d15cf3d17cb23273d789309c7507e2f74d8053e17c5709766c0b87dabe6d64180df4a43ce060470f556029d
7
+ data.tar.gz: d5ec7d1b7432ecbfbc8e10864214aee4ce9b256b5e359b3ac11f4411b2279e44b20369c17c67cdc08fe9fda52225a1a408ec4977adc0825cdd28da5bcda8ed26
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ class AddressProcessor
7
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
8
+ CORE_EXCEPTIONS = [::SocketError].freeze
9
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
10
+ HTTPCLIENT_EXCEPTIONS = [
11
+ ::HTTPClient::BadResponseError,
12
+ ::HTTPClient::ConnectTimeoutError,
13
+ ::HTTPClient::ReceiveTimeoutError
14
+ ].freeze
15
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
16
+
17
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
18
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
19
+
20
+ class << self
21
+ def rescuable_error?(error)
22
+ return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
23
+
24
+ error.cause.present? ? network_error?(error.cause) : false
25
+ end
26
+ end
27
+
28
+ enable_simple_cache
29
+ common_constructor :address
30
+
31
+ def successful?
32
+ error.blank?
33
+ end
34
+
35
+ def rescuable_error?
36
+ self.class.rescuable_error?(error)
37
+ end
38
+
39
+ private
40
+
41
+ def error_uncached
42
+ address.process
43
+ nil
44
+ rescue ::StandardError => e
45
+ e
46
+ end
47
+ end
48
+ end
@@ -18,6 +18,8 @@ module Aranha
18
18
  self.source_uri = self.class.sanitize_uri(source_uri)
19
19
  end
20
20
 
21
+ delegate :source_address, to: :parser
22
+
21
23
  def process
22
24
  raise 'Implement method process'
23
25
  end
@@ -27,7 +29,11 @@ module Aranha
27
29
  end
28
30
 
29
31
  def data
30
- @data ||= parser_class.new(target_uri).data
32
+ @data ||= parser.data
33
+ end
34
+
35
+ def parser
36
+ @parser ||= parser_class.new(target_uri)
31
37
  end
32
38
 
33
39
  def parser_class
@@ -7,19 +7,6 @@ require 'aranha/manager'
7
7
 
8
8
  module Aranha
9
9
  class Processor
10
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
11
- CORE_EXCEPTIONS = [::SocketError].freeze
12
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
13
- HTTPCLIENT_EXCEPTIONS = [
14
- ::HTTPClient::BadResponseError,
15
- ::HTTPClient::ConnectTimeoutError,
16
- ::HTTPClient::ReceiveTimeoutError
17
- ].freeze
18
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
19
-
20
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
21
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
22
-
23
10
  DEFAULT_MAX_TRIES = 3
24
11
 
25
12
  attr_reader :manager
@@ -58,20 +45,20 @@ module Aranha
58
45
  def process_address(address)
59
46
  manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
60
47
  " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
61
- begin
62
- address.process
63
- @failed.delete(address.id)
64
- rescue StandardError => e
65
- process_exception(address, e)
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
66
53
  end
67
54
  end
68
55
 
69
- def process_exception(address, exception)
70
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
71
58
 
72
- @failed[address.id] ||= 0
73
- @failed[address.id] += 1
74
- manager.log_warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
75
62
  end
76
63
 
77
64
  def next_address
@@ -82,12 +69,6 @@ module Aranha
82
69
  ::Aranha::Manager.default.unprocessed_addresses
83
70
  end
84
71
 
85
- def network_exception?(exception)
86
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
87
-
88
- exception.cause.present? ? network_exception?(exception.cause) : false
89
- end
90
-
91
72
  def not_try_ids
92
73
  @failed.select { |_k, v| v > @try }.map { |k, _v| k }
93
74
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.15.2'
4
+ VERSION = '0.17.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.2
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-28 00:00:00.000000000 Z
11
+ date: 2022-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aranha-parsers
@@ -50,14 +50,14 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0.52'
53
+ version: '0.72'
54
54
  type: :runtime
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '0.52'
60
+ version: '0.72'
61
61
  - !ruby/object:Gem::Dependency
62
62
  name: httpclient
63
63
  requirement: !ruby/object:Gem::Requirement
@@ -78,14 +78,20 @@ dependencies:
78
78
  requirements:
79
79
  - - "~>"
80
80
  - !ruby/object:Gem::Version
81
- version: '0.2'
81
+ version: '0.3'
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 0.3.1
82
85
  type: :development
83
86
  prerelease: false
84
87
  version_requirements: !ruby/object:Gem::Requirement
85
88
  requirements:
86
89
  - - "~>"
87
90
  - !ruby/object:Gem::Version
88
- version: '0.2'
91
+ version: '0.3'
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 0.3.1
89
95
  description:
90
96
  email:
91
97
  - eduardobogoni@gmail.com
@@ -96,6 +102,7 @@ files:
96
102
  - MIT-LICENSE
97
103
  - README.rdoc
98
104
  - lib/aranha.rb
105
+ - lib/aranha/address_processor.rb
99
106
  - lib/aranha/default_processor.rb
100
107
  - lib/aranha/manager.rb
101
108
  - lib/aranha/processor.rb
@@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
126
  - !ruby/object:Gem::Version
120
127
  version: '0'
121
128
  requirements: []
122
- rubygems_version: 3.0.9
129
+ rubygems_version: 3.1.6
123
130
  signing_key:
124
131
  specification_version: 4
125
132
  summary: Ruby utilities for web crawling.