aranha 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/models/aranha/address.rb +23 -2
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +6 -0
- data/lib/aranha/default_processor.rb +3 -2
- data/lib/aranha/parsers/base.rb +1 -1
- data/lib/aranha/processor.rb +19 -9
- data/lib/aranha/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
|
4
|
+
data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
|
7
|
+
data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
|
@@ -17,9 +17,10 @@ module Aranha
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
def add(url, processor)
|
20
|
+
def add(url, processor, extra_data = nil)
|
21
21
|
a = find_or_initialize_by(url: url)
|
22
22
|
a.processor = processor
|
23
|
+
a.extra_data = extra_data.to_yaml
|
23
24
|
a.save!
|
24
25
|
end
|
25
26
|
|
@@ -49,10 +50,30 @@ module Aranha
|
|
49
50
|
|
50
51
|
def process
|
51
52
|
ActiveRecord::Base.transaction do
|
52
|
-
|
53
|
+
instanciate_processor.process
|
53
54
|
self.processed_at = Time.zone.now
|
54
55
|
save!
|
55
56
|
end
|
56
57
|
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def instanciate_processor
|
62
|
+
if processor_instancier_arity == 2 || processor_instancier_arity < 0
|
63
|
+
processor_instancier.call(url, YAML.load(extra_data))
|
64
|
+
elsif processor_instancier_arity == 1
|
65
|
+
processor_instancier.call(url)
|
66
|
+
else
|
67
|
+
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def processor_instancier
|
72
|
+
processor.constantize.method(:new)
|
73
|
+
end
|
74
|
+
|
75
|
+
def processor_instancier_arity
|
76
|
+
processor.constantize.instance_method(:initialize).arity
|
77
|
+
end
|
57
78
|
end
|
58
79
|
end
|
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
module Aranha
|
4
4
|
class DefaultProcessor
|
5
|
-
attr_reader :source_uri
|
5
|
+
attr_reader :source_uri, :extra_data
|
6
6
|
|
7
|
-
def initialize(source_uri)
|
7
|
+
def initialize(source_uri, extra_data)
|
8
8
|
unless source_uri.is_a?(Addressable::URI)
|
9
9
|
source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
|
10
10
|
end
|
11
11
|
@source_uri = Addressable::URI.parse(source_uri)
|
12
|
+
@extra_data = extra_data
|
12
13
|
end
|
13
14
|
|
14
15
|
def process
|
data/lib/aranha/parsers/base.rb
CHANGED
data/lib/aranha/processor.rb
CHANGED
@@ -5,10 +5,19 @@ require_dependency 'aranha/parsers/invalid_state_exception'
|
|
5
5
|
|
6
6
|
module Aranha
|
7
7
|
class Processor
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
9
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
10
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
11
|
+
HTTPCLIENT_EXCEPTIONS = [
|
12
|
+
::HTTPClient::BadResponseError,
|
13
|
+
::HTTPClient::ConnectTimeoutError,
|
14
|
+
::HTTPClient::ReceiveTimeoutError
|
15
|
+
].freeze
|
16
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
17
|
+
|
18
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
19
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
20
|
+
|
12
21
|
DEFAULT_MAX_TRIES = 3
|
13
22
|
|
14
23
|
def initialize
|
@@ -48,8 +57,8 @@ module Aranha
|
|
48
57
|
begin
|
49
58
|
address.process
|
50
59
|
@failed.delete(address.id)
|
51
|
-
rescue StandardError =>
|
52
|
-
process_exception(address,
|
60
|
+
rescue StandardError => e
|
61
|
+
process_exception(address, e)
|
53
62
|
end
|
54
63
|
end
|
55
64
|
|
@@ -70,7 +79,8 @@ module Aranha
|
|
70
79
|
end
|
71
80
|
|
72
81
|
def network_exception?(exception)
|
73
|
-
NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
82
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
83
|
+
exception.cause.present? ? network_exception?(exception.cause) : false
|
74
84
|
end
|
75
85
|
|
76
86
|
def not_try_ids
|
@@ -85,8 +95,8 @@ module Aranha
|
|
85
95
|
@max_tries ||= begin
|
86
96
|
r = Integer(ENV['ARANHA_MAX_TRIES'])
|
87
97
|
r <= 0 ? 0 : r
|
88
|
-
|
89
|
-
|
98
|
+
rescue ArgumentError, TypeError
|
99
|
+
DEFAULT_MAX_TRIES
|
90
100
|
end
|
91
101
|
end
|
92
102
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02
|
11
|
+
date: 2019-05-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -99,6 +99,7 @@ files:
|
|
99
99
|
- app/views/layouts/aranha/application.html.erb
|
100
100
|
- config/routes.rb
|
101
101
|
- db/migrate/20171201021251_create_aranha_addresses.rb
|
102
|
+
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
102
103
|
- lib/aranha.rb
|
103
104
|
- lib/aranha/default_processor.rb
|
104
105
|
- lib/aranha/dom_elements_traverser.rb
|