aranha 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/app/models/aranha/address.rb +23 -2
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +6 -0
- data/lib/aranha/default_processor.rb +3 -2
- data/lib/aranha/parsers/base.rb +1 -1
- data/lib/aranha/processor.rb +19 -9
- data/lib/aranha/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
|
4
|
+
data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
|
7
|
+
data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
|
data/app/models/aranha/address.rb
CHANGED
@@ -17,9 +17,10 @@ module Aranha
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
def add(url, processor)
|
20
|
+
def add(url, processor, extra_data = nil)
|
21
21
|
a = find_or_initialize_by(url: url)
|
22
22
|
a.processor = processor
|
23
|
+
a.extra_data = extra_data.to_yaml
|
23
24
|
a.save!
|
24
25
|
end
|
25
26
|
|
@@ -49,10 +50,30 @@ module Aranha
|
|
49
50
|
|
50
51
|
def process
|
51
52
|
ActiveRecord::Base.transaction do
|
52
|
-
|
53
|
+
instanciate_processor.process
|
53
54
|
self.processed_at = Time.zone.now
|
54
55
|
save!
|
55
56
|
end
|
56
57
|
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def instanciate_processor
|
62
|
+
if processor_instancier_arity == 2 || processor_instancier_arity < 0
|
63
|
+
processor_instancier.call(url, YAML.load(extra_data))
|
64
|
+
elsif processor_instancier_arity == 1
|
65
|
+
processor_instancier.call(url)
|
66
|
+
else
|
67
|
+
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def processor_instancier
|
72
|
+
processor.constantize.method(:new)
|
73
|
+
end
|
74
|
+
|
75
|
+
def processor_instancier_arity
|
76
|
+
processor.constantize.instance_method(:initialize).arity
|
77
|
+
end
|
57
78
|
end
|
58
79
|
end
|
data/lib/aranha/default_processor.rb
CHANGED
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
module Aranha
|
4
4
|
class DefaultProcessor
|
5
|
-
attr_reader :source_uri
|
5
|
+
attr_reader :source_uri, :extra_data
|
6
6
|
|
7
|
-
def initialize(source_uri)
|
7
|
+
def initialize(source_uri, extra_data)
|
8
8
|
unless source_uri.is_a?(Addressable::URI)
|
9
9
|
source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
|
10
10
|
end
|
11
11
|
@source_uri = Addressable::URI.parse(source_uri)
|
12
|
+
@extra_data = extra_data
|
12
13
|
end
|
13
14
|
|
14
15
|
def process
|
data/lib/aranha/parsers/base.rb
CHANGED
data/lib/aranha/processor.rb
CHANGED
@@ -5,10 +5,19 @@ require_dependency 'aranha/parsers/invalid_state_exception'
|
|
5
5
|
|
6
6
|
module Aranha
|
7
7
|
class Processor
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
9
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
10
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
11
|
+
HTTPCLIENT_EXCEPTIONS = [
|
12
|
+
::HTTPClient::BadResponseError,
|
13
|
+
::HTTPClient::ConnectTimeoutError,
|
14
|
+
::HTTPClient::ReceiveTimeoutError
|
15
|
+
].freeze
|
16
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
17
|
+
|
18
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
19
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
20
|
+
|
12
21
|
DEFAULT_MAX_TRIES = 3
|
13
22
|
|
14
23
|
def initialize
|
@@ -48,8 +57,8 @@ module Aranha
|
|
48
57
|
begin
|
49
58
|
address.process
|
50
59
|
@failed.delete(address.id)
|
51
|
-
rescue StandardError =>
|
52
|
-
process_exception(address,
|
60
|
+
rescue StandardError => e
|
61
|
+
process_exception(address, e)
|
53
62
|
end
|
54
63
|
end
|
55
64
|
|
@@ -70,7 +79,8 @@ module Aranha
|
|
70
79
|
end
|
71
80
|
|
72
81
|
def network_exception?(exception)
|
73
|
-
NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
82
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
83
|
+
exception.cause.present? ? network_exception?(exception.cause) : false
|
74
84
|
end
|
75
85
|
|
76
86
|
def not_try_ids
|
@@ -85,8 +95,8 @@ module Aranha
|
|
85
95
|
@max_tries ||= begin
|
86
96
|
r = Integer(ENV['ARANHA_MAX_TRIES'])
|
87
97
|
r <= 0 ? 0 : r
|
88
|
-
|
89
|
-
|
98
|
+
rescue ArgumentError, TypeError
|
99
|
+
DEFAULT_MAX_TRIES
|
90
100
|
end
|
91
101
|
end
|
92
102
|
end
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02
|
11
|
+
date: 2019-05-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -99,6 +99,7 @@ files:
|
|
99
99
|
- app/views/layouts/aranha/application.html.erb
|
100
100
|
- config/routes.rb
|
101
101
|
- db/migrate/20171201021251_create_aranha_addresses.rb
|
102
|
+
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
102
103
|
- lib/aranha.rb
|
103
104
|
- lib/aranha/default_processor.rb
|
104
105
|
- lib/aranha/dom_elements_traverser.rb
|