aranha 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffc06565f11caae4796f17fde34ca461a30bda384b278205cbe516a870944e3a
4
- data.tar.gz: 1d68d7b086aafe552001dfa0441973e5c40347d542f27f4798547ea0f316c84b
3
+ metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
4
+ data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
5
5
  SHA512:
6
- metadata.gz: 67b8db80ac9a675177babac7cebdf934383371264b4e398c4ae9c69932f5e3a0796c56aeccbccab6fdde039223b5bc8b1209b6fb6fcb18be9235239fcabb6a0d
7
- data.tar.gz: 7a8f00e58e0282a3c899cce7789f3b45a565c199c5c182d6592094c83579c6b17f4136d8fa9de51f4a5f571091591de3b5b34d2eb4405b7259c595330ef91f3d
6
+ metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
7
+ data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
@@ -17,9 +17,10 @@ module Aranha
17
17
  end
18
18
  end
19
19
 
20
- def add(url, processor)
20
+ def add(url, processor, extra_data = nil)
21
21
  a = find_or_initialize_by(url: url)
22
22
  a.processor = processor
23
+ a.extra_data = extra_data.to_yaml
23
24
  a.save!
24
25
  end
25
26
 
@@ -49,10 +50,30 @@ module Aranha
49
50
 
50
51
  def process
51
52
  ActiveRecord::Base.transaction do
52
- processor.constantize.new(url).process
53
+ instanciate_processor.process
53
54
  self.processed_at = Time.zone.now
54
55
  save!
55
56
  end
56
57
  end
58
+
59
+ private
60
+
61
+ def instanciate_processor
62
+ if processor_instancier_arity == 2 || processor_instancier_arity < 0
63
+ processor_instancier.call(url, YAML.load(extra_data))
64
+ elsif processor_instancier_arity == 1
65
+ processor_instancier.call(url)
66
+ else
67
+ raise("#{processor}.initialize should has 1 or 2 or * arguments")
68
+ end
69
+ end
70
+
71
+ def processor_instancier
72
+ processor.constantize.method(:new)
73
+ end
74
+
75
+ def processor_instancier_arity
76
+ processor.constantize.instance_method(:initialize).arity
77
+ end
57
78
  end
58
79
  end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ class AddExtraDataToAranhaAddresses < ActiveRecord::Migration
3
+ def change
4
+ add_column :aranha_addresses, :extra_data, :text
5
+ end
6
+ end
@@ -2,13 +2,14 @@
2
2
 
3
3
  module Aranha
4
4
  class DefaultProcessor
5
- attr_reader :source_uri
5
+ attr_reader :source_uri, :extra_data
6
6
 
7
- def initialize(source_uri)
7
+ def initialize(source_uri, extra_data)
8
8
  unless source_uri.is_a?(Addressable::URI)
9
9
  source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
10
10
  end
11
11
  @source_uri = Addressable::URI.parse(source_uri)
12
+ @extra_data = extra_data
12
13
  end
13
14
 
14
15
  def process
@@ -33,7 +33,7 @@ module Aranha
33
33
  end
34
34
 
35
35
  def content_file
36
- ::File.open(@url.gsub(%r{\Afile://}, ''), &:read)
36
+ ::File.open(@url.to_s.gsub(%r{\Afile://}, ''), &:read)
37
37
  end
38
38
 
39
39
  def content_get
@@ -5,10 +5,19 @@ require_dependency 'aranha/parsers/invalid_state_exception'
5
5
 
6
6
  module Aranha
7
7
  class Processor
8
- NETWORK_EXCEPTIONS = [::HTTPClient::BadResponseError, Errno::ECONNRESET,
9
- ::Net::HTTPFatalError, HTTPClient::ConnectTimeoutError,
10
- ::HTTPClient::ReceiveTimeoutError,
11
- ::Aranha::Parsers::InvalidStateException].freeze
8
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
9
+ CORE_EXCEPTIONS = [::SocketError].freeze
10
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
11
+ HTTPCLIENT_EXCEPTIONS = [
12
+ ::HTTPClient::BadResponseError,
13
+ ::HTTPClient::ConnectTimeoutError,
14
+ ::HTTPClient::ReceiveTimeoutError
15
+ ].freeze
16
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
17
+
18
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
19
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
20
+
12
21
  DEFAULT_MAX_TRIES = 3
13
22
 
14
23
  def initialize
@@ -48,8 +57,8 @@ module Aranha
48
57
  begin
49
58
  address.process
50
59
  @failed.delete(address.id)
51
- rescue StandardError => ex
52
- process_exception(address, ex)
60
+ rescue StandardError => e
61
+ process_exception(address, e)
53
62
  end
54
63
  end
55
64
 
@@ -70,7 +79,8 @@ module Aranha
70
79
  end
71
80
 
72
81
  def network_exception?(exception)
73
- NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
82
+ return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
83
+ exception.cause.present? ? network_exception?(exception.cause) : false
74
84
  end
75
85
 
76
86
  def not_try_ids
@@ -85,8 +95,8 @@ module Aranha
85
95
  @max_tries ||= begin
86
96
  r = Integer(ENV['ARANHA_MAX_TRIES'])
87
97
  r <= 0 ? 0 : r
88
- rescue ArgumentError, TypeError
89
- DEFAULT_MAX_TRIES
98
+ rescue ArgumentError, TypeError
99
+ DEFAULT_MAX_TRIES
90
100
  end
91
101
  end
92
102
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.2.1'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-14 00:00:00.000000000 Z
11
+ date: 2019-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -99,6 +99,7 @@ files:
99
99
  - app/views/layouts/aranha/application.html.erb
100
100
  - config/routes.rb
101
101
  - db/migrate/20171201021251_create_aranha_addresses.rb
102
+ - db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
102
103
  - lib/aranha.rb
103
104
  - lib/aranha/default_processor.rb
104
105
  - lib/aranha/dom_elements_traverser.rb