aranha 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffc06565f11caae4796f17fde34ca461a30bda384b278205cbe516a870944e3a
4
- data.tar.gz: 1d68d7b086aafe552001dfa0441973e5c40347d542f27f4798547ea0f316c84b
3
+ metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
4
+ data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
5
5
  SHA512:
6
- metadata.gz: 67b8db80ac9a675177babac7cebdf934383371264b4e398c4ae9c69932f5e3a0796c56aeccbccab6fdde039223b5bc8b1209b6fb6fcb18be9235239fcabb6a0d
7
- data.tar.gz: 7a8f00e58e0282a3c899cce7789f3b45a565c199c5c182d6592094c83579c6b17f4136d8fa9de51f4a5f571091591de3b5b34d2eb4405b7259c595330ef91f3d
6
+ metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
7
+ data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
@@ -17,9 +17,10 @@ module Aranha
17
17
  end
18
18
  end
19
19
 
20
- def add(url, processor)
20
+ def add(url, processor, extra_data = nil)
21
21
  a = find_or_initialize_by(url: url)
22
22
  a.processor = processor
23
+ a.extra_data = extra_data.to_yaml
23
24
  a.save!
24
25
  end
25
26
 
@@ -49,10 +50,30 @@ module Aranha
49
50
 
50
51
  def process
51
52
  ActiveRecord::Base.transaction do
52
- processor.constantize.new(url).process
53
+ instanciate_processor.process
53
54
  self.processed_at = Time.zone.now
54
55
  save!
55
56
  end
56
57
  end
58
+
59
+ private
60
+
61
+ def instanciate_processor
62
+ if processor_instancier_arity == 2 || processor_instancier_arity < 0
63
+ processor_instancier.call(url, YAML.load(extra_data))
64
+ elsif processor_instancier_arity == 1
65
+ processor_instancier.call(url)
66
+ else
67
+ raise("#{processor}.initialize should has 1 or 2 or * arguments")
68
+ end
69
+ end
70
+
71
+ def processor_instancier
72
+ processor.constantize.method(:new)
73
+ end
74
+
75
+ def processor_instancier_arity
76
+ processor.constantize.instance_method(:initialize).arity
77
+ end
57
78
  end
58
79
  end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ class AddExtraDataToAranhaAddresses < ActiveRecord::Migration
3
+ def change
4
+ add_column :aranha_addresses, :extra_data, :text
5
+ end
6
+ end
@@ -2,13 +2,14 @@
2
2
 
3
3
  module Aranha
4
4
  class DefaultProcessor
5
- attr_reader :source_uri
5
+ attr_reader :source_uri, :extra_data
6
6
 
7
- def initialize(source_uri)
7
+ def initialize(source_uri, extra_data)
8
8
  unless source_uri.is_a?(Addressable::URI)
9
9
  source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
10
10
  end
11
11
  @source_uri = Addressable::URI.parse(source_uri)
12
+ @extra_data = extra_data
12
13
  end
13
14
 
14
15
  def process
@@ -33,7 +33,7 @@ module Aranha
33
33
  end
34
34
 
35
35
  def content_file
36
- ::File.open(@url.gsub(%r{\Afile://}, ''), &:read)
36
+ ::File.open(@url.to_s.gsub(%r{\Afile://}, ''), &:read)
37
37
  end
38
38
 
39
39
  def content_get
@@ -5,10 +5,19 @@ require_dependency 'aranha/parsers/invalid_state_exception'
5
5
 
6
6
  module Aranha
7
7
  class Processor
8
- NETWORK_EXCEPTIONS = [::HTTPClient::BadResponseError, Errno::ECONNRESET,
9
- ::Net::HTTPFatalError, HTTPClient::ConnectTimeoutError,
10
- ::HTTPClient::ReceiveTimeoutError,
11
- ::Aranha::Parsers::InvalidStateException].freeze
8
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
9
+ CORE_EXCEPTIONS = [::SocketError].freeze
10
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
11
+ HTTPCLIENT_EXCEPTIONS = [
12
+ ::HTTPClient::BadResponseError,
13
+ ::HTTPClient::ConnectTimeoutError,
14
+ ::HTTPClient::ReceiveTimeoutError
15
+ ].freeze
16
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
17
+
18
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
19
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
20
+
12
21
  DEFAULT_MAX_TRIES = 3
13
22
 
14
23
  def initialize
@@ -48,8 +57,8 @@ module Aranha
48
57
  begin
49
58
  address.process
50
59
  @failed.delete(address.id)
51
- rescue StandardError => ex
52
- process_exception(address, ex)
60
+ rescue StandardError => e
61
+ process_exception(address, e)
53
62
  end
54
63
  end
55
64
 
@@ -70,7 +79,8 @@ module Aranha
70
79
  end
71
80
 
72
81
  def network_exception?(exception)
73
- NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
82
+ return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
83
+ exception.cause.present? ? network_exception?(exception.cause) : false
74
84
  end
75
85
 
76
86
  def not_try_ids
@@ -85,8 +95,8 @@ module Aranha
85
95
  @max_tries ||= begin
86
96
  r = Integer(ENV['ARANHA_MAX_TRIES'])
87
97
  r <= 0 ? 0 : r
88
- rescue ArgumentError, TypeError
89
- DEFAULT_MAX_TRIES
98
+ rescue ArgumentError, TypeError
99
+ DEFAULT_MAX_TRIES
90
100
  end
91
101
  end
92
102
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.2.1'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-14 00:00:00.000000000 Z
11
+ date: 2019-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -99,6 +99,7 @@ files:
99
99
  - app/views/layouts/aranha/application.html.erb
100
100
  - config/routes.rb
101
101
  - db/migrate/20171201021251_create_aranha_addresses.rb
102
+ - db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
102
103
  - lib/aranha.rb
103
104
  - lib/aranha/default_processor.rb
104
105
  - lib/aranha/dom_elements_traverser.rb