aranha 0.14.5 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b89c63a38f3c2f658263dde445f10b45628ecd19069d6a78a0d5e838a1a6127
4
- data.tar.gz: 15cfb49d733aba80bd316813d21c587c47c4799af6a60d9a6147bed1e08f132f
3
+ metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
4
+ data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
5
5
  SHA512:
6
- metadata.gz: fe5b484b9709033c8f87db0f8070c83d5dd328109d2d6bfb12a7ef356ba7268ec45cf728f5554dbd3015e28f60d14be2b81d6ac33f836365024fb4e88f1ed7e1
7
- data.tar.gz: 9456fafe876c0894f810575fe41f078703ce2f9b75930c967ed4ffd9d33b93eeeb80e4b899e9bc12f31612d397fb1d462ab29520e7df4f71956fb64c489628f6
6
+ metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
7
+ data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
data/README.rdoc CHANGED
@@ -1,3 +1,3 @@
1
1
  = Aranha
2
2
 
3
- Rails utilities for web crawling.
3
+ Ruby utilities for web crawling.
data/lib/aranha.rb CHANGED
@@ -1,15 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'httpclient'
4
- require 'active_support/dependencies'
5
- require 'active_scaffold'
3
+ require 'eac_ruby_utils/core_ext'
6
4
 
7
5
  module Aranha
8
- require 'aranha/default_processor'
9
- require 'aranha/dom_elements_traverser'
10
- require 'aranha/engine'
11
- require 'aranha/fixtures'
12
- require 'aranha/processor'
13
- require 'aranha/parsers'
14
- require 'aranha/selenium'
6
+ require_sub __FILE__
15
7
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ class AddressProcessor
7
+ ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
8
+ CORE_EXCEPTIONS = [::SocketError].freeze
9
+ ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
10
+ HTTPCLIENT_EXCEPTIONS = [
11
+ ::HTTPClient::BadResponseError,
12
+ ::HTTPClient::ConnectTimeoutError,
13
+ ::HTTPClient::ReceiveTimeoutError
14
+ ].freeze
15
+ NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
16
+
17
+ NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
18
+ HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
19
+
20
+ class << self
21
+ def rescuable_error?(error)
22
+ return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
23
+
24
+ error.cause.present? ? network_error?(error.cause) : false
25
+ end
26
+ end
27
+
28
+ enable_simple_cache
29
+ common_constructor :address
30
+
31
+ def successful?
32
+ error.blank?
33
+ end
34
+
35
+ def rescuable_error?
36
+ self.class.rescuable_error?(error)
37
+ end
38
+
39
+ private
40
+
41
+ def error_uncached
42
+ address.process
43
+ nil
44
+ rescue ::StandardError => e
45
+ e
46
+ end
47
+ end
48
+ end
@@ -1,11 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  class DefaultProcessor
7
- attr_reader :source_uri, :extra_data
8
-
9
8
  class << self
10
9
  def sanitize_uri(uri)
11
10
  return uri if uri.is_a?(Hash)
@@ -15,17 +14,14 @@ module Aranha
15
14
  end
16
15
  end
17
16
 
18
- def initialize(source_uri, extra_data)
19
- @source_uri = self.class.sanitize_uri(source_uri)
20
- @extra_data = extra_data
17
+ common_constructor :source_uri, :extra_data do
18
+ self.source_uri = self.class.sanitize_uri(source_uri)
21
19
  end
22
20
 
23
21
  def process
24
22
  raise 'Implement method process'
25
23
  end
26
24
 
27
- protected
28
-
29
25
  def target_uri
30
26
  source_uri
31
27
  end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/abstract_methods'
4
+
5
+ module Aranha
6
+ class Manager
7
+ include ::EacRubyUtils::AbstractMethods
8
+
9
+ class << self
10
+ attr_accessor :default
11
+ end
12
+
13
+ def addresses_count
14
+ raise_abstract_method(__method__)
15
+ end
16
+
17
+ def add_address(_uri, _processor_class, _extra_data = nil)
18
+ raise_abstract_method(__method__)
19
+ end
20
+
21
+ def add_start_point(uri, processor_class, extra_data = nil)
22
+ start_points_var << ::EacRubyUtils::Struct.new(
23
+ uri: uri, processor_class: processor_class, extra_data: extra_data
24
+ )
25
+ end
26
+
27
+ def clear_expired_addresses
28
+ raise_abstract_method(__method__)
29
+ end
30
+
31
+ def init
32
+ clear_expired_addresses
33
+ start_points_to_addresses
34
+ end
35
+
36
+ def log_info(_message)
37
+ raise_abstract_method(__method__)
38
+ end
39
+
40
+ def log_warn(_message)
41
+ raise_abstract_method(__method__)
42
+ end
43
+
44
+ def start_points
45
+ start_points_var.to_enum
46
+ end
47
+
48
+ def start_points_to_addresses
49
+ start_points_var.each do |sp|
50
+ add_address(sp.uri, sp.processor_class, sp.extra_data)
51
+ end
52
+ end
53
+
54
+ def unprocessed_addresses
55
+ raise_abstract_method(__method__)
56
+ end
57
+
58
+ private
59
+
60
+ def start_points_var
61
+ @start_points_var ||= []
62
+ end
63
+ end
64
+ end
@@ -1,30 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'net/http'
4
+ require 'httpclient'
4
5
  require 'aranha/parsers/invalid_state_exception'
6
+ require 'aranha/manager'
5
7
 
6
8
  module Aranha
7
9
  class Processor
8
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
9
- CORE_EXCEPTIONS = [::SocketError].freeze
10
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
11
- HTTPCLIENT_EXCEPTIONS = [
12
- ::HTTPClient::BadResponseError,
13
- ::HTTPClient::ConnectTimeoutError,
14
- ::HTTPClient::ReceiveTimeoutError
15
- ].freeze
16
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
17
-
18
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
19
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
20
-
21
10
  DEFAULT_MAX_TRIES = 3
22
11
 
23
- def initialize
24
- ::Aranha::Address.clear_expired
25
- ::Aranha::Address.add_start_points
12
+ attr_reader :manager
13
+
14
+ def initialize(manager = nil)
15
+ @manager = manager || ::Aranha::Manager.default
26
16
  @failed = {}
27
17
  @try = 0
18
+ self.manager.init
28
19
  process_loop
29
20
  raise "Addresses failed: #{@failed.count}" if @failed.any?
30
21
  end
@@ -32,7 +23,7 @@ module Aranha
32
23
  private
33
24
 
34
25
  def process_loop
35
- Rails.logger.info("Max tries: #{max_tries_s}")
26
+ manager.log_info("Max tries: #{max_tries_s}")
36
27
  loop do
37
28
  break if process_next_address
38
29
  end
@@ -52,22 +43,22 @@ module Aranha
52
43
  end
53
44
 
54
45
  def process_address(address)
55
- Rails.logger.info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
56
- " Unprocessed: #{unprocessed.count}/#{Aranha::Address.count})")
57
- begin
58
- address.process
59
- @failed.delete(address.id)
60
- rescue StandardError => e
61
- process_exception(address, e)
46
+ manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
47
+ " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
62
53
  end
63
54
  end
64
55
 
65
- def process_exception(address, exception)
66
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
67
58
 
68
- @failed[address.id] ||= 0
69
- @failed[address.id] += 1
70
- Rails.logger.warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
71
62
  end
72
63
 
73
64
  def next_address
@@ -75,13 +66,7 @@ module Aranha
75
66
  end
76
67
 
77
68
  def unprocessed
78
- ::Aranha::Address.unprocessed
79
- end
80
-
81
- def network_exception?(exception)
82
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
83
-
84
- exception.cause.present? ? network_exception?(exception.cause) : false
69
+ ::Aranha::Manager.default.unprocessed_addresses
85
70
  end
86
71
 
87
72
  def not_try_ids
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.14.5'
4
+ VERSION = '0.16.0'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.5
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-14 00:00:00.000000000 Z
11
+ date: 2021-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: active_scaffold
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: 3.4.41.1
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: 3.4.41.1
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: aranha-parsers
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -58,34 +44,20 @@ dependencies:
58
44
  - - ">="
59
45
  - !ruby/object:Gem::Version
60
46
  version: 0.1.2
61
- - !ruby/object:Gem::Dependency
62
- name: eac_rails_utils
63
- requirement: !ruby/object:Gem::Requirement
64
- requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '0.11'
68
- type: :runtime
69
- prerelease: false
70
- version_requirements: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '0.11'
75
47
  - !ruby/object:Gem::Dependency
76
48
  name: eac_ruby_utils
77
49
  requirement: !ruby/object:Gem::Requirement
78
50
  requirements:
79
51
  - - "~>"
80
52
  - !ruby/object:Gem::Version
81
- version: '0.35'
53
+ version: '0.52'
82
54
  type: :runtime
83
55
  prerelease: false
84
56
  version_requirements: !ruby/object:Gem::Requirement
85
57
  requirements:
86
58
  - - "~>"
87
59
  - !ruby/object:Gem::Version
88
- version: '0.35'
60
+ version: '0.52'
89
61
  - !ruby/object:Gem::Dependency
90
62
  name: httpclient
91
63
  requirement: !ruby/object:Gem::Requirement
@@ -100,48 +72,20 @@ dependencies:
100
72
  - - ">="
101
73
  - !ruby/object:Gem::Version
102
74
  version: '2.6'
103
- - !ruby/object:Gem::Dependency
104
- name: rails
105
- requirement: !ruby/object:Gem::Requirement
106
- requirements:
107
- - - ">="
108
- - !ruby/object:Gem::Version
109
- version: 4.2.11.3
110
- type: :runtime
111
- prerelease: false
112
- version_requirements: !ruby/object:Gem::Requirement
113
- requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 4.2.11.3
117
75
  - !ruby/object:Gem::Dependency
118
76
  name: eac_ruby_gem_support
119
77
  requirement: !ruby/object:Gem::Requirement
120
78
  requirements:
121
79
  - - "~>"
122
80
  - !ruby/object:Gem::Version
123
- version: '0.1'
81
+ version: '0.2'
124
82
  type: :development
125
83
  prerelease: false
126
84
  version_requirements: !ruby/object:Gem::Requirement
127
85
  requirements:
128
86
  - - "~>"
129
87
  - !ruby/object:Gem::Version
130
- version: '0.1'
131
- - !ruby/object:Gem::Dependency
132
- name: sqlite3
133
- requirement: !ruby/object:Gem::Requirement
134
- requirements:
135
- - - ">="
136
- - !ruby/object:Gem::Version
137
- version: '0'
138
- type: :development
139
- prerelease: false
140
- version_requirements: !ruby/object:Gem::Requirement
141
- requirements:
142
- - - ">="
143
- - !ruby/object:Gem::Version
144
- version: '0'
88
+ version: '0.2'
145
89
  description:
146
90
  email:
147
91
  - eduardobogoni@gmail.com
@@ -151,29 +95,12 @@ extra_rdoc_files: []
151
95
  files:
152
96
  - MIT-LICENSE
153
97
  - README.rdoc
154
- - app/assets/javascripts/aranha/application.js
155
- - app/assets/stylesheets/aranha/application.css
156
- - app/controllers/aranha/addresses_controller.rb
157
- - app/helpers/aranha/application_helper.rb
158
- - app/models/aranha/address.rb
159
- - app/views/layouts/aranha/application.html.erb
160
- - config/locales/en.yml
161
- - config/locales/pt-BR.yml
162
- - config/routes.rb
163
- - db/migrate/20171201021251_create_aranha_addresses.rb
164
- - db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
165
98
  - lib/aranha.rb
99
+ - lib/aranha/address_processor.rb
166
100
  - lib/aranha/default_processor.rb
167
- - lib/aranha/dom_elements_traverser.rb
168
- - lib/aranha/dom_elements_traverser/conditions.rb
169
- - lib/aranha/dom_elements_traverser/cursor.rb
170
- - lib/aranha/dom_elements_traverser/data.rb
171
- - lib/aranha/engine.rb
172
- - lib/aranha/fixtures.rb
173
- - lib/aranha/fixtures/download.rb
101
+ - lib/aranha/manager.rb
174
102
  - lib/aranha/processor.rb
175
103
  - lib/aranha/version.rb
176
- - lib/tasks/aranha_tasks.rake
177
104
  homepage:
178
105
  licenses:
179
106
  - MIT
@@ -193,8 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
120
  - !ruby/object:Gem::Version
194
121
  version: '0'
195
122
  requirements: []
196
- rubygems_version: 3.0.8
123
+ rubygems_version: 3.1.6
197
124
  signing_key:
198
125
  specification_version: 4
199
- summary: Rails utilities for web crawling.
126
+ summary: Ruby utilities for web crawling.
200
127
  test_files: []
@@ -1,14 +0,0 @@
1
- // This is a manifest file that'll be compiled into application.js, which will include all the files
2
- // listed below.
3
- //
4
- // Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
5
- // or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
6
- //
7
- // It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
8
- // compiled file.
9
- //
10
- // Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
11
- // about supported directives.
12
- //
13
- //= require_tree .
14
- //= require active_scaffold
@@ -1,16 +0,0 @@
1
- /*
2
- * This is a manifest file that'll be compiled into application.css, which will include all the files
3
- * listed below.
4
- *
5
- * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
6
- * or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
7
- *
8
- * You're free to add application-wide styles to this file and they'll appear at the bottom of the
9
- * compiled file so the styles you add here take precedence over styles defined in any styles
10
- * defined in the other CSS/SCSS files in this directory. It is generally better to create a new
11
- * file per style scope.
12
- *
13
- *= require_tree .
14
- *= require_self
15
- *= require active_scaffold
16
- */
@@ -1,8 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class AddressesController < ::ApplicationController
5
- active_scaffold :'aranha/address' do |_conf|
6
- end
7
- end
8
- end
@@ -1,6 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module ApplicationHelper
5
- end
6
- end
@@ -1,98 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'eac_ruby_utils/yaml'
4
-
5
- module Aranha
6
- class Address < ActiveRecord::Base
7
- include ::EacRailsUtils::Models::InequalityQueries
8
-
9
- add_inequality_queries(:created_at)
10
-
11
- class << self
12
- def set_start_point(url, processor)
13
- start_points[url] = processor
14
- end
15
-
16
- def add_start_points
17
- ::Rails.logger.info("Start points: #{start_points.count}")
18
- start_points.each do |url, processor|
19
- add(url, processor)
20
- end
21
- end
22
-
23
- def add(url, processor, extra_data = nil)
24
- a = find_or_initialize_by(url: sanitize_url(url))
25
- a.processor = processor
26
- a.extra_data = extra_data.to_yaml
27
- a.save!
28
- end
29
-
30
- def clear_expired
31
- q = by_created_at_lt(Time.zone.now - 12.hours)
32
- Rails.logger.info("Addresses expired: #{q.count}")
33
- q.destroy_all
34
- end
35
-
36
- private
37
-
38
- def sanitize_url(url)
39
- if url.is_a?(Hash)
40
- url.to_yaml
41
- else
42
- url.to_s
43
- end
44
- end
45
-
46
- def start_points
47
- @start_points ||= {}
48
- end
49
- end
50
-
51
- validates :url, presence: true, uniqueness: true
52
- validates :processor, presence: true
53
-
54
- scope :unprocessed, lambda {
55
- where(processed_at: nil)
56
- }
57
-
58
- def to_s
59
- "#{processor}|#{url}"
60
- end
61
-
62
- def process
63
- ActiveRecord::Base.transaction do
64
- instanciate_processor.process
65
- self.processed_at = Time.zone.now
66
- save!
67
- end
68
- end
69
-
70
- private
71
-
72
- def instanciate_processor
73
- processor_instancier.call(*processor_instancier_arguments)
74
- end
75
-
76
- def url_to_process
77
- ::EacRubyUtils::Yaml.load(url)
78
- end
79
-
80
- def processor_instancier
81
- processor.constantize.method(:new)
82
- end
83
-
84
- def processor_instancier_arguments
85
- if processor_instancier_arity == 2 || processor_instancier_arity.negative?
86
- [url_to_process, EacRubyUtils::Yaml.load(extra_data)]
87
- elsif processor_instancier_arity == 1
88
- [processor_instancier.call(url_to_process)]
89
- else
90
- raise("#{processor}.initialize should has 1 or 2 or * arguments")
91
- end
92
- end
93
-
94
- def processor_instancier_arity
95
- processor.constantize.instance_method(:initialize).arity
96
- end
97
- end
98
- end
@@ -1,12 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <title>Aranha</title>
5
- <%= stylesheet_link_tag "aranha/application", media: "all" %>
6
- <%= javascript_include_tag "aranha/application" %>
7
- <%= csrf_meta_tags %>
8
- </head>
9
- <body>
10
- <%= yield %>
11
- </body>
12
- </html>
@@ -1,6 +0,0 @@
1
- en:
2
- activerecord:
3
- models:
4
- aranha/address:
5
- one: Aranha address
6
- other: Aranha addresses
@@ -1,6 +0,0 @@
1
- pt-BR:
2
- activerecord:
3
- models:
4
- aranha/address:
5
- one: Endereço Aranha
6
- other: Endereços Aranha
data/config/routes.rb DELETED
@@ -1,6 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- Aranha::Engine.routes.draw do
4
- concern :active_scaffold, ActiveScaffold::Routing::Basic.new(association: true)
5
- resources(:addresses, concerns: :active_scaffold)
6
- end
@@ -1,15 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class CreateAranhaAddresses < (
4
- Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
5
- )
6
- def change
7
- create_table :aranha_addresses do |t|
8
- t.string :url
9
- t.string :processor
10
- t.timestamp :processed_at
11
-
12
- t.timestamps null: false
13
- end
14
- end
15
- end
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class AddExtraDataToAranhaAddresses < (
4
- Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
5
- )
6
- def change
7
- add_column :aranha_addresses, :extra_data, :text
8
- end
9
- end
@@ -1,44 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/dom_elements_traverser/conditions'
4
- require 'aranha/dom_elements_traverser/data'
5
- require 'aranha/dom_elements_traverser/cursor'
6
-
7
- module Aranha
8
- class DomElementsTraverser
9
- include ::Aranha::DomElementsTraverser::Conditions
10
- include ::Aranha::DomElementsTraverser::Cursor
11
- include ::Aranha::DomElementsTraverser::Data
12
-
13
- class << self
14
- def traverse(options, &block)
15
- new(elements_from_options(options), &block)
16
- end
17
-
18
- def empty
19
- new([])
20
- end
21
-
22
- private
23
-
24
- def elements_from_options(options)
25
- options = ::EacRubyUtils::OptionsConsumer.new(options)
26
- elements = nil
27
- options.consume(:children_of) { |v| elements = v.children.to_a }
28
- raise 'None option of [:children_of] defined' unless elements
29
-
30
- options.validate
31
- elements
32
- end
33
- end
34
-
35
- private
36
-
37
- def initialize(elements, &block)
38
- @elements = elements
39
- @index = 0
40
- @data = {}
41
- instance_eval(&block) if block
42
- end
43
- end
44
- end
@@ -1,32 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class DomElementsTraverser
5
- module Conditions
6
- private
7
-
8
- def match_conditions?(conditions)
9
- raise "No element (Conditions: #{conditions})" unless current
10
-
11
- conditions.all? { |key, value| match_condition?(key, value) }
12
- end
13
-
14
- def match_condition?(key, value)
15
- case key.to_sym
16
- when :text then match_text_condition?(value)
17
- when :name then match_name_condition?(value)
18
- else raise "Unknown key condition: (#{key})"
19
- end
20
- end
21
-
22
- def match_name_condition?(tag_name)
23
- current.name.casecmp(tag_name.to_s).zero?
24
- end
25
-
26
- def match_text_condition?(texts)
27
- texts = [texts.to_s] unless texts.is_a?(Array)
28
- texts.all? { |t| current.text.downcase.include?(t.downcase) }
29
- end
30
- end
31
- end
32
- end
@@ -1,48 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'eac_ruby_utils/options_consumer'
4
-
5
- module Aranha
6
- class DomElementsTraverser
7
- module Cursor
8
- private
9
-
10
- def current
11
- @elements[@index]
12
- end
13
-
14
- def skip
15
- @index += 1
16
- end
17
-
18
- def skip_until(options)
19
- oc = ::EacRubyUtils::OptionsConsumer.new(options)
20
- optional = oc.consume(:optional, false)
21
- while current
22
- break if match_conditions?(oc.left_data)
23
-
24
- skip
25
- end
26
- raise "No element found for conditions #{oc.left_data}" unless current || optional
27
-
28
- current
29
- end
30
-
31
- def skip_until_after(conditions)
32
- skip_until(conditions)
33
- skip
34
- current
35
- end
36
-
37
- def if_found(conditions, &block)
38
- marked = @index
39
- skip_until({ optional: true }.merge(conditions))
40
- if current
41
- instance_eval(&block) if block
42
- else
43
- @index = marked
44
- end
45
- end
46
- end
47
- end
48
- end
@@ -1,39 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class DomElementsTraverser
5
- module Data
6
- def data
7
- @data.dup
8
- end
9
-
10
- private
11
-
12
- def store(key, options = {}, &converter)
13
- validate(options)
14
- value = store_value(options, converter)
15
- @data[key] = value
16
- r = current
17
- skip
18
- r
19
- end
20
-
21
- def store_value(options, converter)
22
- value = if options.key?(:attribute)
23
- current.attribute(options[:attribute]).value
24
- else
25
- current.text.strip
26
- end
27
- converter ? converter.call(value) : value
28
- end
29
-
30
- def validate(options)
31
- return unless options.key?(:validate)
32
- return if match_conditions?(options[:validate])
33
-
34
- raise "Element does not match conditions #{options[:validate]}" \
35
- " (Element: |#{current}|#{current.name}|)"
36
- end
37
- end
38
- end
39
- end
data/lib/aranha/engine.rb DELETED
@@ -1,13 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class Engine < ::Rails::Engine
5
- isolate_namespace Aranha
6
-
7
- initializer :append_migrations do |app|
8
- config.paths['db/migrate'].expanded.each do |expanded_path|
9
- app.config.paths['db/migrate'] << expanded_path
10
- end
11
- end
12
- end
13
- end
@@ -1,7 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module Fixtures
5
- require 'aranha/fixtures/download'
6
- end
7
- end
@@ -1,72 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/base'
4
- require 'aranha/parsers/source_address'
5
- require 'aranha/parsers/source_target_fixtures'
6
-
7
- module Aranha
8
- module Fixtures
9
- class Download
10
- attr_reader :pending
11
-
12
- def initialize(options)
13
- @prefix = options.fetch(:prefix)
14
- @prefix = '' if @prefix.blank?
15
- @download = options.fetch(:download)
16
- @pending = options.fetch(:pending)
17
- end
18
-
19
- def run
20
- url_files.each do |f|
21
- Rails.logger.info(relative_path(f))
22
- download(url(f), target(f)) if @download
23
- end
24
- end
25
-
26
- private
27
-
28
- def url_files
29
- Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
30
- end
31
-
32
- def select_path?(path)
33
- return false unless match_prefix_pattern(path)
34
-
35
- !pending || !source_exist?(path)
36
- end
37
-
38
- def match_prefix_pattern(path)
39
- relative_path(path).start_with?(@prefix)
40
- end
41
-
42
- def fixtures_root
43
- Rails.root.to_s
44
- end
45
-
46
- def download(url, target)
47
- Rails.logger.info "Baixando \"#{url}\"..."
48
- content = ::Aranha::Parsers::Base.new(url).content
49
- raise "Content is blank for \"#{url}\"" if content.blank?
50
-
51
- File.open(target, 'wb') { |file| file.write(content) }
52
- end
53
-
54
- def url(file)
55
- ::Aranha::Parsers::SourceAddress.from_file(file)
56
- end
57
-
58
- def target(file)
59
- File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
60
- end
61
-
62
- def relative_path(path)
63
- path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
64
- end
65
-
66
- def source_exist?(path)
67
- stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
68
- stf.source_file(::File.basename(path, '.url')).present?
69
- end
70
- end
71
- end
72
- end
@@ -1,22 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- namespace(:aranha) do
4
- task process: :environment do
5
- ::Aranha::Processor.new
6
- end
7
-
8
- task clear: :environment do
9
- Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
10
- end
11
-
12
- namespace :fixtures do
13
- desc 'Download remote content for fixtures.'
14
- task download: :environment do
15
- ::Aranha::Fixtures::Download.new(
16
- prefix: ENV['PREFIX'],
17
- download: ENV['DOWNLOAD'].present?,
18
- pending: ENV['PENDING'].present?
19
- ).run
20
- end
21
- end
22
- end