aranha 0.14.5 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b89c63a38f3c2f658263dde445f10b45628ecd19069d6a78a0d5e838a1a6127
4
- data.tar.gz: 15cfb49d733aba80bd316813d21c587c47c4799af6a60d9a6147bed1e08f132f
3
+ metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
4
+ data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
5
5
  SHA512:
6
- metadata.gz: fe5b484b9709033c8f87db0f8070c83d5dd328109d2d6bfb12a7ef356ba7268ec45cf728f5554dbd3015e28f60d14be2b81d6ac33f836365024fb4e88f1ed7e1
7
- data.tar.gz: 9456fafe876c0894f810575fe41f078703ce2f9b75930c967ed4ffd9d33b93eeeb80e4b899e9bc12f31612d397fb1d462ab29520e7df4f71956fb64c489628f6
6
+ metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
7
+ data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
data/README.rdoc CHANGED
@@ -1,3 +1,3 @@
1
1
  = Aranha
2
2
 
3
- Rails utilities for web crawling.
3
+ Ruby utilities for web crawling.
data/lib/aranha.rb CHANGED
@@ -1,15 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'httpclient'
4
- require 'active_support/dependencies'
5
- require 'active_scaffold'
3
+ require 'eac_ruby_utils/core_ext'
6
4
 
7
5
  module Aranha
8
- require 'aranha/default_processor'
9
- require 'aranha/dom_elements_traverser'
10
- require 'aranha/engine'
11
- require 'aranha/fixtures'
12
- require 'aranha/processor'
13
- require 'aranha/parsers'
14
- require 'aranha/selenium'
6
+ require_sub __FILE__
15
7
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
module Aranha
  # Runs +address.process+ and captures any StandardError it raises, so the
  # caller can query the outcome (+successful?+, +rescuable_error?+) instead of
  # rescuing exceptions itself.
  class AddressProcessor
    ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
    CORE_EXCEPTIONS = [::SocketError].freeze
    ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
    HTTPCLIENT_EXCEPTIONS = [
      ::HTTPClient::BadResponseError,
      ::HTTPClient::ConnectTimeoutError,
      ::HTTPClient::ReceiveTimeoutError
    ].freeze
    # NOTE(review): Net::HTTPServerException is reported as a deprecated alias
    # in newer Rubies - confirm against the Ruby versions this gem supports.
    NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze

    # Union of all exception classes that count as "rescuable".
    NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
                         HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS

    class << self
      # Tells whether +error+, or any exception in its +cause+ chain, is an
      # instance of one of NETWORK_EXCEPTIONS.
      #
      # @param error [Exception, nil] the error to classify
      # @return [Boolean] false for +nil+ or when nothing in the chain matches
      def rescuable_error?(error)
        # A nil error also terminates the walk along the cause chain.
        return false if error.nil?
        return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }

        # Recurse on this same predicate; the previous code called an
        # undefined +network_error?+ here and raised NoMethodError.
        rescuable_error?(error.cause)
      end
    end

    # eac_ruby_utils macros. Presumably +enable_simple_cache+ exposes a
    # memoized +error+ reader backed by +error_uncached+, and
    # +common_constructor+ defines +initialize(address)+ plus an +address+
    # accessor - verify against eac_ruby_utils.
    enable_simple_cache
    common_constructor :address

    # @return [Boolean] true when processing the address raised no error
    def successful?
      error.blank?
    end

    # @return [Boolean] whether the captured error is classified as rescuable
    #   by the class-level predicate
    def rescuable_error?
      self.class.rescuable_error?(error)
    end

    private

    # Processes the address and converts a raised StandardError into a value.
    #
    # @return [StandardError, nil] the raised error, or nil when processing
    #   completed without raising
    def error_uncached
      address.process
      nil
    rescue ::StandardError => e
      e
    end
  end
end
@@ -1,11 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  class DefaultProcessor
7
- attr_reader :source_uri, :extra_data
8
-
9
8
  class << self
10
9
  def sanitize_uri(uri)
11
10
  return uri if uri.is_a?(Hash)
@@ -15,17 +14,14 @@ module Aranha
15
14
  end
16
15
  end
17
16
 
18
- def initialize(source_uri, extra_data)
19
- @source_uri = self.class.sanitize_uri(source_uri)
20
- @extra_data = extra_data
17
+ common_constructor :source_uri, :extra_data do
18
+ self.source_uri = self.class.sanitize_uri(source_uri)
21
19
  end
22
20
 
23
21
  def process
24
22
  raise 'Implement method process'
25
23
  end
26
24
 
27
- protected
28
-
29
25
  def target_uri
30
26
  source_uri
31
27
  end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/abstract_methods'
4
+
5
module Aranha
  # Abstract base for the object that stores crawl addresses and provides
  # logging. Subclasses implement the methods that call
  # +raise_abstract_method+; start points are kept in memory here.
  class Manager
    include ::EacRubyUtils::AbstractMethods

    class << self
      # Manager instance used when no explicit manager is supplied.
      attr_accessor :default
    end

    # Abstract: total number of known addresses.
    def addresses_count
      raise_abstract_method(:addresses_count)
    end

    # Abstract: registers an address to be processed by +_processor_class+.
    def add_address(_uri, _processor_class, _extra_data = nil)
      raise_abstract_method(:add_address)
    end

    # Records a start point; it becomes an address when
    # +start_points_to_addresses+ runs.
    def add_start_point(uri, processor_class, extra_data = nil)
      start_point = ::EacRubyUtils::Struct.new(
        uri: uri, processor_class: processor_class, extra_data: extra_data
      )
      start_points_var << start_point
    end

    # Abstract: removes addresses considered expired.
    def clear_expired_addresses
      raise_abstract_method(:clear_expired_addresses)
    end

    # Prepares a run: clears expired addresses, then registers every recorded
    # start point as an address.
    def init
      clear_expired_addresses
      start_points_to_addresses
    end

    # Abstract: logs an informational message.
    def log_info(_message)
      raise_abstract_method(:log_info)
    end

    # Abstract: logs a warning message.
    def log_warn(_message)
      raise_abstract_method(:log_warn)
    end

    # Enumerator over the recorded start points.
    def start_points
      start_points_var.each
    end

    # Calls +add_address+ for each recorded start point.
    def start_points_to_addresses
      start_points_var.each do |start_point|
        add_address(start_point.uri, start_point.processor_class, start_point.extra_data)
      end
    end

    # Abstract: addresses that have not been processed yet.
    def unprocessed_addresses
      raise_abstract_method(:unprocessed_addresses)
    end

    private

    # Lazily created in-memory list of start points.
    def start_points_var
      @start_points_var ||= []
    end
  end
end
@@ -1,30 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'net/http'
4
+ require 'httpclient'
4
5
  require 'aranha/parsers/invalid_state_exception'
6
+ require 'aranha/manager'
5
7
 
6
8
  module Aranha
7
9
  class Processor
8
- ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
9
- CORE_EXCEPTIONS = [::SocketError].freeze
10
- ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
11
- HTTPCLIENT_EXCEPTIONS = [
12
- ::HTTPClient::BadResponseError,
13
- ::HTTPClient::ConnectTimeoutError,
14
- ::HTTPClient::ReceiveTimeoutError
15
- ].freeze
16
- NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
17
-
18
- NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
19
- HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
20
-
21
10
  DEFAULT_MAX_TRIES = 3
22
11
 
23
- def initialize
24
- ::Aranha::Address.clear_expired
25
- ::Aranha::Address.add_start_points
12
+ attr_reader :manager
13
+
14
+ def initialize(manager = nil)
15
+ @manager = manager || ::Aranha::Manager.default
26
16
  @failed = {}
27
17
  @try = 0
18
+ self.manager.init
28
19
  process_loop
29
20
  raise "Addresses failed: #{@failed.count}" if @failed.any?
30
21
  end
@@ -32,7 +23,7 @@ module Aranha
32
23
  private
33
24
 
34
25
  def process_loop
35
- Rails.logger.info("Max tries: #{max_tries_s}")
26
+ manager.log_info("Max tries: #{max_tries_s}")
36
27
  loop do
37
28
  break if process_next_address
38
29
  end
@@ -52,22 +43,22 @@ module Aranha
52
43
  end
53
44
 
54
45
  def process_address(address)
55
- Rails.logger.info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
56
- " Unprocessed: #{unprocessed.count}/#{Aranha::Address.count})")
57
- begin
58
- address.process
59
- @failed.delete(address.id)
60
- rescue StandardError => e
61
- process_exception(address, e)
46
+ manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
47
+ " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
48
+ ap = ::Aranha::AddressProcessor.new(address)
49
+ if ap.successful?
50
+ @failed.delete(ap.address.id)
51
+ else
52
+ process_exception(ap)
62
53
  end
63
54
  end
64
55
 
65
- def process_exception(address, exception)
66
- raise exception unless network_exception?(exception)
56
+ def process_exception(address_processor)
57
+ raise address_processor.error unless address_processor.rescuable_error?
67
58
 
68
- @failed[address.id] ||= 0
69
- @failed[address.id] += 1
70
- Rails.logger.warn(exception)
59
+ @failed[address_processor.address.id] ||= 0
60
+ @failed[address_processor.address.id] += 1
61
+ manager.log_warn(address_processor.error)
71
62
  end
72
63
 
73
64
  def next_address
@@ -75,13 +66,7 @@ module Aranha
75
66
  end
76
67
 
77
68
  def unprocessed
78
- ::Aranha::Address.unprocessed
79
- end
80
-
81
- def network_exception?(exception)
82
- return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
83
-
84
- exception.cause.present? ? network_exception?(exception.cause) : false
69
+ ::Aranha::Manager.default.unprocessed_addresses
85
70
  end
86
71
 
87
72
  def not_try_ids
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.14.5'
4
+ VERSION = '0.16.0'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.5
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-14 00:00:00.000000000 Z
11
+ date: 2021-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: active_scaffold
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: 3.4.41.1
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: 3.4.41.1
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: aranha-parsers
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -58,34 +44,20 @@ dependencies:
58
44
  - - ">="
59
45
  - !ruby/object:Gem::Version
60
46
  version: 0.1.2
61
- - !ruby/object:Gem::Dependency
62
- name: eac_rails_utils
63
- requirement: !ruby/object:Gem::Requirement
64
- requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '0.11'
68
- type: :runtime
69
- prerelease: false
70
- version_requirements: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '0.11'
75
47
  - !ruby/object:Gem::Dependency
76
48
  name: eac_ruby_utils
77
49
  requirement: !ruby/object:Gem::Requirement
78
50
  requirements:
79
51
  - - "~>"
80
52
  - !ruby/object:Gem::Version
81
- version: '0.35'
53
+ version: '0.52'
82
54
  type: :runtime
83
55
  prerelease: false
84
56
  version_requirements: !ruby/object:Gem::Requirement
85
57
  requirements:
86
58
  - - "~>"
87
59
  - !ruby/object:Gem::Version
88
- version: '0.35'
60
+ version: '0.52'
89
61
  - !ruby/object:Gem::Dependency
90
62
  name: httpclient
91
63
  requirement: !ruby/object:Gem::Requirement
@@ -100,48 +72,20 @@ dependencies:
100
72
  - - ">="
101
73
  - !ruby/object:Gem::Version
102
74
  version: '2.6'
103
- - !ruby/object:Gem::Dependency
104
- name: rails
105
- requirement: !ruby/object:Gem::Requirement
106
- requirements:
107
- - - ">="
108
- - !ruby/object:Gem::Version
109
- version: 4.2.11.3
110
- type: :runtime
111
- prerelease: false
112
- version_requirements: !ruby/object:Gem::Requirement
113
- requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 4.2.11.3
117
75
  - !ruby/object:Gem::Dependency
118
76
  name: eac_ruby_gem_support
119
77
  requirement: !ruby/object:Gem::Requirement
120
78
  requirements:
121
79
  - - "~>"
122
80
  - !ruby/object:Gem::Version
123
- version: '0.1'
81
+ version: '0.2'
124
82
  type: :development
125
83
  prerelease: false
126
84
  version_requirements: !ruby/object:Gem::Requirement
127
85
  requirements:
128
86
  - - "~>"
129
87
  - !ruby/object:Gem::Version
130
- version: '0.1'
131
- - !ruby/object:Gem::Dependency
132
- name: sqlite3
133
- requirement: !ruby/object:Gem::Requirement
134
- requirements:
135
- - - ">="
136
- - !ruby/object:Gem::Version
137
- version: '0'
138
- type: :development
139
- prerelease: false
140
- version_requirements: !ruby/object:Gem::Requirement
141
- requirements:
142
- - - ">="
143
- - !ruby/object:Gem::Version
144
- version: '0'
88
+ version: '0.2'
145
89
  description:
146
90
  email:
147
91
  - eduardobogoni@gmail.com
@@ -151,29 +95,12 @@ extra_rdoc_files: []
151
95
  files:
152
96
  - MIT-LICENSE
153
97
  - README.rdoc
154
- - app/assets/javascripts/aranha/application.js
155
- - app/assets/stylesheets/aranha/application.css
156
- - app/controllers/aranha/addresses_controller.rb
157
- - app/helpers/aranha/application_helper.rb
158
- - app/models/aranha/address.rb
159
- - app/views/layouts/aranha/application.html.erb
160
- - config/locales/en.yml
161
- - config/locales/pt-BR.yml
162
- - config/routes.rb
163
- - db/migrate/20171201021251_create_aranha_addresses.rb
164
- - db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
165
98
  - lib/aranha.rb
99
+ - lib/aranha/address_processor.rb
166
100
  - lib/aranha/default_processor.rb
167
- - lib/aranha/dom_elements_traverser.rb
168
- - lib/aranha/dom_elements_traverser/conditions.rb
169
- - lib/aranha/dom_elements_traverser/cursor.rb
170
- - lib/aranha/dom_elements_traverser/data.rb
171
- - lib/aranha/engine.rb
172
- - lib/aranha/fixtures.rb
173
- - lib/aranha/fixtures/download.rb
101
+ - lib/aranha/manager.rb
174
102
  - lib/aranha/processor.rb
175
103
  - lib/aranha/version.rb
176
- - lib/tasks/aranha_tasks.rake
177
104
  homepage:
178
105
  licenses:
179
106
  - MIT
@@ -193,8 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
120
  - !ruby/object:Gem::Version
194
121
  version: '0'
195
122
  requirements: []
196
- rubygems_version: 3.0.8
123
+ rubygems_version: 3.1.6
197
124
  signing_key:
198
125
  specification_version: 4
199
- summary: Rails utilities for web crawling.
126
+ summary: Ruby utilities for web crawling.
200
127
  test_files: []
@@ -1,14 +0,0 @@
1
- // This is a manifest file that'll be compiled into application.js, which will include all the files
2
- // listed below.
3
- //
4
- // Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
5
- // or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
6
- //
7
- // It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
8
- // compiled file.
9
- //
10
- // Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
11
- // about supported directives.
12
- //
13
- //= require_tree .
14
- //= require active_scaffold
@@ -1,16 +0,0 @@
1
- /*
2
- * This is a manifest file that'll be compiled into application.css, which will include all the files
3
- * listed below.
4
- *
5
- * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
6
- * or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
7
- *
8
- * You're free to add application-wide styles to this file and they'll appear at the bottom of the
9
- * compiled file so the styles you add here take precedence over styles defined in any styles
10
- * defined in the other CSS/SCSS files in this directory. It is generally better to create a new
11
- * file per style scope.
12
- *
13
- *= require_tree .
14
- *= require_self
15
- *= require active_scaffold
16
- */
@@ -1,8 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class AddressesController < ::ApplicationController
5
- active_scaffold :'aranha/address' do |_conf|
6
- end
7
- end
8
- end
@@ -1,6 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module ApplicationHelper
5
- end
6
- end
@@ -1,98 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'eac_ruby_utils/yaml'
4
-
5
- module Aranha
6
- class Address < ActiveRecord::Base
7
- include ::EacRailsUtils::Models::InequalityQueries
8
-
9
- add_inequality_queries(:created_at)
10
-
11
- class << self
12
- def set_start_point(url, processor)
13
- start_points[url] = processor
14
- end
15
-
16
- def add_start_points
17
- ::Rails.logger.info("Start points: #{start_points.count}")
18
- start_points.each do |url, processor|
19
- add(url, processor)
20
- end
21
- end
22
-
23
- def add(url, processor, extra_data = nil)
24
- a = find_or_initialize_by(url: sanitize_url(url))
25
- a.processor = processor
26
- a.extra_data = extra_data.to_yaml
27
- a.save!
28
- end
29
-
30
- def clear_expired
31
- q = by_created_at_lt(Time.zone.now - 12.hours)
32
- Rails.logger.info("Addresses expired: #{q.count}")
33
- q.destroy_all
34
- end
35
-
36
- private
37
-
38
- def sanitize_url(url)
39
- if url.is_a?(Hash)
40
- url.to_yaml
41
- else
42
- url.to_s
43
- end
44
- end
45
-
46
- def start_points
47
- @start_points ||= {}
48
- end
49
- end
50
-
51
- validates :url, presence: true, uniqueness: true
52
- validates :processor, presence: true
53
-
54
- scope :unprocessed, lambda {
55
- where(processed_at: nil)
56
- }
57
-
58
- def to_s
59
- "#{processor}|#{url}"
60
- end
61
-
62
- def process
63
- ActiveRecord::Base.transaction do
64
- instanciate_processor.process
65
- self.processed_at = Time.zone.now
66
- save!
67
- end
68
- end
69
-
70
- private
71
-
72
- def instanciate_processor
73
- processor_instancier.call(*processor_instancier_arguments)
74
- end
75
-
76
- def url_to_process
77
- ::EacRubyUtils::Yaml.load(url)
78
- end
79
-
80
- def processor_instancier
81
- processor.constantize.method(:new)
82
- end
83
-
84
- def processor_instancier_arguments
85
- if processor_instancier_arity == 2 || processor_instancier_arity.negative?
86
- [url_to_process, EacRubyUtils::Yaml.load(extra_data)]
87
- elsif processor_instancier_arity == 1
88
- [processor_instancier.call(url_to_process)]
89
- else
90
- raise("#{processor}.initialize should has 1 or 2 or * arguments")
91
- end
92
- end
93
-
94
- def processor_instancier_arity
95
- processor.constantize.instance_method(:initialize).arity
96
- end
97
- end
98
- end
@@ -1,12 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <title>Aranha</title>
5
- <%= stylesheet_link_tag "aranha/application", media: "all" %>
6
- <%= javascript_include_tag "aranha/application" %>
7
- <%= csrf_meta_tags %>
8
- </head>
9
- <body>
10
- <%= yield %>
11
- </body>
12
- </html>
@@ -1,6 +0,0 @@
1
- en:
2
- activerecord:
3
- models:
4
- aranha/address:
5
- one: Aranha address
6
- other: Aranha addresses
@@ -1,6 +0,0 @@
1
- pt-BR:
2
- activerecord:
3
- models:
4
- aranha/address:
5
- one: Endereço Aranha
6
- other: Endereços Aranha
data/config/routes.rb DELETED
@@ -1,6 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- Aranha::Engine.routes.draw do
4
- concern :active_scaffold, ActiveScaffold::Routing::Basic.new(association: true)
5
- resources(:addresses, concerns: :active_scaffold)
6
- end
@@ -1,15 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class CreateAranhaAddresses < (
4
- Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
5
- )
6
- def change
7
- create_table :aranha_addresses do |t|
8
- t.string :url
9
- t.string :processor
10
- t.timestamp :processed_at
11
-
12
- t.timestamps null: false
13
- end
14
- end
15
- end
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class AddExtraDataToAranhaAddresses < (
4
- Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
5
- )
6
- def change
7
- add_column :aranha_addresses, :extra_data, :text
8
- end
9
- end
@@ -1,44 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/dom_elements_traverser/conditions'
4
- require 'aranha/dom_elements_traverser/data'
5
- require 'aranha/dom_elements_traverser/cursor'
6
-
7
- module Aranha
8
- class DomElementsTraverser
9
- include ::Aranha::DomElementsTraverser::Conditions
10
- include ::Aranha::DomElementsTraverser::Cursor
11
- include ::Aranha::DomElementsTraverser::Data
12
-
13
- class << self
14
- def traverse(options, &block)
15
- new(elements_from_options(options), &block)
16
- end
17
-
18
- def empty
19
- new([])
20
- end
21
-
22
- private
23
-
24
- def elements_from_options(options)
25
- options = ::EacRubyUtils::OptionsConsumer.new(options)
26
- elements = nil
27
- options.consume(:children_of) { |v| elements = v.children.to_a }
28
- raise 'None option of [:children_of] defined' unless elements
29
-
30
- options.validate
31
- elements
32
- end
33
- end
34
-
35
- private
36
-
37
- def initialize(elements, &block)
38
- @elements = elements
39
- @index = 0
40
- @data = {}
41
- instance_eval(&block) if block
42
- end
43
- end
44
- end
@@ -1,32 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class DomElementsTraverser
5
- module Conditions
6
- private
7
-
8
- def match_conditions?(conditions)
9
- raise "No element (Conditions: #{conditions})" unless current
10
-
11
- conditions.all? { |key, value| match_condition?(key, value) }
12
- end
13
-
14
- def match_condition?(key, value)
15
- case key.to_sym
16
- when :text then match_text_condition?(value)
17
- when :name then match_name_condition?(value)
18
- else raise "Unknown key condition: (#{key})"
19
- end
20
- end
21
-
22
- def match_name_condition?(tag_name)
23
- current.name.casecmp(tag_name.to_s).zero?
24
- end
25
-
26
- def match_text_condition?(texts)
27
- texts = [texts.to_s] unless texts.is_a?(Array)
28
- texts.all? { |t| current.text.downcase.include?(t.downcase) }
29
- end
30
- end
31
- end
32
- end
@@ -1,48 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'eac_ruby_utils/options_consumer'
4
-
5
- module Aranha
6
- class DomElementsTraverser
7
- module Cursor
8
- private
9
-
10
- def current
11
- @elements[@index]
12
- end
13
-
14
- def skip
15
- @index += 1
16
- end
17
-
18
- def skip_until(options)
19
- oc = ::EacRubyUtils::OptionsConsumer.new(options)
20
- optional = oc.consume(:optional, false)
21
- while current
22
- break if match_conditions?(oc.left_data)
23
-
24
- skip
25
- end
26
- raise "No element found for conditions #{oc.left_data}" unless current || optional
27
-
28
- current
29
- end
30
-
31
- def skip_until_after(conditions)
32
- skip_until(conditions)
33
- skip
34
- current
35
- end
36
-
37
- def if_found(conditions, &block)
38
- marked = @index
39
- skip_until({ optional: true }.merge(conditions))
40
- if current
41
- instance_eval(&block) if block
42
- else
43
- @index = marked
44
- end
45
- end
46
- end
47
- end
48
- end
@@ -1,39 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class DomElementsTraverser
5
- module Data
6
- def data
7
- @data.dup
8
- end
9
-
10
- private
11
-
12
- def store(key, options = {}, &converter)
13
- validate(options)
14
- value = store_value(options, converter)
15
- @data[key] = value
16
- r = current
17
- skip
18
- r
19
- end
20
-
21
- def store_value(options, converter)
22
- value = if options.key?(:attribute)
23
- current.attribute(options[:attribute]).value
24
- else
25
- current.text.strip
26
- end
27
- converter ? converter.call(value) : value
28
- end
29
-
30
- def validate(options)
31
- return unless options.key?(:validate)
32
- return if match_conditions?(options[:validate])
33
-
34
- raise "Element does not match conditions #{options[:validate]}" \
35
- " (Element: |#{current}|#{current.name}|)"
36
- end
37
- end
38
- end
39
- end
data/lib/aranha/engine.rb DELETED
@@ -1,13 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- class Engine < ::Rails::Engine
5
- isolate_namespace Aranha
6
-
7
- initializer :append_migrations do |app|
8
- config.paths['db/migrate'].expanded.each do |expanded_path|
9
- app.config.paths['db/migrate'] << expanded_path
10
- end
11
- end
12
- end
13
- end
@@ -1,7 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module Fixtures
5
- require 'aranha/fixtures/download'
6
- end
7
- end
@@ -1,72 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/base'
4
- require 'aranha/parsers/source_address'
5
- require 'aranha/parsers/source_target_fixtures'
6
-
7
- module Aranha
8
- module Fixtures
9
- class Download
10
- attr_reader :pending
11
-
12
- def initialize(options)
13
- @prefix = options.fetch(:prefix)
14
- @prefix = '' if @prefix.blank?
15
- @download = options.fetch(:download)
16
- @pending = options.fetch(:pending)
17
- end
18
-
19
- def run
20
- url_files.each do |f|
21
- Rails.logger.info(relative_path(f))
22
- download(url(f), target(f)) if @download
23
- end
24
- end
25
-
26
- private
27
-
28
- def url_files
29
- Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
30
- end
31
-
32
- def select_path?(path)
33
- return false unless match_prefix_pattern(path)
34
-
35
- !pending || !source_exist?(path)
36
- end
37
-
38
- def match_prefix_pattern(path)
39
- relative_path(path).start_with?(@prefix)
40
- end
41
-
42
- def fixtures_root
43
- Rails.root.to_s
44
- end
45
-
46
- def download(url, target)
47
- Rails.logger.info "Baixando \"#{url}\"..."
48
- content = ::Aranha::Parsers::Base.new(url).content
49
- raise "Content is blank for \"#{url}\"" if content.blank?
50
-
51
- File.open(target, 'wb') { |file| file.write(content) }
52
- end
53
-
54
- def url(file)
55
- ::Aranha::Parsers::SourceAddress.from_file(file)
56
- end
57
-
58
- def target(file)
59
- File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
60
- end
61
-
62
- def relative_path(path)
63
- path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
64
- end
65
-
66
- def source_exist?(path)
67
- stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
68
- stf.source_file(::File.basename(path, '.url')).present?
69
- end
70
- end
71
- end
72
- end
@@ -1,22 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- namespace(:aranha) do
4
- task process: :environment do
5
- ::Aranha::Processor.new
6
- end
7
-
8
- task clear: :environment do
9
- Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
10
- end
11
-
12
- namespace :fixtures do
13
- desc 'Download remote content for fixtures.'
14
- task download: :environment do
15
- ::Aranha::Fixtures::Download.new(
16
- prefix: ENV['PREFIX'],
17
- download: ENV['DOWNLOAD'].present?,
18
- pending: ENV['PENDING'].present?
19
- ).run
20
- end
21
- end
22
- end