aranha 0.14.5 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.rdoc +1 -1
- data/lib/aranha.rb +2 -10
- data/lib/aranha/manager.rb +64 -0
- data/lib/aranha/processor.rb +12 -8
- data/lib/aranha/version.rb +1 -1
- metadata +6 -80
- data/app/assets/javascripts/aranha/application.js +0 -14
- data/app/assets/stylesheets/aranha/application.css +0 -16
- data/app/controllers/aranha/addresses_controller.rb +0 -8
- data/app/helpers/aranha/application_helper.rb +0 -6
- data/app/models/aranha/address.rb +0 -98
- data/app/views/layouts/aranha/application.html.erb +0 -12
- data/config/locales/en.yml +0 -6
- data/config/locales/pt-BR.yml +0 -6
- data/config/routes.rb +0 -6
- data/db/migrate/20171201021251_create_aranha_addresses.rb +0 -15
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +0 -9
- data/lib/aranha/dom_elements_traverser.rb +0 -44
- data/lib/aranha/dom_elements_traverser/conditions.rb +0 -32
- data/lib/aranha/dom_elements_traverser/cursor.rb +0 -48
- data/lib/aranha/dom_elements_traverser/data.rb +0 -39
- data/lib/aranha/engine.rb +0 -13
- data/lib/aranha/fixtures.rb +0 -7
- data/lib/aranha/fixtures/download.rb +0 -72
- data/lib/tasks/aranha_tasks.rake +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5fac0411750b6def655452009d3d091801905c696158499d8b4b26f99ccc426
|
4
|
+
data.tar.gz: cb8cd648b8603cfd1f578ba92e5017e37d6e1e19427b176d33e097e413c17baf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4864a35aa117b9bb00d544c013ae6caaf356b3d674657d5b1d5a3371f644b949b4fa32b6ec97d5f44fa58f65ea71db0499f18208ed885103a4f93d891d41309
|
7
|
+
data.tar.gz: c20f17bb1c3d04ce9b7678d2bec181c74dcf3b930202f9c838f5046cd68b326eb660ad490c4ad4c92647f60474eefc705e6e2734a62f3b42541760d689ab9686
|
data/README.rdoc
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'active_support/dependencies'
|
5
|
-
require 'active_scaffold'
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
6
4
|
|
7
5
|
module Aranha
|
8
|
-
|
9
|
-
require 'aranha/dom_elements_traverser'
|
10
|
-
require 'aranha/engine'
|
11
|
-
require 'aranha/fixtures'
|
12
|
-
require 'aranha/processor'
|
13
|
-
require 'aranha/parsers'
|
14
|
-
require 'aranha/selenium'
|
6
|
+
require_sub __FILE__
|
15
7
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/abstract_methods'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class Manager
|
7
|
+
include ::EacRubyUtils::AbstractMethods
|
8
|
+
|
9
|
+
class << self
|
10
|
+
attr_accessor :default
|
11
|
+
end
|
12
|
+
|
13
|
+
def addresses_count
|
14
|
+
raise_abstract_method(__method__)
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_address(_uri, _processor_class, _extra_data = nil)
|
18
|
+
raise_abstract_method(__method__)
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_start_point(uri, processor_class, extra_data = nil)
|
22
|
+
start_points_var << ::EacRubyUtils::Struct.new(
|
23
|
+
uri: uri, processor_class: processor_class, extra_data: extra_data
|
24
|
+
)
|
25
|
+
end
|
26
|
+
|
27
|
+
def clear_expired_addresses
|
28
|
+
raise_abstract_method(__method__)
|
29
|
+
end
|
30
|
+
|
31
|
+
def init
|
32
|
+
clear_expired_addresses
|
33
|
+
start_points_to_addresses
|
34
|
+
end
|
35
|
+
|
36
|
+
def log_info(_message)
|
37
|
+
raise_abstract_method(__method__)
|
38
|
+
end
|
39
|
+
|
40
|
+
def log_warn(_message)
|
41
|
+
raise_abstract_method(__method__)
|
42
|
+
end
|
43
|
+
|
44
|
+
def start_points
|
45
|
+
start_points_var.to_enum
|
46
|
+
end
|
47
|
+
|
48
|
+
def start_points_to_addresses
|
49
|
+
start_points_var.each do |sp|
|
50
|
+
add_address(sp.uri, sp.processor_class, sp.extra_data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def unprocessed_addresses
|
55
|
+
raise_abstract_method(__method__)
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def start_points_var
|
61
|
+
@start_points_var ||= []
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'net/http'
|
4
|
+
require 'httpclient'
|
4
5
|
require 'aranha/parsers/invalid_state_exception'
|
6
|
+
require 'aranha/manager'
|
5
7
|
|
6
8
|
module Aranha
|
7
9
|
class Processor
|
@@ -20,11 +22,13 @@ module Aranha
|
|
20
22
|
|
21
23
|
DEFAULT_MAX_TRIES = 3
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
attr_reader :manager
|
26
|
+
|
27
|
+
def initialize(manager = nil)
|
28
|
+
@manager = manager || ::Aranha::Manager.default
|
26
29
|
@failed = {}
|
27
30
|
@try = 0
|
31
|
+
self.manager.init
|
28
32
|
process_loop
|
29
33
|
raise "Addresses failed: #{@failed.count}" if @failed.any?
|
30
34
|
end
|
@@ -32,7 +36,7 @@ module Aranha
|
|
32
36
|
private
|
33
37
|
|
34
38
|
def process_loop
|
35
|
-
|
39
|
+
manager.log_info("Max tries: #{max_tries_s}")
|
36
40
|
loop do
|
37
41
|
break if process_next_address
|
38
42
|
end
|
@@ -52,8 +56,8 @@ module Aranha
|
|
52
56
|
end
|
53
57
|
|
54
58
|
def process_address(address)
|
55
|
-
|
56
|
-
" Unprocessed: #{unprocessed.count}/#{Aranha::
|
59
|
+
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
|
+
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
57
61
|
begin
|
58
62
|
address.process
|
59
63
|
@failed.delete(address.id)
|
@@ -67,7 +71,7 @@ module Aranha
|
|
67
71
|
|
68
72
|
@failed[address.id] ||= 0
|
69
73
|
@failed[address.id] += 1
|
70
|
-
|
74
|
+
manager.log_warn(exception)
|
71
75
|
end
|
72
76
|
|
73
77
|
def next_address
|
@@ -75,7 +79,7 @@ module Aranha
|
|
75
79
|
end
|
76
80
|
|
77
81
|
def unprocessed
|
78
|
-
::Aranha::
|
82
|
+
::Aranha::Manager.default.unprocessed_addresses
|
79
83
|
end
|
80
84
|
|
81
85
|
def network_exception?(exception)
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.5
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: active_scaffold
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 3.4.41.1
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 3.4.41.1
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: aranha-parsers
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,34 +44,20 @@ dependencies:
|
|
58
44
|
- - ">="
|
59
45
|
- !ruby/object:Gem::Version
|
60
46
|
version: 0.1.2
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: eac_rails_utils
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '0.11'
|
68
|
-
type: :runtime
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '0.11'
|
75
47
|
- !ruby/object:Gem::Dependency
|
76
48
|
name: eac_ruby_utils
|
77
49
|
requirement: !ruby/object:Gem::Requirement
|
78
50
|
requirements:
|
79
51
|
- - "~>"
|
80
52
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
53
|
+
version: '0.52'
|
82
54
|
type: :runtime
|
83
55
|
prerelease: false
|
84
56
|
version_requirements: !ruby/object:Gem::Requirement
|
85
57
|
requirements:
|
86
58
|
- - "~>"
|
87
59
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
60
|
+
version: '0.52'
|
89
61
|
- !ruby/object:Gem::Dependency
|
90
62
|
name: httpclient
|
91
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,20 +72,6 @@ dependencies:
|
|
100
72
|
- - ">="
|
101
73
|
- !ruby/object:Gem::Version
|
102
74
|
version: '2.6'
|
103
|
-
- !ruby/object:Gem::Dependency
|
104
|
-
name: rails
|
105
|
-
requirement: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - ">="
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: 4.2.11.3
|
110
|
-
type: :runtime
|
111
|
-
prerelease: false
|
112
|
-
version_requirements: !ruby/object:Gem::Requirement
|
113
|
-
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 4.2.11.3
|
117
75
|
- !ruby/object:Gem::Dependency
|
118
76
|
name: eac_ruby_gem_support
|
119
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,20 +86,6 @@ dependencies:
|
|
128
86
|
- - "~>"
|
129
87
|
- !ruby/object:Gem::Version
|
130
88
|
version: '0.1'
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: sqlite3
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - ">="
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '0'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '0'
|
145
89
|
description:
|
146
90
|
email:
|
147
91
|
- eduardobogoni@gmail.com
|
@@ -151,29 +95,11 @@ extra_rdoc_files: []
|
|
151
95
|
files:
|
152
96
|
- MIT-LICENSE
|
153
97
|
- README.rdoc
|
154
|
-
- app/assets/javascripts/aranha/application.js
|
155
|
-
- app/assets/stylesheets/aranha/application.css
|
156
|
-
- app/controllers/aranha/addresses_controller.rb
|
157
|
-
- app/helpers/aranha/application_helper.rb
|
158
|
-
- app/models/aranha/address.rb
|
159
|
-
- app/views/layouts/aranha/application.html.erb
|
160
|
-
- config/locales/en.yml
|
161
|
-
- config/locales/pt-BR.yml
|
162
|
-
- config/routes.rb
|
163
|
-
- db/migrate/20171201021251_create_aranha_addresses.rb
|
164
|
-
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
165
98
|
- lib/aranha.rb
|
166
99
|
- lib/aranha/default_processor.rb
|
167
|
-
- lib/aranha/dom_elements_traverser.rb
|
168
|
-
- lib/aranha/dom_elements_traverser/conditions.rb
|
169
|
-
- lib/aranha/dom_elements_traverser/cursor.rb
|
170
|
-
- lib/aranha/dom_elements_traverser/data.rb
|
171
|
-
- lib/aranha/engine.rb
|
172
|
-
- lib/aranha/fixtures.rb
|
173
|
-
- lib/aranha/fixtures/download.rb
|
100
|
+
- lib/aranha/manager.rb
|
174
101
|
- lib/aranha/processor.rb
|
175
102
|
- lib/aranha/version.rb
|
176
|
-
- lib/tasks/aranha_tasks.rake
|
177
103
|
homepage:
|
178
104
|
licenses:
|
179
105
|
- MIT
|
@@ -196,5 +122,5 @@ requirements: []
|
|
196
122
|
rubygems_version: 3.0.8
|
197
123
|
signing_key:
|
198
124
|
specification_version: 4
|
199
|
-
summary:
|
125
|
+
summary: Ruby utilities for web crawling.
|
200
126
|
test_files: []
|
@@ -1,14 +0,0 @@
|
|
1
|
-
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
-
// listed below.
|
3
|
-
//
|
4
|
-
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
-
// or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
|
6
|
-
//
|
7
|
-
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
-
// compiled file.
|
9
|
-
//
|
10
|
-
// Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
|
11
|
-
// about supported directives.
|
12
|
-
//
|
13
|
-
//= require_tree .
|
14
|
-
//= require active_scaffold
|
@@ -1,16 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
-
* listed below.
|
4
|
-
*
|
5
|
-
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
-
* or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
|
7
|
-
*
|
8
|
-
* You're free to add application-wide styles to this file and they'll appear at the bottom of the
|
9
|
-
* compiled file so the styles you add here take precedence over styles defined in any styles
|
10
|
-
* defined in the other CSS/SCSS files in this directory. It is generally better to create a new
|
11
|
-
* file per style scope.
|
12
|
-
*
|
13
|
-
*= require_tree .
|
14
|
-
*= require_self
|
15
|
-
*= require active_scaffold
|
16
|
-
*/
|
@@ -1,98 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/yaml'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class Address < ActiveRecord::Base
|
7
|
-
include ::EacRailsUtils::Models::InequalityQueries
|
8
|
-
|
9
|
-
add_inequality_queries(:created_at)
|
10
|
-
|
11
|
-
class << self
|
12
|
-
def set_start_point(url, processor)
|
13
|
-
start_points[url] = processor
|
14
|
-
end
|
15
|
-
|
16
|
-
def add_start_points
|
17
|
-
::Rails.logger.info("Start points: #{start_points.count}")
|
18
|
-
start_points.each do |url, processor|
|
19
|
-
add(url, processor)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def add(url, processor, extra_data = nil)
|
24
|
-
a = find_or_initialize_by(url: sanitize_url(url))
|
25
|
-
a.processor = processor
|
26
|
-
a.extra_data = extra_data.to_yaml
|
27
|
-
a.save!
|
28
|
-
end
|
29
|
-
|
30
|
-
def clear_expired
|
31
|
-
q = by_created_at_lt(Time.zone.now - 12.hours)
|
32
|
-
Rails.logger.info("Addresses expired: #{q.count}")
|
33
|
-
q.destroy_all
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def sanitize_url(url)
|
39
|
-
if url.is_a?(Hash)
|
40
|
-
url.to_yaml
|
41
|
-
else
|
42
|
-
url.to_s
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def start_points
|
47
|
-
@start_points ||= {}
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
validates :url, presence: true, uniqueness: true
|
52
|
-
validates :processor, presence: true
|
53
|
-
|
54
|
-
scope :unprocessed, lambda {
|
55
|
-
where(processed_at: nil)
|
56
|
-
}
|
57
|
-
|
58
|
-
def to_s
|
59
|
-
"#{processor}|#{url}"
|
60
|
-
end
|
61
|
-
|
62
|
-
def process
|
63
|
-
ActiveRecord::Base.transaction do
|
64
|
-
instanciate_processor.process
|
65
|
-
self.processed_at = Time.zone.now
|
66
|
-
save!
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def instanciate_processor
|
73
|
-
processor_instancier.call(*processor_instancier_arguments)
|
74
|
-
end
|
75
|
-
|
76
|
-
def url_to_process
|
77
|
-
::EacRubyUtils::Yaml.load(url)
|
78
|
-
end
|
79
|
-
|
80
|
-
def processor_instancier
|
81
|
-
processor.constantize.method(:new)
|
82
|
-
end
|
83
|
-
|
84
|
-
def processor_instancier_arguments
|
85
|
-
if processor_instancier_arity == 2 || processor_instancier_arity.negative?
|
86
|
-
[url_to_process, EacRubyUtils::Yaml.load(extra_data)]
|
87
|
-
elsif processor_instancier_arity == 1
|
88
|
-
[processor_instancier.call(url_to_process)]
|
89
|
-
else
|
90
|
-
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def processor_instancier_arity
|
95
|
-
processor.constantize.instance_method(:initialize).arity
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
data/config/locales/en.yml
DELETED
data/config/locales/pt-BR.yml
DELETED
data/config/routes.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class CreateAranhaAddresses < (
|
4
|
-
Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
|
5
|
-
)
|
6
|
-
def change
|
7
|
-
create_table :aranha_addresses do |t|
|
8
|
-
t.string :url
|
9
|
-
t.string :processor
|
10
|
-
t.timestamp :processed_at
|
11
|
-
|
12
|
-
t.timestamps null: false
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/dom_elements_traverser/conditions'
|
4
|
-
require 'aranha/dom_elements_traverser/data'
|
5
|
-
require 'aranha/dom_elements_traverser/cursor'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
class DomElementsTraverser
|
9
|
-
include ::Aranha::DomElementsTraverser::Conditions
|
10
|
-
include ::Aranha::DomElementsTraverser::Cursor
|
11
|
-
include ::Aranha::DomElementsTraverser::Data
|
12
|
-
|
13
|
-
class << self
|
14
|
-
def traverse(options, &block)
|
15
|
-
new(elements_from_options(options), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def empty
|
19
|
-
new([])
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def elements_from_options(options)
|
25
|
-
options = ::EacRubyUtils::OptionsConsumer.new(options)
|
26
|
-
elements = nil
|
27
|
-
options.consume(:children_of) { |v| elements = v.children.to_a }
|
28
|
-
raise 'None option of [:children_of] defined' unless elements
|
29
|
-
|
30
|
-
options.validate
|
31
|
-
elements
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def initialize(elements, &block)
|
38
|
-
@elements = elements
|
39
|
-
@index = 0
|
40
|
-
@data = {}
|
41
|
-
instance_eval(&block) if block
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Conditions
|
6
|
-
private
|
7
|
-
|
8
|
-
def match_conditions?(conditions)
|
9
|
-
raise "No element (Conditions: #{conditions})" unless current
|
10
|
-
|
11
|
-
conditions.all? { |key, value| match_condition?(key, value) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def match_condition?(key, value)
|
15
|
-
case key.to_sym
|
16
|
-
when :text then match_text_condition?(value)
|
17
|
-
when :name then match_name_condition?(value)
|
18
|
-
else raise "Unknown key condition: (#{key})"
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def match_name_condition?(tag_name)
|
23
|
-
current.name.casecmp(tag_name.to_s).zero?
|
24
|
-
end
|
25
|
-
|
26
|
-
def match_text_condition?(texts)
|
27
|
-
texts = [texts.to_s] unless texts.is_a?(Array)
|
28
|
-
texts.all? { |t| current.text.downcase.include?(t.downcase) }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/options_consumer'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class DomElementsTraverser
|
7
|
-
module Cursor
|
8
|
-
private
|
9
|
-
|
10
|
-
def current
|
11
|
-
@elements[@index]
|
12
|
-
end
|
13
|
-
|
14
|
-
def skip
|
15
|
-
@index += 1
|
16
|
-
end
|
17
|
-
|
18
|
-
def skip_until(options)
|
19
|
-
oc = ::EacRubyUtils::OptionsConsumer.new(options)
|
20
|
-
optional = oc.consume(:optional, false)
|
21
|
-
while current
|
22
|
-
break if match_conditions?(oc.left_data)
|
23
|
-
|
24
|
-
skip
|
25
|
-
end
|
26
|
-
raise "No element found for conditions #{oc.left_data}" unless current || optional
|
27
|
-
|
28
|
-
current
|
29
|
-
end
|
30
|
-
|
31
|
-
def skip_until_after(conditions)
|
32
|
-
skip_until(conditions)
|
33
|
-
skip
|
34
|
-
current
|
35
|
-
end
|
36
|
-
|
37
|
-
def if_found(conditions, &block)
|
38
|
-
marked = @index
|
39
|
-
skip_until({ optional: true }.merge(conditions))
|
40
|
-
if current
|
41
|
-
instance_eval(&block) if block
|
42
|
-
else
|
43
|
-
@index = marked
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Data
|
6
|
-
def data
|
7
|
-
@data.dup
|
8
|
-
end
|
9
|
-
|
10
|
-
private
|
11
|
-
|
12
|
-
def store(key, options = {}, &converter)
|
13
|
-
validate(options)
|
14
|
-
value = store_value(options, converter)
|
15
|
-
@data[key] = value
|
16
|
-
r = current
|
17
|
-
skip
|
18
|
-
r
|
19
|
-
end
|
20
|
-
|
21
|
-
def store_value(options, converter)
|
22
|
-
value = if options.key?(:attribute)
|
23
|
-
current.attribute(options[:attribute]).value
|
24
|
-
else
|
25
|
-
current.text.strip
|
26
|
-
end
|
27
|
-
converter ? converter.call(value) : value
|
28
|
-
end
|
29
|
-
|
30
|
-
def validate(options)
|
31
|
-
return unless options.key?(:validate)
|
32
|
-
return if match_conditions?(options[:validate])
|
33
|
-
|
34
|
-
raise "Element does not match conditions #{options[:validate]}" \
|
35
|
-
" (Element: |#{current}|#{current.name}|)"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/aranha/engine.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class Engine < ::Rails::Engine
|
5
|
-
isolate_namespace Aranha
|
6
|
-
|
7
|
-
initializer :append_migrations do |app|
|
8
|
-
config.paths['db/migrate'].expanded.each do |expanded_path|
|
9
|
-
app.config.paths['db/migrate'] << expanded_path
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/lib/aranha/fixtures.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/parsers/base'
|
4
|
-
require 'aranha/parsers/source_address'
|
5
|
-
require 'aranha/parsers/source_target_fixtures'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
module Fixtures
|
9
|
-
class Download
|
10
|
-
attr_reader :pending
|
11
|
-
|
12
|
-
def initialize(options)
|
13
|
-
@prefix = options.fetch(:prefix)
|
14
|
-
@prefix = '' if @prefix.blank?
|
15
|
-
@download = options.fetch(:download)
|
16
|
-
@pending = options.fetch(:pending)
|
17
|
-
end
|
18
|
-
|
19
|
-
def run
|
20
|
-
url_files.each do |f|
|
21
|
-
Rails.logger.info(relative_path(f))
|
22
|
-
download(url(f), target(f)) if @download
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def url_files
|
29
|
-
Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
|
30
|
-
end
|
31
|
-
|
32
|
-
def select_path?(path)
|
33
|
-
return false unless match_prefix_pattern(path)
|
34
|
-
|
35
|
-
!pending || !source_exist?(path)
|
36
|
-
end
|
37
|
-
|
38
|
-
def match_prefix_pattern(path)
|
39
|
-
relative_path(path).start_with?(@prefix)
|
40
|
-
end
|
41
|
-
|
42
|
-
def fixtures_root
|
43
|
-
Rails.root.to_s
|
44
|
-
end
|
45
|
-
|
46
|
-
def download(url, target)
|
47
|
-
Rails.logger.info "Baixando \"#{url}\"..."
|
48
|
-
content = ::Aranha::Parsers::Base.new(url).content
|
49
|
-
raise "Content is blank for \"#{url}\"" if content.blank?
|
50
|
-
|
51
|
-
File.open(target, 'wb') { |file| file.write(content) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def url(file)
|
55
|
-
::Aranha::Parsers::SourceAddress.from_file(file)
|
56
|
-
end
|
57
|
-
|
58
|
-
def target(file)
|
59
|
-
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
60
|
-
end
|
61
|
-
|
62
|
-
def relative_path(path)
|
63
|
-
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
64
|
-
end
|
65
|
-
|
66
|
-
def source_exist?(path)
|
67
|
-
stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
|
68
|
-
stf.source_file(::File.basename(path, '.url')).present?
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/lib/tasks/aranha_tasks.rake
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
namespace(:aranha) do
|
4
|
-
task process: :environment do
|
5
|
-
::Aranha::Processor.new
|
6
|
-
end
|
7
|
-
|
8
|
-
task clear: :environment do
|
9
|
-
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
|
-
end
|
11
|
-
|
12
|
-
namespace :fixtures do
|
13
|
-
desc 'Download remote content for fixtures.'
|
14
|
-
task download: :environment do
|
15
|
-
::Aranha::Fixtures::Download.new(
|
16
|
-
prefix: ENV['PREFIX'],
|
17
|
-
download: ENV['DOWNLOAD'].present?,
|
18
|
-
pending: ENV['PENDING'].present?
|
19
|
-
).run
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|