aranha 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.rdoc +1 -1
- data/lib/aranha.rb +2 -10
- data/lib/aranha/manager.rb +64 -0
- data/lib/aranha/processor.rb +12 -8
- data/lib/aranha/version.rb +1 -1
- metadata +7 -82
- data/app/assets/javascripts/aranha/application.js +0 -14
- data/app/assets/stylesheets/aranha/application.css +0 -16
- data/app/controllers/aranha/addresses_controller.rb +0 -8
- data/app/helpers/aranha/application_helper.rb +0 -6
- data/app/models/aranha/address.rb +0 -98
- data/app/views/layouts/aranha/application.html.erb +0 -12
- data/config/locales/en.yml +0 -6
- data/config/locales/pt-BR.yml +0 -6
- data/config/routes.rb +0 -5
- data/db/migrate/20171201021251_create_aranha_addresses.rb +0 -13
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +0 -7
- data/lib/aranha/dom_elements_traverser.rb +0 -44
- data/lib/aranha/dom_elements_traverser/conditions.rb +0 -32
- data/lib/aranha/dom_elements_traverser/cursor.rb +0 -48
- data/lib/aranha/dom_elements_traverser/data.rb +0 -39
- data/lib/aranha/engine.rb +0 -13
- data/lib/aranha/fixtures.rb +0 -7
- data/lib/aranha/fixtures/download.rb +0 -72
- data/lib/tasks/aranha_tasks.rake +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5fac0411750b6def655452009d3d091801905c696158499d8b4b26f99ccc426
|
4
|
+
data.tar.gz: cb8cd648b8603cfd1f578ba92e5017e37d6e1e19427b176d33e097e413c17baf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4864a35aa117b9bb00d544c013ae6caaf356b3d674657d5b1d5a3371f644b949b4fa32b6ec97d5f44fa58f65ea71db0499f18208ed885103a4f93d891d41309
|
7
|
+
data.tar.gz: c20f17bb1c3d04ce9b7678d2bec181c74dcf3b930202f9c838f5046cd68b326eb660ad490c4ad4c92647f60474eefc705e6e2734a62f3b42541760d689ab9686
|
data/README.rdoc
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'active_support/dependencies'
|
5
|
-
require 'active_scaffold'
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
6
4
|
|
7
5
|
module Aranha
|
8
|
-
|
9
|
-
require 'aranha/dom_elements_traverser'
|
10
|
-
require 'aranha/engine'
|
11
|
-
require 'aranha/fixtures'
|
12
|
-
require 'aranha/processor'
|
13
|
-
require 'aranha/parsers'
|
14
|
-
require 'aranha/selenium'
|
6
|
+
require_sub __FILE__
|
15
7
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/abstract_methods'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class Manager
|
7
|
+
include ::EacRubyUtils::AbstractMethods
|
8
|
+
|
9
|
+
class << self
|
10
|
+
attr_accessor :default
|
11
|
+
end
|
12
|
+
|
13
|
+
def addresses_count
|
14
|
+
raise_abstract_method(__method__)
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_address(_uri, _processor_class, _extra_data = nil)
|
18
|
+
raise_abstract_method(__method__)
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_start_point(uri, processor_class, extra_data = nil)
|
22
|
+
start_points_var << ::EacRubyUtils::Struct.new(
|
23
|
+
uri: uri, processor_class: processor_class, extra_data: extra_data
|
24
|
+
)
|
25
|
+
end
|
26
|
+
|
27
|
+
def clear_expired_addresses
|
28
|
+
raise_abstract_method(__method__)
|
29
|
+
end
|
30
|
+
|
31
|
+
def init
|
32
|
+
clear_expired_addresses
|
33
|
+
start_points_to_addresses
|
34
|
+
end
|
35
|
+
|
36
|
+
def log_info(_message)
|
37
|
+
raise_abstract_method(__method__)
|
38
|
+
end
|
39
|
+
|
40
|
+
def log_warn(_message)
|
41
|
+
raise_abstract_method(__method__)
|
42
|
+
end
|
43
|
+
|
44
|
+
def start_points
|
45
|
+
start_points_var.to_enum
|
46
|
+
end
|
47
|
+
|
48
|
+
def start_points_to_addresses
|
49
|
+
start_points_var.each do |sp|
|
50
|
+
add_address(sp.uri, sp.processor_class, sp.extra_data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def unprocessed_addresses
|
55
|
+
raise_abstract_method(__method__)
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def start_points_var
|
61
|
+
@start_points_var ||= []
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'net/http'
|
4
|
+
require 'httpclient'
|
4
5
|
require 'aranha/parsers/invalid_state_exception'
|
6
|
+
require 'aranha/manager'
|
5
7
|
|
6
8
|
module Aranha
|
7
9
|
class Processor
|
@@ -20,11 +22,13 @@ module Aranha
|
|
20
22
|
|
21
23
|
DEFAULT_MAX_TRIES = 3
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
attr_reader :manager
|
26
|
+
|
27
|
+
def initialize(manager = nil)
|
28
|
+
@manager = manager || ::Aranha::Manager.default
|
26
29
|
@failed = {}
|
27
30
|
@try = 0
|
31
|
+
self.manager.init
|
28
32
|
process_loop
|
29
33
|
raise "Addresses failed: #{@failed.count}" if @failed.any?
|
30
34
|
end
|
@@ -32,7 +36,7 @@ module Aranha
|
|
32
36
|
private
|
33
37
|
|
34
38
|
def process_loop
|
35
|
-
|
39
|
+
manager.log_info("Max tries: #{max_tries_s}")
|
36
40
|
loop do
|
37
41
|
break if process_next_address
|
38
42
|
end
|
@@ -52,8 +56,8 @@ module Aranha
|
|
52
56
|
end
|
53
57
|
|
54
58
|
def process_address(address)
|
55
|
-
|
56
|
-
" Unprocessed: #{unprocessed.count}/#{Aranha::
|
59
|
+
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
60
|
+
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
57
61
|
begin
|
58
62
|
address.process
|
59
63
|
@failed.delete(address.id)
|
@@ -67,7 +71,7 @@ module Aranha
|
|
67
71
|
|
68
72
|
@failed[address.id] ||= 0
|
69
73
|
@failed[address.id] += 1
|
70
|
-
|
74
|
+
manager.log_warn(exception)
|
71
75
|
end
|
72
76
|
|
73
77
|
def next_address
|
@@ -75,7 +79,7 @@ module Aranha
|
|
75
79
|
end
|
76
80
|
|
77
81
|
def unprocessed
|
78
|
-
::Aranha::
|
82
|
+
::Aranha::Manager.default.unprocessed_addresses
|
79
83
|
end
|
80
84
|
|
81
85
|
def network_exception?(exception)
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.1
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: active_scaffold
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 3.4.41.1
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 3.4.41.1
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: aranha-parsers
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,34 +44,20 @@ dependencies:
|
|
58
44
|
- - ">="
|
59
45
|
- !ruby/object:Gem::Version
|
60
46
|
version: 0.1.2
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: eac_rails_utils
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '0.11'
|
68
|
-
type: :runtime
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '0.11'
|
75
47
|
- !ruby/object:Gem::Dependency
|
76
48
|
name: eac_ruby_utils
|
77
49
|
requirement: !ruby/object:Gem::Requirement
|
78
50
|
requirements:
|
79
51
|
- - "~>"
|
80
52
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
53
|
+
version: '0.52'
|
82
54
|
type: :runtime
|
83
55
|
prerelease: false
|
84
56
|
version_requirements: !ruby/object:Gem::Requirement
|
85
57
|
requirements:
|
86
58
|
- - "~>"
|
87
59
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
60
|
+
version: '0.52'
|
89
61
|
- !ruby/object:Gem::Dependency
|
90
62
|
name: httpclient
|
91
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,20 +72,6 @@ dependencies:
|
|
100
72
|
- - ">="
|
101
73
|
- !ruby/object:Gem::Version
|
102
74
|
version: '2.6'
|
103
|
-
- !ruby/object:Gem::Dependency
|
104
|
-
name: rails
|
105
|
-
requirement: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - "~>"
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: 4.2.10
|
110
|
-
type: :runtime
|
111
|
-
prerelease: false
|
112
|
-
version_requirements: !ruby/object:Gem::Requirement
|
113
|
-
requirements:
|
114
|
-
- - "~>"
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 4.2.10
|
117
75
|
- !ruby/object:Gem::Dependency
|
118
76
|
name: eac_ruby_gem_support
|
119
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,20 +86,6 @@ dependencies:
|
|
128
86
|
- - "~>"
|
129
87
|
- !ruby/object:Gem::Version
|
130
88
|
version: '0.1'
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: sqlite3
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - ">="
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '0'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '0'
|
145
89
|
description:
|
146
90
|
email:
|
147
91
|
- eduardobogoni@gmail.com
|
@@ -151,29 +95,11 @@ extra_rdoc_files: []
|
|
151
95
|
files:
|
152
96
|
- MIT-LICENSE
|
153
97
|
- README.rdoc
|
154
|
-
- app/assets/javascripts/aranha/application.js
|
155
|
-
- app/assets/stylesheets/aranha/application.css
|
156
|
-
- app/controllers/aranha/addresses_controller.rb
|
157
|
-
- app/helpers/aranha/application_helper.rb
|
158
|
-
- app/models/aranha/address.rb
|
159
|
-
- app/views/layouts/aranha/application.html.erb
|
160
|
-
- config/locales/en.yml
|
161
|
-
- config/locales/pt-BR.yml
|
162
|
-
- config/routes.rb
|
163
|
-
- db/migrate/20171201021251_create_aranha_addresses.rb
|
164
|
-
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
165
98
|
- lib/aranha.rb
|
166
99
|
- lib/aranha/default_processor.rb
|
167
|
-
- lib/aranha/dom_elements_traverser.rb
|
168
|
-
- lib/aranha/dom_elements_traverser/conditions.rb
|
169
|
-
- lib/aranha/dom_elements_traverser/cursor.rb
|
170
|
-
- lib/aranha/dom_elements_traverser/data.rb
|
171
|
-
- lib/aranha/engine.rb
|
172
|
-
- lib/aranha/fixtures.rb
|
173
|
-
- lib/aranha/fixtures/download.rb
|
100
|
+
- lib/aranha/manager.rb
|
174
101
|
- lib/aranha/processor.rb
|
175
102
|
- lib/aranha/version.rb
|
176
|
-
- lib/tasks/aranha_tasks.rake
|
177
103
|
homepage:
|
178
104
|
licenses:
|
179
105
|
- MIT
|
@@ -193,9 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
119
|
- !ruby/object:Gem::Version
|
194
120
|
version: '0'
|
195
121
|
requirements: []
|
196
|
-
|
197
|
-
rubygems_version: 2.7.7
|
122
|
+
rubygems_version: 3.0.8
|
198
123
|
signing_key:
|
199
124
|
specification_version: 4
|
200
|
-
summary:
|
125
|
+
summary: Ruby utilities for web crawling.
|
201
126
|
test_files: []
|
@@ -1,14 +0,0 @@
|
|
1
|
-
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
-
// listed below.
|
3
|
-
//
|
4
|
-
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
-
// or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
|
6
|
-
//
|
7
|
-
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
-
// compiled file.
|
9
|
-
//
|
10
|
-
// Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
|
11
|
-
// about supported directives.
|
12
|
-
//
|
13
|
-
//= require_tree .
|
14
|
-
//= require active_scaffold
|
@@ -1,16 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
-
* listed below.
|
4
|
-
*
|
5
|
-
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
-
* or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
|
7
|
-
*
|
8
|
-
* You're free to add application-wide styles to this file and they'll appear at the bottom of the
|
9
|
-
* compiled file so the styles you add here take precedence over styles defined in any styles
|
10
|
-
* defined in the other CSS/SCSS files in this directory. It is generally better to create a new
|
11
|
-
* file per style scope.
|
12
|
-
*
|
13
|
-
*= require_tree .
|
14
|
-
*= require_self
|
15
|
-
*= require active_scaffold
|
16
|
-
*/
|
@@ -1,98 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/yaml'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class Address < ActiveRecord::Base
|
7
|
-
include ::EacRailsUtils::Models::InequalityQueries
|
8
|
-
|
9
|
-
add_inequality_queries(:created_at)
|
10
|
-
|
11
|
-
class << self
|
12
|
-
def set_start_point(url, processor)
|
13
|
-
start_points[url] = processor
|
14
|
-
end
|
15
|
-
|
16
|
-
def add_start_points
|
17
|
-
::Rails.logger.info("Start points: #{start_points.count}")
|
18
|
-
start_points.each do |url, processor|
|
19
|
-
add(url, processor)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def add(url, processor, extra_data = nil)
|
24
|
-
a = find_or_initialize_by(url: sanitize_url(url))
|
25
|
-
a.processor = processor
|
26
|
-
a.extra_data = extra_data.to_yaml
|
27
|
-
a.save!
|
28
|
-
end
|
29
|
-
|
30
|
-
def clear_expired
|
31
|
-
q = by_created_at_lt(Time.zone.now - 12.hours)
|
32
|
-
Rails.logger.info("Addresses expired: #{q.count}")
|
33
|
-
q.destroy_all
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def sanitize_url(url)
|
39
|
-
if url.is_a?(Hash)
|
40
|
-
url.to_yaml
|
41
|
-
else
|
42
|
-
url.to_s
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def start_points
|
47
|
-
@start_points ||= {}
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
validates :url, presence: true, uniqueness: true
|
52
|
-
validates :processor, presence: true
|
53
|
-
|
54
|
-
scope :unprocessed, lambda {
|
55
|
-
where(processed_at: nil)
|
56
|
-
}
|
57
|
-
|
58
|
-
def to_s
|
59
|
-
"#{processor}|#{url}"
|
60
|
-
end
|
61
|
-
|
62
|
-
def process
|
63
|
-
ActiveRecord::Base.transaction do
|
64
|
-
instanciate_processor.process
|
65
|
-
self.processed_at = Time.zone.now
|
66
|
-
save!
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def instanciate_processor
|
73
|
-
processor_instancier.call(*processor_instancier_arguments)
|
74
|
-
end
|
75
|
-
|
76
|
-
def url_to_process
|
77
|
-
::EacRubyUtils::Yaml.load_common(url)
|
78
|
-
end
|
79
|
-
|
80
|
-
def processor_instancier
|
81
|
-
processor.constantize.method(:new)
|
82
|
-
end
|
83
|
-
|
84
|
-
def processor_instancier_arguments
|
85
|
-
if processor_instancier_arity == 2 || processor_instancier_arity.negative?
|
86
|
-
[url_to_process, EacRubyUtils::Yaml.load_common(extra_data)]
|
87
|
-
elsif processor_instancier_arity == 1
|
88
|
-
[processor_instancier.call(url_to_process)]
|
89
|
-
else
|
90
|
-
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def processor_instancier_arity
|
95
|
-
processor.constantize.instance_method(:initialize).arity
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
data/config/locales/en.yml
DELETED
data/config/locales/pt-BR.yml
DELETED
data/config/routes.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class CreateAranhaAddresses < ActiveRecord::Migration
|
4
|
-
def change
|
5
|
-
create_table :aranha_addresses do |t|
|
6
|
-
t.string :url
|
7
|
-
t.string :processor
|
8
|
-
t.timestamp :processed_at
|
9
|
-
|
10
|
-
t.timestamps null: false
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/dom_elements_traverser/conditions'
|
4
|
-
require 'aranha/dom_elements_traverser/data'
|
5
|
-
require 'aranha/dom_elements_traverser/cursor'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
class DomElementsTraverser
|
9
|
-
include ::Aranha::DomElementsTraverser::Conditions
|
10
|
-
include ::Aranha::DomElementsTraverser::Cursor
|
11
|
-
include ::Aranha::DomElementsTraverser::Data
|
12
|
-
|
13
|
-
class << self
|
14
|
-
def traverse(options, &block)
|
15
|
-
new(elements_from_options(options), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def empty
|
19
|
-
new([])
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def elements_from_options(options)
|
25
|
-
options = ::EacRubyUtils::OptionsConsumer.new(options)
|
26
|
-
elements = nil
|
27
|
-
options.consume(:children_of) { |v| elements = v.children.to_a }
|
28
|
-
raise 'None option of [:children_of] defined' unless elements
|
29
|
-
|
30
|
-
options.validate
|
31
|
-
elements
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def initialize(elements, &block)
|
38
|
-
@elements = elements
|
39
|
-
@index = 0
|
40
|
-
@data = {}
|
41
|
-
instance_eval(&block) if block
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Conditions
|
6
|
-
private
|
7
|
-
|
8
|
-
def match_conditions?(conditions)
|
9
|
-
raise "No element (Conditions: #{conditions})" unless current
|
10
|
-
|
11
|
-
conditions.all? { |key, value| match_condition?(key, value) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def match_condition?(key, value)
|
15
|
-
case key.to_sym
|
16
|
-
when :text then match_text_condition?(value)
|
17
|
-
when :name then match_name_condition?(value)
|
18
|
-
else raise "Unknown key condition: (#{key})"
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def match_name_condition?(tag_name)
|
23
|
-
current.name.casecmp(tag_name.to_s).zero?
|
24
|
-
end
|
25
|
-
|
26
|
-
def match_text_condition?(texts)
|
27
|
-
texts = [texts.to_s] unless texts.is_a?(Array)
|
28
|
-
texts.all? { |t| current.text.downcase.include?(t.downcase) }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/options_consumer'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class DomElementsTraverser
|
7
|
-
module Cursor
|
8
|
-
private
|
9
|
-
|
10
|
-
def current
|
11
|
-
@elements[@index]
|
12
|
-
end
|
13
|
-
|
14
|
-
def skip
|
15
|
-
@index += 1
|
16
|
-
end
|
17
|
-
|
18
|
-
def skip_until(options)
|
19
|
-
oc = ::EacRubyUtils::OptionsConsumer.new(options)
|
20
|
-
optional = oc.consume(:optional, false)
|
21
|
-
while current
|
22
|
-
break if match_conditions?(oc.left_data)
|
23
|
-
|
24
|
-
skip
|
25
|
-
end
|
26
|
-
raise "No element found for conditions #{oc.left_data}" unless current || optional
|
27
|
-
|
28
|
-
current
|
29
|
-
end
|
30
|
-
|
31
|
-
def skip_until_after(conditions)
|
32
|
-
skip_until(conditions)
|
33
|
-
skip
|
34
|
-
current
|
35
|
-
end
|
36
|
-
|
37
|
-
def if_found(conditions, &block)
|
38
|
-
marked = @index
|
39
|
-
skip_until({ optional: true }.merge(conditions))
|
40
|
-
if current
|
41
|
-
instance_eval(&block) if block
|
42
|
-
else
|
43
|
-
@index = marked
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Data
|
6
|
-
def data
|
7
|
-
@data.dup
|
8
|
-
end
|
9
|
-
|
10
|
-
private
|
11
|
-
|
12
|
-
def store(key, options = {}, &converter)
|
13
|
-
validate(options)
|
14
|
-
value = store_value(options, converter)
|
15
|
-
@data[key] = value
|
16
|
-
r = current
|
17
|
-
skip
|
18
|
-
r
|
19
|
-
end
|
20
|
-
|
21
|
-
def store_value(options, converter)
|
22
|
-
value = if options.key?(:attribute)
|
23
|
-
current.attribute(options[:attribute]).value
|
24
|
-
else
|
25
|
-
current.text.strip
|
26
|
-
end
|
27
|
-
converter ? converter.call(value) : value
|
28
|
-
end
|
29
|
-
|
30
|
-
def validate(options)
|
31
|
-
return unless options.key?(:validate)
|
32
|
-
return if match_conditions?(options[:validate])
|
33
|
-
|
34
|
-
raise "Element does not match conditions #{options[:validate]}" \
|
35
|
-
" (Element: |#{current}|#{current.name}|)"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/aranha/engine.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class Engine < ::Rails::Engine
|
5
|
-
isolate_namespace Aranha
|
6
|
-
|
7
|
-
initializer :append_migrations do |app|
|
8
|
-
config.paths['db/migrate'].expanded.each do |expanded_path|
|
9
|
-
app.config.paths['db/migrate'] << expanded_path
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/lib/aranha/fixtures.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/parsers/base'
|
4
|
-
require 'aranha/parsers/source_address'
|
5
|
-
require 'aranha/parsers/source_target_fixtures'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
module Fixtures
|
9
|
-
class Download
|
10
|
-
attr_reader :pending
|
11
|
-
|
12
|
-
def initialize(options)
|
13
|
-
@prefix = options.fetch(:prefix)
|
14
|
-
@prefix = '' if @prefix.blank?
|
15
|
-
@download = options.fetch(:download)
|
16
|
-
@pending = options.fetch(:pending)
|
17
|
-
end
|
18
|
-
|
19
|
-
def run
|
20
|
-
url_files.each do |f|
|
21
|
-
Rails.logger.info(relative_path(f))
|
22
|
-
download(url(f), target(f)) if @download
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def url_files
|
29
|
-
Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
|
30
|
-
end
|
31
|
-
|
32
|
-
def select_path?(path)
|
33
|
-
return false unless match_prefix_pattern(path)
|
34
|
-
|
35
|
-
!pending || !source_exist?(path)
|
36
|
-
end
|
37
|
-
|
38
|
-
def match_prefix_pattern(path)
|
39
|
-
relative_path(path).start_with?(@prefix)
|
40
|
-
end
|
41
|
-
|
42
|
-
def fixtures_root
|
43
|
-
Rails.root.to_s
|
44
|
-
end
|
45
|
-
|
46
|
-
def download(url, target)
|
47
|
-
Rails.logger.info "Baixando \"#{url}\"..."
|
48
|
-
content = ::Aranha::Parsers::Base.new(url).content
|
49
|
-
raise "Content is blank for \"#{url}\"" if content.blank?
|
50
|
-
|
51
|
-
File.open(target, 'wb') { |file| file.write(content) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def url(file)
|
55
|
-
::Aranha::Parsers::SourceAddress.from_file(file)
|
56
|
-
end
|
57
|
-
|
58
|
-
def target(file)
|
59
|
-
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
60
|
-
end
|
61
|
-
|
62
|
-
def relative_path(path)
|
63
|
-
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
64
|
-
end
|
65
|
-
|
66
|
-
def source_exist?(path)
|
67
|
-
stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
|
68
|
-
stf.source_file(::File.basename(path, '.url')).present?
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/lib/tasks/aranha_tasks.rake
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
namespace(:aranha) do
|
4
|
-
task process: :environment do
|
5
|
-
::Aranha::Processor.new
|
6
|
-
end
|
7
|
-
|
8
|
-
task clear: :environment do
|
9
|
-
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
|
-
end
|
11
|
-
|
12
|
-
namespace :fixtures do
|
13
|
-
desc 'Download remote content for fixtures.'
|
14
|
-
task download: :environment do
|
15
|
-
::Aranha::Fixtures::Download.new(
|
16
|
-
prefix: ENV['PREFIX'],
|
17
|
-
download: ENV['DOWNLOAD'].present?,
|
18
|
-
pending: ENV['PENDING'].present?
|
19
|
-
).run
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|