aranha 0.14.5 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.rdoc +1 -1
- data/lib/aranha.rb +2 -10
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +3 -7
- data/lib/aranha/manager.rb +64 -0
- data/lib/aranha/processor.rb +21 -36
- data/lib/aranha/version.rb +1 -1
- metadata +10 -83
- data/app/assets/javascripts/aranha/application.js +0 -14
- data/app/assets/stylesheets/aranha/application.css +0 -16
- data/app/controllers/aranha/addresses_controller.rb +0 -8
- data/app/helpers/aranha/application_helper.rb +0 -6
- data/app/models/aranha/address.rb +0 -98
- data/app/views/layouts/aranha/application.html.erb +0 -12
- data/config/locales/en.yml +0 -6
- data/config/locales/pt-BR.yml +0 -6
- data/config/routes.rb +0 -6
- data/db/migrate/20171201021251_create_aranha_addresses.rb +0 -15
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +0 -9
- data/lib/aranha/dom_elements_traverser.rb +0 -44
- data/lib/aranha/dom_elements_traverser/conditions.rb +0 -32
- data/lib/aranha/dom_elements_traverser/cursor.rb +0 -48
- data/lib/aranha/dom_elements_traverser/data.rb +0 -39
- data/lib/aranha/engine.rb +0 -13
- data/lib/aranha/fixtures.rb +0 -7
- data/lib/aranha/fixtures/download.rb +0 -72
- data/lib/tasks/aranha_tasks.rake +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
|
4
|
+
data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
|
7
|
+
data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
|
data/README.rdoc
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'active_support/dependencies'
|
5
|
-
require 'active_scaffold'
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
6
4
|
|
7
5
|
module Aranha
|
8
|
-
|
9
|
-
require 'aranha/dom_elements_traverser'
|
10
|
-
require 'aranha/engine'
|
11
|
-
require 'aranha/fixtures'
|
12
|
-
require 'aranha/processor'
|
13
|
-
require 'aranha/parsers'
|
14
|
-
require 'aranha/selenium'
|
6
|
+
require_sub __FILE__
|
15
7
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -1,11 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
class DefaultProcessor
|
7
|
-
attr_reader :source_uri, :extra_data
|
8
|
-
|
9
8
|
class << self
|
10
9
|
def sanitize_uri(uri)
|
11
10
|
return uri if uri.is_a?(Hash)
|
@@ -15,17 +14,14 @@ module Aranha
|
|
15
14
|
end
|
16
15
|
end
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
@extra_data = extra_data
|
17
|
+
common_constructor :source_uri, :extra_data do
|
18
|
+
self.source_uri = self.class.sanitize_uri(source_uri)
|
21
19
|
end
|
22
20
|
|
23
21
|
def process
|
24
22
|
raise 'Implement method process'
|
25
23
|
end
|
26
24
|
|
27
|
-
protected
|
28
|
-
|
29
25
|
def target_uri
|
30
26
|
source_uri
|
31
27
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/abstract_methods'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class Manager
|
7
|
+
include ::EacRubyUtils::AbstractMethods
|
8
|
+
|
9
|
+
class << self
|
10
|
+
attr_accessor :default
|
11
|
+
end
|
12
|
+
|
13
|
+
def addresses_count
|
14
|
+
raise_abstract_method(__method__)
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_address(_uri, _processor_class, _extra_data = nil)
|
18
|
+
raise_abstract_method(__method__)
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_start_point(uri, processor_class, extra_data = nil)
|
22
|
+
start_points_var << ::EacRubyUtils::Struct.new(
|
23
|
+
uri: uri, processor_class: processor_class, extra_data: extra_data
|
24
|
+
)
|
25
|
+
end
|
26
|
+
|
27
|
+
def clear_expired_addresses
|
28
|
+
raise_abstract_method(__method__)
|
29
|
+
end
|
30
|
+
|
31
|
+
def init
|
32
|
+
clear_expired_addresses
|
33
|
+
start_points_to_addresses
|
34
|
+
end
|
35
|
+
|
36
|
+
def log_info(_message)
|
37
|
+
raise_abstract_method(__method__)
|
38
|
+
end
|
39
|
+
|
40
|
+
def log_warn(_message)
|
41
|
+
raise_abstract_method(__method__)
|
42
|
+
end
|
43
|
+
|
44
|
+
def start_points
|
45
|
+
start_points_var.to_enum
|
46
|
+
end
|
47
|
+
|
48
|
+
def start_points_to_addresses
|
49
|
+
start_points_var.each do |sp|
|
50
|
+
add_address(sp.uri, sp.processor_class, sp.extra_data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def unprocessed_addresses
|
55
|
+
raise_abstract_method(__method__)
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def start_points_var
|
61
|
+
@start_points_var ||= []
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,30 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'net/http'
|
4
|
+
require 'httpclient'
|
4
5
|
require 'aranha/parsers/invalid_state_exception'
|
6
|
+
require 'aranha/manager'
|
5
7
|
|
6
8
|
module Aranha
|
7
9
|
class Processor
|
8
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
9
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
10
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
11
|
-
HTTPCLIENT_EXCEPTIONS = [
|
12
|
-
::HTTPClient::BadResponseError,
|
13
|
-
::HTTPClient::ConnectTimeoutError,
|
14
|
-
::HTTPClient::ReceiveTimeoutError
|
15
|
-
].freeze
|
16
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
17
|
-
|
18
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
19
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
20
|
-
|
21
10
|
DEFAULT_MAX_TRIES = 3
|
22
11
|
|
23
|
-
|
24
|
-
|
25
|
-
|
12
|
+
attr_reader :manager
|
13
|
+
|
14
|
+
def initialize(manager = nil)
|
15
|
+
@manager = manager || ::Aranha::Manager.default
|
26
16
|
@failed = {}
|
27
17
|
@try = 0
|
18
|
+
self.manager.init
|
28
19
|
process_loop
|
29
20
|
raise "Addresses failed: #{@failed.count}" if @failed.any?
|
30
21
|
end
|
@@ -32,7 +23,7 @@ module Aranha
|
|
32
23
|
private
|
33
24
|
|
34
25
|
def process_loop
|
35
|
-
|
26
|
+
manager.log_info("Max tries: #{max_tries_s}")
|
36
27
|
loop do
|
37
28
|
break if process_next_address
|
38
29
|
end
|
@@ -52,22 +43,22 @@ module Aranha
|
|
52
43
|
end
|
53
44
|
|
54
45
|
def process_address(address)
|
55
|
-
|
56
|
-
" Unprocessed: #{unprocessed.count}/#{Aranha::
|
57
|
-
|
58
|
-
|
59
|
-
@failed.delete(address.id)
|
60
|
-
|
61
|
-
process_exception(
|
46
|
+
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
47
|
+
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
62
53
|
end
|
63
54
|
end
|
64
55
|
|
65
|
-
def process_exception(
|
66
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
67
58
|
|
68
|
-
@failed[address.id] ||= 0
|
69
|
-
@failed[address.id] += 1
|
70
|
-
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
71
62
|
end
|
72
63
|
|
73
64
|
def next_address
|
@@ -75,13 +66,7 @@ module Aranha
|
|
75
66
|
end
|
76
67
|
|
77
68
|
def unprocessed
|
78
|
-
::Aranha::
|
79
|
-
end
|
80
|
-
|
81
|
-
def network_exception?(exception)
|
82
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
83
|
-
|
84
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
69
|
+
::Aranha::Manager.default.unprocessed_addresses
|
85
70
|
end
|
86
71
|
|
87
72
|
def not_try_ids
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: active_scaffold
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 3.4.41.1
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 3.4.41.1
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: aranha-parsers
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,34 +44,20 @@ dependencies:
|
|
58
44
|
- - ">="
|
59
45
|
- !ruby/object:Gem::Version
|
60
46
|
version: 0.1.2
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: eac_rails_utils
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '0.11'
|
68
|
-
type: :runtime
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '0.11'
|
75
47
|
- !ruby/object:Gem::Dependency
|
76
48
|
name: eac_ruby_utils
|
77
49
|
requirement: !ruby/object:Gem::Requirement
|
78
50
|
requirements:
|
79
51
|
- - "~>"
|
80
52
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
53
|
+
version: '0.52'
|
82
54
|
type: :runtime
|
83
55
|
prerelease: false
|
84
56
|
version_requirements: !ruby/object:Gem::Requirement
|
85
57
|
requirements:
|
86
58
|
- - "~>"
|
87
59
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
60
|
+
version: '0.52'
|
89
61
|
- !ruby/object:Gem::Dependency
|
90
62
|
name: httpclient
|
91
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,48 +72,20 @@ dependencies:
|
|
100
72
|
- - ">="
|
101
73
|
- !ruby/object:Gem::Version
|
102
74
|
version: '2.6'
|
103
|
-
- !ruby/object:Gem::Dependency
|
104
|
-
name: rails
|
105
|
-
requirement: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - ">="
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: 4.2.11.3
|
110
|
-
type: :runtime
|
111
|
-
prerelease: false
|
112
|
-
version_requirements: !ruby/object:Gem::Requirement
|
113
|
-
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 4.2.11.3
|
117
75
|
- !ruby/object:Gem::Dependency
|
118
76
|
name: eac_ruby_gem_support
|
119
77
|
requirement: !ruby/object:Gem::Requirement
|
120
78
|
requirements:
|
121
79
|
- - "~>"
|
122
80
|
- !ruby/object:Gem::Version
|
123
|
-
version: '0.
|
81
|
+
version: '0.2'
|
124
82
|
type: :development
|
125
83
|
prerelease: false
|
126
84
|
version_requirements: !ruby/object:Gem::Requirement
|
127
85
|
requirements:
|
128
86
|
- - "~>"
|
129
87
|
- !ruby/object:Gem::Version
|
130
|
-
version: '0.
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: sqlite3
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - ">="
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '0'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '0'
|
88
|
+
version: '0.2'
|
145
89
|
description:
|
146
90
|
email:
|
147
91
|
- eduardobogoni@gmail.com
|
@@ -151,29 +95,12 @@ extra_rdoc_files: []
|
|
151
95
|
files:
|
152
96
|
- MIT-LICENSE
|
153
97
|
- README.rdoc
|
154
|
-
- app/assets/javascripts/aranha/application.js
|
155
|
-
- app/assets/stylesheets/aranha/application.css
|
156
|
-
- app/controllers/aranha/addresses_controller.rb
|
157
|
-
- app/helpers/aranha/application_helper.rb
|
158
|
-
- app/models/aranha/address.rb
|
159
|
-
- app/views/layouts/aranha/application.html.erb
|
160
|
-
- config/locales/en.yml
|
161
|
-
- config/locales/pt-BR.yml
|
162
|
-
- config/routes.rb
|
163
|
-
- db/migrate/20171201021251_create_aranha_addresses.rb
|
164
|
-
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
165
98
|
- lib/aranha.rb
|
99
|
+
- lib/aranha/address_processor.rb
|
166
100
|
- lib/aranha/default_processor.rb
|
167
|
-
- lib/aranha/
|
168
|
-
- lib/aranha/dom_elements_traverser/conditions.rb
|
169
|
-
- lib/aranha/dom_elements_traverser/cursor.rb
|
170
|
-
- lib/aranha/dom_elements_traverser/data.rb
|
171
|
-
- lib/aranha/engine.rb
|
172
|
-
- lib/aranha/fixtures.rb
|
173
|
-
- lib/aranha/fixtures/download.rb
|
101
|
+
- lib/aranha/manager.rb
|
174
102
|
- lib/aranha/processor.rb
|
175
103
|
- lib/aranha/version.rb
|
176
|
-
- lib/tasks/aranha_tasks.rake
|
177
104
|
homepage:
|
178
105
|
licenses:
|
179
106
|
- MIT
|
@@ -193,8 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
120
|
- !ruby/object:Gem::Version
|
194
121
|
version: '0'
|
195
122
|
requirements: []
|
196
|
-
rubygems_version: 3.
|
123
|
+
rubygems_version: 3.1.6
|
197
124
|
signing_key:
|
198
125
|
specification_version: 4
|
199
|
-
summary:
|
126
|
+
summary: Ruby utilities for web crawling.
|
200
127
|
test_files: []
|
@@ -1,14 +0,0 @@
|
|
1
|
-
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
-
// listed below.
|
3
|
-
//
|
4
|
-
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
-
// or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
|
6
|
-
//
|
7
|
-
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
-
// compiled file.
|
9
|
-
//
|
10
|
-
// Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
|
11
|
-
// about supported directives.
|
12
|
-
//
|
13
|
-
//= require_tree .
|
14
|
-
//= require active_scaffold
|
@@ -1,16 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
-
* listed below.
|
4
|
-
*
|
5
|
-
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
-
* or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
|
7
|
-
*
|
8
|
-
* You're free to add application-wide styles to this file and they'll appear at the bottom of the
|
9
|
-
* compiled file so the styles you add here take precedence over styles defined in any styles
|
10
|
-
* defined in the other CSS/SCSS files in this directory. It is generally better to create a new
|
11
|
-
* file per style scope.
|
12
|
-
*
|
13
|
-
*= require_tree .
|
14
|
-
*= require_self
|
15
|
-
*= require active_scaffold
|
16
|
-
*/
|
@@ -1,98 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/yaml'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class Address < ActiveRecord::Base
|
7
|
-
include ::EacRailsUtils::Models::InequalityQueries
|
8
|
-
|
9
|
-
add_inequality_queries(:created_at)
|
10
|
-
|
11
|
-
class << self
|
12
|
-
def set_start_point(url, processor)
|
13
|
-
start_points[url] = processor
|
14
|
-
end
|
15
|
-
|
16
|
-
def add_start_points
|
17
|
-
::Rails.logger.info("Start points: #{start_points.count}")
|
18
|
-
start_points.each do |url, processor|
|
19
|
-
add(url, processor)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def add(url, processor, extra_data = nil)
|
24
|
-
a = find_or_initialize_by(url: sanitize_url(url))
|
25
|
-
a.processor = processor
|
26
|
-
a.extra_data = extra_data.to_yaml
|
27
|
-
a.save!
|
28
|
-
end
|
29
|
-
|
30
|
-
def clear_expired
|
31
|
-
q = by_created_at_lt(Time.zone.now - 12.hours)
|
32
|
-
Rails.logger.info("Addresses expired: #{q.count}")
|
33
|
-
q.destroy_all
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def sanitize_url(url)
|
39
|
-
if url.is_a?(Hash)
|
40
|
-
url.to_yaml
|
41
|
-
else
|
42
|
-
url.to_s
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def start_points
|
47
|
-
@start_points ||= {}
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
validates :url, presence: true, uniqueness: true
|
52
|
-
validates :processor, presence: true
|
53
|
-
|
54
|
-
scope :unprocessed, lambda {
|
55
|
-
where(processed_at: nil)
|
56
|
-
}
|
57
|
-
|
58
|
-
def to_s
|
59
|
-
"#{processor}|#{url}"
|
60
|
-
end
|
61
|
-
|
62
|
-
def process
|
63
|
-
ActiveRecord::Base.transaction do
|
64
|
-
instanciate_processor.process
|
65
|
-
self.processed_at = Time.zone.now
|
66
|
-
save!
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def instanciate_processor
|
73
|
-
processor_instancier.call(*processor_instancier_arguments)
|
74
|
-
end
|
75
|
-
|
76
|
-
def url_to_process
|
77
|
-
::EacRubyUtils::Yaml.load(url)
|
78
|
-
end
|
79
|
-
|
80
|
-
def processor_instancier
|
81
|
-
processor.constantize.method(:new)
|
82
|
-
end
|
83
|
-
|
84
|
-
def processor_instancier_arguments
|
85
|
-
if processor_instancier_arity == 2 || processor_instancier_arity.negative?
|
86
|
-
[url_to_process, EacRubyUtils::Yaml.load(extra_data)]
|
87
|
-
elsif processor_instancier_arity == 1
|
88
|
-
[processor_instancier.call(url_to_process)]
|
89
|
-
else
|
90
|
-
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def processor_instancier_arity
|
95
|
-
processor.constantize.instance_method(:initialize).arity
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
data/config/locales/en.yml
DELETED
data/config/locales/pt-BR.yml
DELETED
data/config/routes.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class CreateAranhaAddresses < (
|
4
|
-
Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
|
5
|
-
)
|
6
|
-
def change
|
7
|
-
create_table :aranha_addresses do |t|
|
8
|
-
t.string :url
|
9
|
-
t.string :processor
|
10
|
-
t.timestamp :processed_at
|
11
|
-
|
12
|
-
t.timestamps null: false
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/dom_elements_traverser/conditions'
|
4
|
-
require 'aranha/dom_elements_traverser/data'
|
5
|
-
require 'aranha/dom_elements_traverser/cursor'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
class DomElementsTraverser
|
9
|
-
include ::Aranha::DomElementsTraverser::Conditions
|
10
|
-
include ::Aranha::DomElementsTraverser::Cursor
|
11
|
-
include ::Aranha::DomElementsTraverser::Data
|
12
|
-
|
13
|
-
class << self
|
14
|
-
def traverse(options, &block)
|
15
|
-
new(elements_from_options(options), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def empty
|
19
|
-
new([])
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def elements_from_options(options)
|
25
|
-
options = ::EacRubyUtils::OptionsConsumer.new(options)
|
26
|
-
elements = nil
|
27
|
-
options.consume(:children_of) { |v| elements = v.children.to_a }
|
28
|
-
raise 'None option of [:children_of] defined' unless elements
|
29
|
-
|
30
|
-
options.validate
|
31
|
-
elements
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def initialize(elements, &block)
|
38
|
-
@elements = elements
|
39
|
-
@index = 0
|
40
|
-
@data = {}
|
41
|
-
instance_eval(&block) if block
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Conditions
|
6
|
-
private
|
7
|
-
|
8
|
-
def match_conditions?(conditions)
|
9
|
-
raise "No element (Conditions: #{conditions})" unless current
|
10
|
-
|
11
|
-
conditions.all? { |key, value| match_condition?(key, value) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def match_condition?(key, value)
|
15
|
-
case key.to_sym
|
16
|
-
when :text then match_text_condition?(value)
|
17
|
-
when :name then match_name_condition?(value)
|
18
|
-
else raise "Unknown key condition: (#{key})"
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def match_name_condition?(tag_name)
|
23
|
-
current.name.casecmp(tag_name.to_s).zero?
|
24
|
-
end
|
25
|
-
|
26
|
-
def match_text_condition?(texts)
|
27
|
-
texts = [texts.to_s] unless texts.is_a?(Array)
|
28
|
-
texts.all? { |t| current.text.downcase.include?(t.downcase) }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/options_consumer'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class DomElementsTraverser
|
7
|
-
module Cursor
|
8
|
-
private
|
9
|
-
|
10
|
-
def current
|
11
|
-
@elements[@index]
|
12
|
-
end
|
13
|
-
|
14
|
-
def skip
|
15
|
-
@index += 1
|
16
|
-
end
|
17
|
-
|
18
|
-
def skip_until(options)
|
19
|
-
oc = ::EacRubyUtils::OptionsConsumer.new(options)
|
20
|
-
optional = oc.consume(:optional, false)
|
21
|
-
while current
|
22
|
-
break if match_conditions?(oc.left_data)
|
23
|
-
|
24
|
-
skip
|
25
|
-
end
|
26
|
-
raise "No element found for conditions #{oc.left_data}" unless current || optional
|
27
|
-
|
28
|
-
current
|
29
|
-
end
|
30
|
-
|
31
|
-
def skip_until_after(conditions)
|
32
|
-
skip_until(conditions)
|
33
|
-
skip
|
34
|
-
current
|
35
|
-
end
|
36
|
-
|
37
|
-
def if_found(conditions, &block)
|
38
|
-
marked = @index
|
39
|
-
skip_until({ optional: true }.merge(conditions))
|
40
|
-
if current
|
41
|
-
instance_eval(&block) if block
|
42
|
-
else
|
43
|
-
@index = marked
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Data
|
6
|
-
def data
|
7
|
-
@data.dup
|
8
|
-
end
|
9
|
-
|
10
|
-
private
|
11
|
-
|
12
|
-
def store(key, options = {}, &converter)
|
13
|
-
validate(options)
|
14
|
-
value = store_value(options, converter)
|
15
|
-
@data[key] = value
|
16
|
-
r = current
|
17
|
-
skip
|
18
|
-
r
|
19
|
-
end
|
20
|
-
|
21
|
-
def store_value(options, converter)
|
22
|
-
value = if options.key?(:attribute)
|
23
|
-
current.attribute(options[:attribute]).value
|
24
|
-
else
|
25
|
-
current.text.strip
|
26
|
-
end
|
27
|
-
converter ? converter.call(value) : value
|
28
|
-
end
|
29
|
-
|
30
|
-
def validate(options)
|
31
|
-
return unless options.key?(:validate)
|
32
|
-
return if match_conditions?(options[:validate])
|
33
|
-
|
34
|
-
raise "Element does not match conditions #{options[:validate]}" \
|
35
|
-
" (Element: |#{current}|#{current.name}|)"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/aranha/engine.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class Engine < ::Rails::Engine
|
5
|
-
isolate_namespace Aranha
|
6
|
-
|
7
|
-
initializer :append_migrations do |app|
|
8
|
-
config.paths['db/migrate'].expanded.each do |expanded_path|
|
9
|
-
app.config.paths['db/migrate'] << expanded_path
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/lib/aranha/fixtures.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/parsers/base'
|
4
|
-
require 'aranha/parsers/source_address'
|
5
|
-
require 'aranha/parsers/source_target_fixtures'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
module Fixtures
|
9
|
-
class Download
|
10
|
-
attr_reader :pending
|
11
|
-
|
12
|
-
def initialize(options)
|
13
|
-
@prefix = options.fetch(:prefix)
|
14
|
-
@prefix = '' if @prefix.blank?
|
15
|
-
@download = options.fetch(:download)
|
16
|
-
@pending = options.fetch(:pending)
|
17
|
-
end
|
18
|
-
|
19
|
-
def run
|
20
|
-
url_files.each do |f|
|
21
|
-
Rails.logger.info(relative_path(f))
|
22
|
-
download(url(f), target(f)) if @download
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def url_files
|
29
|
-
Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
|
30
|
-
end
|
31
|
-
|
32
|
-
def select_path?(path)
|
33
|
-
return false unless match_prefix_pattern(path)
|
34
|
-
|
35
|
-
!pending || !source_exist?(path)
|
36
|
-
end
|
37
|
-
|
38
|
-
def match_prefix_pattern(path)
|
39
|
-
relative_path(path).start_with?(@prefix)
|
40
|
-
end
|
41
|
-
|
42
|
-
def fixtures_root
|
43
|
-
Rails.root.to_s
|
44
|
-
end
|
45
|
-
|
46
|
-
def download(url, target)
|
47
|
-
Rails.logger.info "Baixando \"#{url}\"..."
|
48
|
-
content = ::Aranha::Parsers::Base.new(url).content
|
49
|
-
raise "Content is blank for \"#{url}\"" if content.blank?
|
50
|
-
|
51
|
-
File.open(target, 'wb') { |file| file.write(content) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def url(file)
|
55
|
-
::Aranha::Parsers::SourceAddress.from_file(file)
|
56
|
-
end
|
57
|
-
|
58
|
-
def target(file)
|
59
|
-
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
60
|
-
end
|
61
|
-
|
62
|
-
def relative_path(path)
|
63
|
-
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
64
|
-
end
|
65
|
-
|
66
|
-
def source_exist?(path)
|
67
|
-
stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
|
68
|
-
stf.source_file(::File.basename(path, '.url')).present?
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/lib/tasks/aranha_tasks.rake
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
namespace(:aranha) do
|
4
|
-
task process: :environment do
|
5
|
-
::Aranha::Processor.new
|
6
|
-
end
|
7
|
-
|
8
|
-
task clear: :environment do
|
9
|
-
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
|
-
end
|
11
|
-
|
12
|
-
namespace :fixtures do
|
13
|
-
desc 'Download remote content for fixtures.'
|
14
|
-
task download: :environment do
|
15
|
-
::Aranha::Fixtures::Download.new(
|
16
|
-
prefix: ENV['PREFIX'],
|
17
|
-
download: ENV['DOWNLOAD'].present?,
|
18
|
-
pending: ENV['PENDING'].present?
|
19
|
-
).run
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|