aranha 0.14.5 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.rdoc +1 -1
- data/lib/aranha.rb +2 -10
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +3 -7
- data/lib/aranha/manager.rb +64 -0
- data/lib/aranha/processor.rb +21 -36
- data/lib/aranha/version.rb +1 -1
- metadata +10 -83
- data/app/assets/javascripts/aranha/application.js +0 -14
- data/app/assets/stylesheets/aranha/application.css +0 -16
- data/app/controllers/aranha/addresses_controller.rb +0 -8
- data/app/helpers/aranha/application_helper.rb +0 -6
- data/app/models/aranha/address.rb +0 -98
- data/app/views/layouts/aranha/application.html.erb +0 -12
- data/config/locales/en.yml +0 -6
- data/config/locales/pt-BR.yml +0 -6
- data/config/routes.rb +0 -6
- data/db/migrate/20171201021251_create_aranha_addresses.rb +0 -15
- data/db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb +0 -9
- data/lib/aranha/dom_elements_traverser.rb +0 -44
- data/lib/aranha/dom_elements_traverser/conditions.rb +0 -32
- data/lib/aranha/dom_elements_traverser/cursor.rb +0 -48
- data/lib/aranha/dom_elements_traverser/data.rb +0 -39
- data/lib/aranha/engine.rb +0 -13
- data/lib/aranha/fixtures.rb +0 -7
- data/lib/aranha/fixtures/download.rb +0 -72
- data/lib/tasks/aranha_tasks.rake +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc769d0b380643afe3740301d894fd70e93d3df19d3f6cf4bde1d7633d059d4f
|
4
|
+
data.tar.gz: 8d019e0d77e967ca19ff7325dc0081aa19bcdb114ea1812e49e245d121ca57c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16fd673ddb8be560b62a3dbab2e4e81e4b7b66842d5594ee4eb63639ba7a3bec6a43f239d69918c0417ed7a132069eea9bdf6ac741e41988106cb8f0ec1e0e56
|
7
|
+
data.tar.gz: 7b5c39602e3694ffbca23db1578cb87fc7dab222dfe67dd3b87e4c19fc36277efa339f10692bd2fd77c3d3218fd4201e31229d6d37b75ad209b200cc87b1229b
|
data/README.rdoc
CHANGED
data/lib/aranha.rb
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'active_support/dependencies'
|
5
|
-
require 'active_scaffold'
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
6
4
|
|
7
5
|
module Aranha
|
8
|
-
|
9
|
-
require 'aranha/dom_elements_traverser'
|
10
|
-
require 'aranha/engine'
|
11
|
-
require 'aranha/fixtures'
|
12
|
-
require 'aranha/processor'
|
13
|
-
require 'aranha/parsers'
|
14
|
-
require 'aranha/selenium'
|
6
|
+
require_sub __FILE__
|
15
7
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class AddressProcessor
|
7
|
+
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
8
|
+
CORE_EXCEPTIONS = [::SocketError].freeze
|
9
|
+
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
10
|
+
HTTPCLIENT_EXCEPTIONS = [
|
11
|
+
::HTTPClient::BadResponseError,
|
12
|
+
::HTTPClient::ConnectTimeoutError,
|
13
|
+
::HTTPClient::ReceiveTimeoutError
|
14
|
+
].freeze
|
15
|
+
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
16
|
+
|
17
|
+
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
18
|
+
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
19
|
+
|
20
|
+
class << self
|
21
|
+
def rescuable_error?(error)
|
22
|
+
return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
|
23
|
+
|
24
|
+
error.cause.present? ? network_error?(error.cause) : false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
enable_simple_cache
|
29
|
+
common_constructor :address
|
30
|
+
|
31
|
+
def successful?
|
32
|
+
error.blank?
|
33
|
+
end
|
34
|
+
|
35
|
+
def rescuable_error?
|
36
|
+
self.class.rescuable_error?(error)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def error_uncached
|
42
|
+
address.process
|
43
|
+
nil
|
44
|
+
rescue ::StandardError => e
|
45
|
+
e
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -1,11 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
class DefaultProcessor
|
7
|
-
attr_reader :source_uri, :extra_data
|
8
|
-
|
9
8
|
class << self
|
10
9
|
def sanitize_uri(uri)
|
11
10
|
return uri if uri.is_a?(Hash)
|
@@ -15,17 +14,14 @@ module Aranha
|
|
15
14
|
end
|
16
15
|
end
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
@extra_data = extra_data
|
17
|
+
common_constructor :source_uri, :extra_data do
|
18
|
+
self.source_uri = self.class.sanitize_uri(source_uri)
|
21
19
|
end
|
22
20
|
|
23
21
|
def process
|
24
22
|
raise 'Implement method process'
|
25
23
|
end
|
26
24
|
|
27
|
-
protected
|
28
|
-
|
29
25
|
def target_uri
|
30
26
|
source_uri
|
31
27
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/abstract_methods'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
class Manager
|
7
|
+
include ::EacRubyUtils::AbstractMethods
|
8
|
+
|
9
|
+
class << self
|
10
|
+
attr_accessor :default
|
11
|
+
end
|
12
|
+
|
13
|
+
def addresses_count
|
14
|
+
raise_abstract_method(__method__)
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_address(_uri, _processor_class, _extra_data = nil)
|
18
|
+
raise_abstract_method(__method__)
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_start_point(uri, processor_class, extra_data = nil)
|
22
|
+
start_points_var << ::EacRubyUtils::Struct.new(
|
23
|
+
uri: uri, processor_class: processor_class, extra_data: extra_data
|
24
|
+
)
|
25
|
+
end
|
26
|
+
|
27
|
+
def clear_expired_addresses
|
28
|
+
raise_abstract_method(__method__)
|
29
|
+
end
|
30
|
+
|
31
|
+
def init
|
32
|
+
clear_expired_addresses
|
33
|
+
start_points_to_addresses
|
34
|
+
end
|
35
|
+
|
36
|
+
def log_info(_message)
|
37
|
+
raise_abstract_method(__method__)
|
38
|
+
end
|
39
|
+
|
40
|
+
def log_warn(_message)
|
41
|
+
raise_abstract_method(__method__)
|
42
|
+
end
|
43
|
+
|
44
|
+
def start_points
|
45
|
+
start_points_var.to_enum
|
46
|
+
end
|
47
|
+
|
48
|
+
def start_points_to_addresses
|
49
|
+
start_points_var.each do |sp|
|
50
|
+
add_address(sp.uri, sp.processor_class, sp.extra_data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def unprocessed_addresses
|
55
|
+
raise_abstract_method(__method__)
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def start_points_var
|
61
|
+
@start_points_var ||= []
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/aranha/processor.rb
CHANGED
@@ -1,30 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'net/http'
|
4
|
+
require 'httpclient'
|
4
5
|
require 'aranha/parsers/invalid_state_exception'
|
6
|
+
require 'aranha/manager'
|
5
7
|
|
6
8
|
module Aranha
|
7
9
|
class Processor
|
8
|
-
ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
|
9
|
-
CORE_EXCEPTIONS = [::SocketError].freeze
|
10
|
-
ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
|
11
|
-
HTTPCLIENT_EXCEPTIONS = [
|
12
|
-
::HTTPClient::BadResponseError,
|
13
|
-
::HTTPClient::ConnectTimeoutError,
|
14
|
-
::HTTPClient::ReceiveTimeoutError
|
15
|
-
].freeze
|
16
|
-
NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
|
17
|
-
|
18
|
-
NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
|
19
|
-
HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
|
20
|
-
|
21
10
|
DEFAULT_MAX_TRIES = 3
|
22
11
|
|
23
|
-
|
24
|
-
|
25
|
-
|
12
|
+
attr_reader :manager
|
13
|
+
|
14
|
+
def initialize(manager = nil)
|
15
|
+
@manager = manager || ::Aranha::Manager.default
|
26
16
|
@failed = {}
|
27
17
|
@try = 0
|
18
|
+
self.manager.init
|
28
19
|
process_loop
|
29
20
|
raise "Addresses failed: #{@failed.count}" if @failed.any?
|
30
21
|
end
|
@@ -32,7 +23,7 @@ module Aranha
|
|
32
23
|
private
|
33
24
|
|
34
25
|
def process_loop
|
35
|
-
|
26
|
+
manager.log_info("Max tries: #{max_tries_s}")
|
36
27
|
loop do
|
37
28
|
break if process_next_address
|
38
29
|
end
|
@@ -52,22 +43,22 @@ module Aranha
|
|
52
43
|
end
|
53
44
|
|
54
45
|
def process_address(address)
|
55
|
-
|
56
|
-
" Unprocessed: #{unprocessed.count}/#{Aranha::
|
57
|
-
|
58
|
-
|
59
|
-
@failed.delete(address.id)
|
60
|
-
|
61
|
-
process_exception(
|
46
|
+
manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
|
47
|
+
" Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
|
48
|
+
ap = ::Aranha::AddressProcessor.new(address)
|
49
|
+
if ap.successful?
|
50
|
+
@failed.delete(ap.address.id)
|
51
|
+
else
|
52
|
+
process_exception(ap)
|
62
53
|
end
|
63
54
|
end
|
64
55
|
|
65
|
-
def process_exception(
|
66
|
-
raise
|
56
|
+
def process_exception(address_processor)
|
57
|
+
raise address_processor.error unless address_processor.rescuable_error?
|
67
58
|
|
68
|
-
@failed[address.id] ||= 0
|
69
|
-
@failed[address.id] += 1
|
70
|
-
|
59
|
+
@failed[address_processor.address.id] ||= 0
|
60
|
+
@failed[address_processor.address.id] += 1
|
61
|
+
manager.log_warn(address_processor.error)
|
71
62
|
end
|
72
63
|
|
73
64
|
def next_address
|
@@ -75,13 +66,7 @@ module Aranha
|
|
75
66
|
end
|
76
67
|
|
77
68
|
def unprocessed
|
78
|
-
::Aranha::
|
79
|
-
end
|
80
|
-
|
81
|
-
def network_exception?(exception)
|
82
|
-
return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
|
83
|
-
|
84
|
-
exception.cause.present? ? network_exception?(exception.cause) : false
|
69
|
+
::Aranha::Manager.default.unprocessed_addresses
|
85
70
|
end
|
86
71
|
|
87
72
|
def not_try_ids
|
data/lib/aranha/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.5
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: active_scaffold
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 3.4.41.1
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 3.4.41.1
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: aranha-parsers
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,34 +44,20 @@ dependencies:
|
|
58
44
|
- - ">="
|
59
45
|
- !ruby/object:Gem::Version
|
60
46
|
version: 0.1.2
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: eac_rails_utils
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '0.11'
|
68
|
-
type: :runtime
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '0.11'
|
75
47
|
- !ruby/object:Gem::Dependency
|
76
48
|
name: eac_ruby_utils
|
77
49
|
requirement: !ruby/object:Gem::Requirement
|
78
50
|
requirements:
|
79
51
|
- - "~>"
|
80
52
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
53
|
+
version: '0.52'
|
82
54
|
type: :runtime
|
83
55
|
prerelease: false
|
84
56
|
version_requirements: !ruby/object:Gem::Requirement
|
85
57
|
requirements:
|
86
58
|
- - "~>"
|
87
59
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
60
|
+
version: '0.52'
|
89
61
|
- !ruby/object:Gem::Dependency
|
90
62
|
name: httpclient
|
91
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,48 +72,20 @@ dependencies:
|
|
100
72
|
- - ">="
|
101
73
|
- !ruby/object:Gem::Version
|
102
74
|
version: '2.6'
|
103
|
-
- !ruby/object:Gem::Dependency
|
104
|
-
name: rails
|
105
|
-
requirement: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - ">="
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: 4.2.11.3
|
110
|
-
type: :runtime
|
111
|
-
prerelease: false
|
112
|
-
version_requirements: !ruby/object:Gem::Requirement
|
113
|
-
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 4.2.11.3
|
117
75
|
- !ruby/object:Gem::Dependency
|
118
76
|
name: eac_ruby_gem_support
|
119
77
|
requirement: !ruby/object:Gem::Requirement
|
120
78
|
requirements:
|
121
79
|
- - "~>"
|
122
80
|
- !ruby/object:Gem::Version
|
123
|
-
version: '0.
|
81
|
+
version: '0.2'
|
124
82
|
type: :development
|
125
83
|
prerelease: false
|
126
84
|
version_requirements: !ruby/object:Gem::Requirement
|
127
85
|
requirements:
|
128
86
|
- - "~>"
|
129
87
|
- !ruby/object:Gem::Version
|
130
|
-
version: '0.
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: sqlite3
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - ">="
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '0'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '0'
|
88
|
+
version: '0.2'
|
145
89
|
description:
|
146
90
|
email:
|
147
91
|
- eduardobogoni@gmail.com
|
@@ -151,29 +95,12 @@ extra_rdoc_files: []
|
|
151
95
|
files:
|
152
96
|
- MIT-LICENSE
|
153
97
|
- README.rdoc
|
154
|
-
- app/assets/javascripts/aranha/application.js
|
155
|
-
- app/assets/stylesheets/aranha/application.css
|
156
|
-
- app/controllers/aranha/addresses_controller.rb
|
157
|
-
- app/helpers/aranha/application_helper.rb
|
158
|
-
- app/models/aranha/address.rb
|
159
|
-
- app/views/layouts/aranha/application.html.erb
|
160
|
-
- config/locales/en.yml
|
161
|
-
- config/locales/pt-BR.yml
|
162
|
-
- config/routes.rb
|
163
|
-
- db/migrate/20171201021251_create_aranha_addresses.rb
|
164
|
-
- db/migrate/20181125042102_add_extra_data_to_aranha_addresses.rb
|
165
98
|
- lib/aranha.rb
|
99
|
+
- lib/aranha/address_processor.rb
|
166
100
|
- lib/aranha/default_processor.rb
|
167
|
-
- lib/aranha/dom_elements_traverser.rb
|
168
|
-
- lib/aranha/dom_elements_traverser/conditions.rb
|
169
|
-
- lib/aranha/dom_elements_traverser/cursor.rb
|
170
|
-
- lib/aranha/dom_elements_traverser/data.rb
|
171
|
-
- lib/aranha/engine.rb
|
172
|
-
- lib/aranha/fixtures.rb
|
173
|
-
- lib/aranha/fixtures/download.rb
|
101
|
+
- lib/aranha/manager.rb
|
174
102
|
- lib/aranha/processor.rb
|
175
103
|
- lib/aranha/version.rb
|
176
|
-
- lib/tasks/aranha_tasks.rake
|
177
104
|
homepage:
|
178
105
|
licenses:
|
179
106
|
- MIT
|
@@ -193,8 +120,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
120
|
- !ruby/object:Gem::Version
|
194
121
|
version: '0'
|
195
122
|
requirements: []
|
196
|
-
rubygems_version: 3.
|
123
|
+
rubygems_version: 3.1.6
|
197
124
|
signing_key:
|
198
125
|
specification_version: 4
|
199
|
-
summary:
|
126
|
+
summary: Ruby utilities for web crawling.
|
200
127
|
test_files: []
|
@@ -1,14 +0,0 @@
|
|
1
|
-
// This is a manifest file that'll be compiled into application.js, which will include all the files
|
2
|
-
// listed below.
|
3
|
-
//
|
4
|
-
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
|
5
|
-
// or any plugin's vendor/assets/javascripts directory can be referenced here using a relative path.
|
6
|
-
//
|
7
|
-
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
|
8
|
-
// compiled file.
|
9
|
-
//
|
10
|
-
// Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details
|
11
|
-
// about supported directives.
|
12
|
-
//
|
13
|
-
//= require_tree .
|
14
|
-
//= require active_scaffold
|
@@ -1,16 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
3
|
-
* listed below.
|
4
|
-
*
|
5
|
-
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
6
|
-
* or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
|
7
|
-
*
|
8
|
-
* You're free to add application-wide styles to this file and they'll appear at the bottom of the
|
9
|
-
* compiled file so the styles you add here take precedence over styles defined in any styles
|
10
|
-
* defined in the other CSS/SCSS files in this directory. It is generally better to create a new
|
11
|
-
* file per style scope.
|
12
|
-
*
|
13
|
-
*= require_tree .
|
14
|
-
*= require_self
|
15
|
-
*= require active_scaffold
|
16
|
-
*/
|
@@ -1,98 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/yaml'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class Address < ActiveRecord::Base
|
7
|
-
include ::EacRailsUtils::Models::InequalityQueries
|
8
|
-
|
9
|
-
add_inequality_queries(:created_at)
|
10
|
-
|
11
|
-
class << self
|
12
|
-
def set_start_point(url, processor)
|
13
|
-
start_points[url] = processor
|
14
|
-
end
|
15
|
-
|
16
|
-
def add_start_points
|
17
|
-
::Rails.logger.info("Start points: #{start_points.count}")
|
18
|
-
start_points.each do |url, processor|
|
19
|
-
add(url, processor)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def add(url, processor, extra_data = nil)
|
24
|
-
a = find_or_initialize_by(url: sanitize_url(url))
|
25
|
-
a.processor = processor
|
26
|
-
a.extra_data = extra_data.to_yaml
|
27
|
-
a.save!
|
28
|
-
end
|
29
|
-
|
30
|
-
def clear_expired
|
31
|
-
q = by_created_at_lt(Time.zone.now - 12.hours)
|
32
|
-
Rails.logger.info("Addresses expired: #{q.count}")
|
33
|
-
q.destroy_all
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def sanitize_url(url)
|
39
|
-
if url.is_a?(Hash)
|
40
|
-
url.to_yaml
|
41
|
-
else
|
42
|
-
url.to_s
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def start_points
|
47
|
-
@start_points ||= {}
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
validates :url, presence: true, uniqueness: true
|
52
|
-
validates :processor, presence: true
|
53
|
-
|
54
|
-
scope :unprocessed, lambda {
|
55
|
-
where(processed_at: nil)
|
56
|
-
}
|
57
|
-
|
58
|
-
def to_s
|
59
|
-
"#{processor}|#{url}"
|
60
|
-
end
|
61
|
-
|
62
|
-
def process
|
63
|
-
ActiveRecord::Base.transaction do
|
64
|
-
instanciate_processor.process
|
65
|
-
self.processed_at = Time.zone.now
|
66
|
-
save!
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def instanciate_processor
|
73
|
-
processor_instancier.call(*processor_instancier_arguments)
|
74
|
-
end
|
75
|
-
|
76
|
-
def url_to_process
|
77
|
-
::EacRubyUtils::Yaml.load(url)
|
78
|
-
end
|
79
|
-
|
80
|
-
def processor_instancier
|
81
|
-
processor.constantize.method(:new)
|
82
|
-
end
|
83
|
-
|
84
|
-
def processor_instancier_arguments
|
85
|
-
if processor_instancier_arity == 2 || processor_instancier_arity.negative?
|
86
|
-
[url_to_process, EacRubyUtils::Yaml.load(extra_data)]
|
87
|
-
elsif processor_instancier_arity == 1
|
88
|
-
[processor_instancier.call(url_to_process)]
|
89
|
-
else
|
90
|
-
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def processor_instancier_arity
|
95
|
-
processor.constantize.instance_method(:initialize).arity
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
data/config/locales/en.yml
DELETED
data/config/locales/pt-BR.yml
DELETED
data/config/routes.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class CreateAranhaAddresses < (
|
4
|
-
Rails.version < '5' ? ActiveRecord::Migration : ActiveRecord::Migration[4.2]
|
5
|
-
)
|
6
|
-
def change
|
7
|
-
create_table :aranha_addresses do |t|
|
8
|
-
t.string :url
|
9
|
-
t.string :processor
|
10
|
-
t.timestamp :processed_at
|
11
|
-
|
12
|
-
t.timestamps null: false
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/dom_elements_traverser/conditions'
|
4
|
-
require 'aranha/dom_elements_traverser/data'
|
5
|
-
require 'aranha/dom_elements_traverser/cursor'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
class DomElementsTraverser
|
9
|
-
include ::Aranha::DomElementsTraverser::Conditions
|
10
|
-
include ::Aranha::DomElementsTraverser::Cursor
|
11
|
-
include ::Aranha::DomElementsTraverser::Data
|
12
|
-
|
13
|
-
class << self
|
14
|
-
def traverse(options, &block)
|
15
|
-
new(elements_from_options(options), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def empty
|
19
|
-
new([])
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def elements_from_options(options)
|
25
|
-
options = ::EacRubyUtils::OptionsConsumer.new(options)
|
26
|
-
elements = nil
|
27
|
-
options.consume(:children_of) { |v| elements = v.children.to_a }
|
28
|
-
raise 'None option of [:children_of] defined' unless elements
|
29
|
-
|
30
|
-
options.validate
|
31
|
-
elements
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def initialize(elements, &block)
|
38
|
-
@elements = elements
|
39
|
-
@index = 0
|
40
|
-
@data = {}
|
41
|
-
instance_eval(&block) if block
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Conditions
|
6
|
-
private
|
7
|
-
|
8
|
-
def match_conditions?(conditions)
|
9
|
-
raise "No element (Conditions: #{conditions})" unless current
|
10
|
-
|
11
|
-
conditions.all? { |key, value| match_condition?(key, value) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def match_condition?(key, value)
|
15
|
-
case key.to_sym
|
16
|
-
when :text then match_text_condition?(value)
|
17
|
-
when :name then match_name_condition?(value)
|
18
|
-
else raise "Unknown key condition: (#{key})"
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def match_name_condition?(tag_name)
|
23
|
-
current.name.casecmp(tag_name.to_s).zero?
|
24
|
-
end
|
25
|
-
|
26
|
-
def match_text_condition?(texts)
|
27
|
-
texts = [texts.to_s] unless texts.is_a?(Array)
|
28
|
-
texts.all? { |t| current.text.downcase.include?(t.downcase) }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'eac_ruby_utils/options_consumer'
|
4
|
-
|
5
|
-
module Aranha
|
6
|
-
class DomElementsTraverser
|
7
|
-
module Cursor
|
8
|
-
private
|
9
|
-
|
10
|
-
def current
|
11
|
-
@elements[@index]
|
12
|
-
end
|
13
|
-
|
14
|
-
def skip
|
15
|
-
@index += 1
|
16
|
-
end
|
17
|
-
|
18
|
-
def skip_until(options)
|
19
|
-
oc = ::EacRubyUtils::OptionsConsumer.new(options)
|
20
|
-
optional = oc.consume(:optional, false)
|
21
|
-
while current
|
22
|
-
break if match_conditions?(oc.left_data)
|
23
|
-
|
24
|
-
skip
|
25
|
-
end
|
26
|
-
raise "No element found for conditions #{oc.left_data}" unless current || optional
|
27
|
-
|
28
|
-
current
|
29
|
-
end
|
30
|
-
|
31
|
-
def skip_until_after(conditions)
|
32
|
-
skip_until(conditions)
|
33
|
-
skip
|
34
|
-
current
|
35
|
-
end
|
36
|
-
|
37
|
-
def if_found(conditions, &block)
|
38
|
-
marked = @index
|
39
|
-
skip_until({ optional: true }.merge(conditions))
|
40
|
-
if current
|
41
|
-
instance_eval(&block) if block
|
42
|
-
else
|
43
|
-
@index = marked
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class DomElementsTraverser
|
5
|
-
module Data
|
6
|
-
def data
|
7
|
-
@data.dup
|
8
|
-
end
|
9
|
-
|
10
|
-
private
|
11
|
-
|
12
|
-
def store(key, options = {}, &converter)
|
13
|
-
validate(options)
|
14
|
-
value = store_value(options, converter)
|
15
|
-
@data[key] = value
|
16
|
-
r = current
|
17
|
-
skip
|
18
|
-
r
|
19
|
-
end
|
20
|
-
|
21
|
-
def store_value(options, converter)
|
22
|
-
value = if options.key?(:attribute)
|
23
|
-
current.attribute(options[:attribute]).value
|
24
|
-
else
|
25
|
-
current.text.strip
|
26
|
-
end
|
27
|
-
converter ? converter.call(value) : value
|
28
|
-
end
|
29
|
-
|
30
|
-
def validate(options)
|
31
|
-
return unless options.key?(:validate)
|
32
|
-
return if match_conditions?(options[:validate])
|
33
|
-
|
34
|
-
raise "Element does not match conditions #{options[:validate]}" \
|
35
|
-
" (Element: |#{current}|#{current.name}|)"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/aranha/engine.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Aranha
|
4
|
-
class Engine < ::Rails::Engine
|
5
|
-
isolate_namespace Aranha
|
6
|
-
|
7
|
-
initializer :append_migrations do |app|
|
8
|
-
config.paths['db/migrate'].expanded.each do |expanded_path|
|
9
|
-
app.config.paths['db/migrate'] << expanded_path
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/lib/aranha/fixtures.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'aranha/parsers/base'
|
4
|
-
require 'aranha/parsers/source_address'
|
5
|
-
require 'aranha/parsers/source_target_fixtures'
|
6
|
-
|
7
|
-
module Aranha
|
8
|
-
module Fixtures
|
9
|
-
class Download
|
10
|
-
attr_reader :pending
|
11
|
-
|
12
|
-
def initialize(options)
|
13
|
-
@prefix = options.fetch(:prefix)
|
14
|
-
@prefix = '' if @prefix.blank?
|
15
|
-
@download = options.fetch(:download)
|
16
|
-
@pending = options.fetch(:pending)
|
17
|
-
end
|
18
|
-
|
19
|
-
def run
|
20
|
-
url_files.each do |f|
|
21
|
-
Rails.logger.info(relative_path(f))
|
22
|
-
download(url(f), target(f)) if @download
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def url_files
|
29
|
-
Dir["#{fixtures_root}/**/*.url"].select { |path| select_path?(path) }
|
30
|
-
end
|
31
|
-
|
32
|
-
def select_path?(path)
|
33
|
-
return false unless match_prefix_pattern(path)
|
34
|
-
|
35
|
-
!pending || !source_exist?(path)
|
36
|
-
end
|
37
|
-
|
38
|
-
def match_prefix_pattern(path)
|
39
|
-
relative_path(path).start_with?(@prefix)
|
40
|
-
end
|
41
|
-
|
42
|
-
def fixtures_root
|
43
|
-
Rails.root.to_s
|
44
|
-
end
|
45
|
-
|
46
|
-
def download(url, target)
|
47
|
-
Rails.logger.info "Baixando \"#{url}\"..."
|
48
|
-
content = ::Aranha::Parsers::Base.new(url).content
|
49
|
-
raise "Content is blank for \"#{url}\"" if content.blank?
|
50
|
-
|
51
|
-
File.open(target, 'wb') { |file| file.write(content) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def url(file)
|
55
|
-
::Aranha::Parsers::SourceAddress.from_file(file)
|
56
|
-
end
|
57
|
-
|
58
|
-
def target(file)
|
59
|
-
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
60
|
-
end
|
61
|
-
|
62
|
-
def relative_path(path)
|
63
|
-
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
64
|
-
end
|
65
|
-
|
66
|
-
def source_exist?(path)
|
67
|
-
stf = ::Aranha::Parsers::SourceTargetFixtures.new(::File.dirname(path))
|
68
|
-
stf.source_file(::File.basename(path, '.url')).present?
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/lib/tasks/aranha_tasks.rake
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
namespace(:aranha) do
|
4
|
-
task process: :environment do
|
5
|
-
::Aranha::Processor.new
|
6
|
-
end
|
7
|
-
|
8
|
-
task clear: :environment do
|
9
|
-
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
|
-
end
|
11
|
-
|
12
|
-
namespace :fixtures do
|
13
|
-
desc 'Download remote content for fixtures.'
|
14
|
-
task download: :environment do
|
15
|
-
::Aranha::Fixtures::Download.new(
|
16
|
-
prefix: ENV['PREFIX'],
|
17
|
-
download: ENV['DOWNLOAD'].present?,
|
18
|
-
pending: ENV['PENDING'].present?
|
19
|
-
).run
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|