aranha 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/app/models/aranha/address.rb +16 -3
- data/lib/aranha.rb +1 -0
- data/lib/aranha/default_processor.rb +11 -4
- data/lib/aranha/fixtures/download.rb +55 -0
- data/lib/aranha/parsers/base.rb +2 -1
- data/lib/aranha/version.rb +1 -1
- data/lib/tasks/aranha_tasks.rake +7 -0
- metadata +29 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92786506ca137fde90c898b24a9018453468cd72a5f500d50125969d4c19ce71
|
4
|
+
data.tar.gz: 5cda7e665e1ec8b09ca758de1652c94cd42ab3e49fc27c447ba7d975cfe572be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2aca9aff975e8b027b17ea8e8afc2ef2edc815c8c4ae4a10eec6f7e2ec3181024c1b73702dcc4ff0ec0b40464157e9233a07b2bcbf396c1649dbf53a0dbd006d
|
7
|
+
data.tar.gz: b019d925812e0c8209ba4b5d31c9737f9c72dedbff8e9b018ef0246e127afdbb0d6a1bd91ba997fcb1f473edaf76b383b3735432184f8ffd9543d3b209e71bac
|
data/app/models/aranha/address.rb
CHANGED
@@ -12,13 +12,14 @@ module Aranha
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def add_start_points
|
15
|
+
::Rails.logger.info("Start points: #{start_points.count}")
|
15
16
|
start_points.each do |url, processor|
|
16
17
|
add(url, processor)
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
20
21
|
def add(url, processor, extra_data = nil)
|
21
|
-
a = find_or_initialize_by(url: url)
|
22
|
+
a = find_or_initialize_by(url: sanitize_url(url))
|
22
23
|
a.processor = processor
|
23
24
|
a.extra_data = extra_data.to_yaml
|
24
25
|
a.save!
|
@@ -32,6 +33,14 @@ module Aranha
|
|
32
33
|
|
33
34
|
private
|
34
35
|
|
36
|
+
def sanitize_url(url)
|
37
|
+
if url.is_a?(Hash)
|
38
|
+
url.to_yaml
|
39
|
+
else
|
40
|
+
url.to_s
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
35
44
|
def start_points
|
36
45
|
@start_points ||= {}
|
37
46
|
end
|
@@ -60,14 +69,18 @@ module Aranha
|
|
60
69
|
|
61
70
|
def instanciate_processor
|
62
71
|
if processor_instancier_arity == 2 || processor_instancier_arity < 0
|
63
|
-
processor_instancier.call(
|
72
|
+
processor_instancier.call(url_to_process, YAML.load(extra_data))
|
64
73
|
elsif processor_instancier_arity == 1
|
65
|
-
processor_instancier.call(
|
74
|
+
processor_instancier.call(url_to_process)
|
66
75
|
else
|
67
76
|
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
68
77
|
end
|
69
78
|
end
|
70
79
|
|
80
|
+
def url_to_process
|
81
|
+
::YAML.load(url)
|
82
|
+
end
|
83
|
+
|
71
84
|
def processor_instancier
|
72
85
|
processor.constantize.method(:new)
|
73
86
|
end
|
data/lib/aranha.rb
CHANGED
data/lib/aranha/default_processor.rb
CHANGED
@@ -4,11 +4,18 @@ module Aranha
|
|
4
4
|
class DefaultProcessor
|
5
5
|
attr_reader :source_uri, :extra_data
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
class << self
|
8
|
+
def sanitize_uri(uri)
|
9
|
+
return uri if uri.is_a?(Hash)
|
10
|
+
unless uri.is_a?(Addressable::URI)
|
11
|
+
uri = uri.to_s.gsub(%r{\A/}, 'file:///')
|
12
|
+
end
|
13
|
+
Addressable::URI.parse(uri)
|
10
14
|
end
|
11
|
-
|
15
|
+
end
|
16
|
+
|
17
|
+
def initialize(source_uri, extra_data)
|
18
|
+
@source_uri = self.class.sanitize_uri(source_uri)
|
12
19
|
@extra_data = extra_data
|
13
20
|
end
|
14
21
|
|
data/lib/aranha/fixtures/download.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Fixtures
|
5
|
+
class Download
|
6
|
+
def initialize(prefix, download)
|
7
|
+
@prefix = prefix
|
8
|
+
@prefix = '' if @prefix.blank?
|
9
|
+
@download = download
|
10
|
+
end
|
11
|
+
|
12
|
+
def run
|
13
|
+
url_files.each do |f|
|
14
|
+
Rails.logger.info(relative_path(f))
|
15
|
+
download(url(f), target(f)) if @download
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def url_files
|
22
|
+
files = []
|
23
|
+
Dir["#{fixtures_root}/**/*.url"].map do |path|
|
24
|
+
files << path if match_pattern(path)
|
25
|
+
end
|
26
|
+
files
|
27
|
+
end
|
28
|
+
|
29
|
+
def match_pattern(path)
|
30
|
+
relative_path(path).start_with?(@prefix)
|
31
|
+
end
|
32
|
+
|
33
|
+
def fixtures_root
|
34
|
+
Rails.root.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def download(url, target)
|
38
|
+
Rails.logger.info "Baixando \"#{url}\"..."
|
39
|
+
File.open(target, 'wb') { |file| file.write(::Aranha::Parsers::Base.new(url).content) }
|
40
|
+
end
|
41
|
+
|
42
|
+
def url(file)
|
43
|
+
File.read(file).strip
|
44
|
+
end
|
45
|
+
|
46
|
+
def target(file)
|
47
|
+
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
48
|
+
end
|
49
|
+
|
50
|
+
def relative_path(path)
|
51
|
+
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/aranha/parsers/base.rb
CHANGED
data/lib/aranha/version.rb
CHANGED
data/lib/tasks/aranha_tasks.rake
CHANGED
@@ -8,4 +8,11 @@ namespace(:aranha) do
|
|
8
8
|
task clear: :environment do
|
9
9
|
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
10
|
end
|
11
|
+
|
12
|
+
namespace :fixtures do
|
13
|
+
desc 'Download remote content for fixtures.'
|
14
|
+
task download: :environment do
|
15
|
+
::Aranha::Fixtures::Download.new(ENV['PREFIX'], ENV['DOWNLOAD'].present?).run
|
16
|
+
end
|
17
|
+
end
|
11
18
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-05-
|
11
|
+
date: 2019-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -107,6 +107,7 @@ files:
|
|
107
107
|
- lib/aranha/dom_elements_traverser/cursor.rb
|
108
108
|
- lib/aranha/dom_elements_traverser/data.rb
|
109
109
|
- lib/aranha/engine.rb
|
110
|
+
- lib/aranha/fixtures/download.rb
|
110
111
|
- lib/aranha/parsers/base.rb
|
111
112
|
- lib/aranha/parsers/html/base.rb
|
112
113
|
- lib/aranha/parsers/html/item_list.rb
|
@@ -180,42 +181,42 @@ signing_key:
|
|
180
181
|
specification_version: 4
|
181
182
|
summary: Rails utilities for web crawling.
|
182
183
|
test_files:
|
183
|
-
- test/
|
184
|
+
- test/dummy/Rakefile
|
185
|
+
- test/dummy/README.rdoc
|
186
|
+
- test/dummy/config.ru
|
187
|
+
- test/dummy/config/boot.rb
|
188
|
+
- test/dummy/config/database.yml
|
189
|
+
- test/dummy/config/secrets.yml
|
190
|
+
- test/dummy/config/locales/en.yml
|
191
|
+
- test/dummy/config/application.rb
|
192
|
+
- test/dummy/config/environments/development.rb
|
193
|
+
- test/dummy/config/environments/test.rb
|
194
|
+
- test/dummy/config/environments/production.rb
|
195
|
+
- test/dummy/config/environment.rb
|
184
196
|
- test/dummy/config/routes.rb
|
185
197
|
- test/dummy/config/initializers/assets.rb
|
186
198
|
- test/dummy/config/initializers/cookies_serializer.rb
|
187
|
-
- test/dummy/config/initializers/
|
199
|
+
- test/dummy/config/initializers/inflections.rb
|
188
200
|
- test/dummy/config/initializers/session_store.rb
|
189
|
-
- test/dummy/config/initializers/backtrace_silencers.rb
|
190
201
|
- test/dummy/config/initializers/wrap_parameters.rb
|
202
|
+
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
191
203
|
- test/dummy/config/initializers/filter_parameter_logging.rb
|
192
|
-
- test/dummy/config/initializers/
|
204
|
+
- test/dummy/config/initializers/backtrace_silencers.rb
|
193
205
|
- test/dummy/config/initializers/mime_types.rb
|
194
|
-
- test/dummy/
|
195
|
-
- test/dummy/config/secrets.yml
|
196
|
-
- test/dummy/config/locales/en.yml
|
197
|
-
- test/dummy/config/environment.rb
|
198
|
-
- test/dummy/config/boot.rb
|
199
|
-
- test/dummy/config/application.rb
|
200
|
-
- test/dummy/config/environments/production.rb
|
201
|
-
- test/dummy/config/environments/test.rb
|
202
|
-
- test/dummy/config/environments/development.rb
|
203
|
-
- test/dummy/Rakefile
|
204
|
-
- test/dummy/public/favicon.ico
|
205
|
-
- test/dummy/public/404.html
|
206
|
-
- test/dummy/public/500.html
|
207
|
-
- test/dummy/public/422.html
|
208
|
-
- test/dummy/config.ru
|
209
|
-
- test/dummy/app/assets/stylesheets/application.css
|
210
|
-
- test/dummy/app/assets/javascripts/application.js
|
211
|
-
- test/dummy/app/helpers/application_helper.rb
|
206
|
+
- test/dummy/db/schema.rb
|
212
207
|
- test/dummy/app/views/layouts/application.html.erb
|
213
208
|
- test/dummy/app/controllers/application_controller.rb
|
214
|
-
- test/dummy/
|
209
|
+
- test/dummy/app/helpers/application_helper.rb
|
210
|
+
- test/dummy/app/assets/stylesheets/application.css
|
211
|
+
- test/dummy/app/assets/javascripts/application.js
|
212
|
+
- test/dummy/public/422.html
|
213
|
+
- test/dummy/public/404.html
|
214
|
+
- test/dummy/public/favicon.ico
|
215
|
+
- test/dummy/public/500.html
|
215
216
|
- test/dummy/bin/bundle
|
217
|
+
- test/dummy/bin/setup
|
216
218
|
- test/dummy/bin/rails
|
217
219
|
- test/dummy/bin/rake
|
218
|
-
- test/
|
219
|
-
- test/dummy/README.rdoc
|
220
|
-
- test/integration/navigation_test.rb
|
220
|
+
- test/aranha_test.rb
|
221
221
|
- test/test_helper.rb
|
222
|
+
- test/integration/navigation_test.rb
|