aranha 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/models/aranha/address.rb +16 -3
- data/lib/aranha.rb +1 -0
- data/lib/aranha/default_processor.rb +11 -4
- data/lib/aranha/fixtures/download.rb +55 -0
- data/lib/aranha/parsers/base.rb +2 -1
- data/lib/aranha/version.rb +1 -1
- data/lib/tasks/aranha_tasks.rake +7 -0
- metadata +29 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92786506ca137fde90c898b24a9018453468cd72a5f500d50125969d4c19ce71
|
4
|
+
data.tar.gz: 5cda7e665e1ec8b09ca758de1652c94cd42ab3e49fc27c447ba7d975cfe572be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2aca9aff975e8b027b17ea8e8afc2ef2edc815c8c4ae4a10eec6f7e2ec3181024c1b73702dcc4ff0ec0b40464157e9233a07b2bcbf396c1649dbf53a0dbd006d
|
7
|
+
data.tar.gz: b019d925812e0c8209ba4b5d31c9737f9c72dedbff8e9b018ef0246e127afdbb0d6a1bd91ba997fcb1f473edaf76b383b3735432184f8ffd9543d3b209e71bac
|
data/app/models/aranha/address.rb
CHANGED
@@ -12,13 +12,14 @@ module Aranha
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def add_start_points
|
15
|
+
::Rails.logger.info("Start points: #{start_points.count}")
|
15
16
|
start_points.each do |url, processor|
|
16
17
|
add(url, processor)
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
20
21
|
def add(url, processor, extra_data = nil)
|
21
|
-
a = find_or_initialize_by(url: url)
|
22
|
+
a = find_or_initialize_by(url: sanitize_url(url))
|
22
23
|
a.processor = processor
|
23
24
|
a.extra_data = extra_data.to_yaml
|
24
25
|
a.save!
|
@@ -32,6 +33,14 @@ module Aranha
|
|
32
33
|
|
33
34
|
private
|
34
35
|
|
36
|
+
def sanitize_url(url)
|
37
|
+
if url.is_a?(Hash)
|
38
|
+
url.to_yaml
|
39
|
+
else
|
40
|
+
url.to_s
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
35
44
|
def start_points
|
36
45
|
@start_points ||= {}
|
37
46
|
end
|
@@ -60,14 +69,18 @@ module Aranha
|
|
60
69
|
|
61
70
|
def instanciate_processor
|
62
71
|
if processor_instancier_arity == 2 || processor_instancier_arity < 0
|
63
|
-
processor_instancier.call(
|
72
|
+
processor_instancier.call(url_to_process, YAML.load(extra_data))
|
64
73
|
elsif processor_instancier_arity == 1
|
65
|
-
processor_instancier.call(
|
74
|
+
processor_instancier.call(url_to_process)
|
66
75
|
else
|
67
76
|
raise("#{processor}.initialize should has 1 or 2 or * arguments")
|
68
77
|
end
|
69
78
|
end
|
70
79
|
|
80
|
+
def url_to_process
|
81
|
+
::YAML.load(url)
|
82
|
+
end
|
83
|
+
|
71
84
|
def processor_instancier
|
72
85
|
processor.constantize.method(:new)
|
73
86
|
end
|
data/lib/aranha.rb
CHANGED
data/lib/aranha/default_processor.rb
CHANGED
@@ -4,11 +4,18 @@ module Aranha
|
|
4
4
|
class DefaultProcessor
|
5
5
|
attr_reader :source_uri, :extra_data
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
class << self
|
8
|
+
def sanitize_uri(uri)
|
9
|
+
return uri if uri.is_a?(Hash)
|
10
|
+
unless uri.is_a?(Addressable::URI)
|
11
|
+
uri = uri.to_s.gsub(%r{\A/}, 'file:///')
|
12
|
+
end
|
13
|
+
Addressable::URI.parse(uri)
|
10
14
|
end
|
11
|
-
|
15
|
+
end
|
16
|
+
|
17
|
+
def initialize(source_uri, extra_data)
|
18
|
+
@source_uri = self.class.sanitize_uri(source_uri)
|
12
19
|
@extra_data = extra_data
|
13
20
|
end
|
14
21
|
|
data/lib/aranha/fixtures/download.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Fixtures
|
5
|
+
class Download
|
6
|
+
def initialize(prefix, download)
|
7
|
+
@prefix = prefix
|
8
|
+
@prefix = '' if @prefix.blank?
|
9
|
+
@download = download
|
10
|
+
end
|
11
|
+
|
12
|
+
def run
|
13
|
+
url_files.each do |f|
|
14
|
+
Rails.logger.info(relative_path(f))
|
15
|
+
download(url(f), target(f)) if @download
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def url_files
|
22
|
+
files = []
|
23
|
+
Dir["#{fixtures_root}/**/*.url"].map do |path|
|
24
|
+
files << path if match_pattern(path)
|
25
|
+
end
|
26
|
+
files
|
27
|
+
end
|
28
|
+
|
29
|
+
def match_pattern(path)
|
30
|
+
relative_path(path).start_with?(@prefix)
|
31
|
+
end
|
32
|
+
|
33
|
+
def fixtures_root
|
34
|
+
Rails.root.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def download(url, target)
|
38
|
+
Rails.logger.info "Baixando \"#{url}\"..."
|
39
|
+
File.open(target, 'wb') { |file| file.write(::Aranha::Parsers::Base.new(url).content) }
|
40
|
+
end
|
41
|
+
|
42
|
+
def url(file)
|
43
|
+
File.read(file).strip
|
44
|
+
end
|
45
|
+
|
46
|
+
def target(file)
|
47
|
+
File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
|
48
|
+
end
|
49
|
+
|
50
|
+
def relative_path(path)
|
51
|
+
path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/aranha/parsers/base.rb
CHANGED
data/lib/aranha/version.rb
CHANGED
data/lib/tasks/aranha_tasks.rake
CHANGED
@@ -8,4 +8,11 @@ namespace(:aranha) do
|
|
8
8
|
task clear: :environment do
|
9
9
|
Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
|
10
10
|
end
|
11
|
+
|
12
|
+
namespace :fixtures do
|
13
|
+
desc 'Download remote content for fixtures.'
|
14
|
+
task download: :environment do
|
15
|
+
::Aranha::Fixtures::Download.new(ENV['PREFIX'], ENV['DOWNLOAD'].present?).run
|
16
|
+
end
|
17
|
+
end
|
11
18
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo H. Bogoni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-05-
|
11
|
+
date: 2019-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_scaffold
|
@@ -107,6 +107,7 @@ files:
|
|
107
107
|
- lib/aranha/dom_elements_traverser/cursor.rb
|
108
108
|
- lib/aranha/dom_elements_traverser/data.rb
|
109
109
|
- lib/aranha/engine.rb
|
110
|
+
- lib/aranha/fixtures/download.rb
|
110
111
|
- lib/aranha/parsers/base.rb
|
111
112
|
- lib/aranha/parsers/html/base.rb
|
112
113
|
- lib/aranha/parsers/html/item_list.rb
|
@@ -180,42 +181,42 @@ signing_key:
|
|
180
181
|
specification_version: 4
|
181
182
|
summary: Rails utilities for web crawling.
|
182
183
|
test_files:
|
183
|
-
- test/
|
184
|
+
- test/dummy/Rakefile
|
185
|
+
- test/dummy/README.rdoc
|
186
|
+
- test/dummy/config.ru
|
187
|
+
- test/dummy/config/boot.rb
|
188
|
+
- test/dummy/config/database.yml
|
189
|
+
- test/dummy/config/secrets.yml
|
190
|
+
- test/dummy/config/locales/en.yml
|
191
|
+
- test/dummy/config/application.rb
|
192
|
+
- test/dummy/config/environments/development.rb
|
193
|
+
- test/dummy/config/environments/test.rb
|
194
|
+
- test/dummy/config/environments/production.rb
|
195
|
+
- test/dummy/config/environment.rb
|
184
196
|
- test/dummy/config/routes.rb
|
185
197
|
- test/dummy/config/initializers/assets.rb
|
186
198
|
- test/dummy/config/initializers/cookies_serializer.rb
|
187
|
-
- test/dummy/config/initializers/
|
199
|
+
- test/dummy/config/initializers/inflections.rb
|
188
200
|
- test/dummy/config/initializers/session_store.rb
|
189
|
-
- test/dummy/config/initializers/backtrace_silencers.rb
|
190
201
|
- test/dummy/config/initializers/wrap_parameters.rb
|
202
|
+
- test/dummy/config/initializers/to_time_preserves_timezone.rb
|
191
203
|
- test/dummy/config/initializers/filter_parameter_logging.rb
|
192
|
-
- test/dummy/config/initializers/
|
204
|
+
- test/dummy/config/initializers/backtrace_silencers.rb
|
193
205
|
- test/dummy/config/initializers/mime_types.rb
|
194
|
-
- test/dummy/
|
195
|
-
- test/dummy/config/secrets.yml
|
196
|
-
- test/dummy/config/locales/en.yml
|
197
|
-
- test/dummy/config/environment.rb
|
198
|
-
- test/dummy/config/boot.rb
|
199
|
-
- test/dummy/config/application.rb
|
200
|
-
- test/dummy/config/environments/production.rb
|
201
|
-
- test/dummy/config/environments/test.rb
|
202
|
-
- test/dummy/config/environments/development.rb
|
203
|
-
- test/dummy/Rakefile
|
204
|
-
- test/dummy/public/favicon.ico
|
205
|
-
- test/dummy/public/404.html
|
206
|
-
- test/dummy/public/500.html
|
207
|
-
- test/dummy/public/422.html
|
208
|
-
- test/dummy/config.ru
|
209
|
-
- test/dummy/app/assets/stylesheets/application.css
|
210
|
-
- test/dummy/app/assets/javascripts/application.js
|
211
|
-
- test/dummy/app/helpers/application_helper.rb
|
206
|
+
- test/dummy/db/schema.rb
|
212
207
|
- test/dummy/app/views/layouts/application.html.erb
|
213
208
|
- test/dummy/app/controllers/application_controller.rb
|
214
|
-
- test/dummy/
|
209
|
+
- test/dummy/app/helpers/application_helper.rb
|
210
|
+
- test/dummy/app/assets/stylesheets/application.css
|
211
|
+
- test/dummy/app/assets/javascripts/application.js
|
212
|
+
- test/dummy/public/422.html
|
213
|
+
- test/dummy/public/404.html
|
214
|
+
- test/dummy/public/favicon.ico
|
215
|
+
- test/dummy/public/500.html
|
215
216
|
- test/dummy/bin/bundle
|
217
|
+
- test/dummy/bin/setup
|
216
218
|
- test/dummy/bin/rails
|
217
219
|
- test/dummy/bin/rake
|
218
|
-
- test/
|
219
|
-
- test/dummy/README.rdoc
|
220
|
-
- test/integration/navigation_test.rb
|
220
|
+
- test/aranha_test.rb
|
221
221
|
- test/test_helper.rb
|
222
|
+
- test/integration/navigation_test.rb
|