aranha 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
4
- data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
3
+ metadata.gz: 92786506ca137fde90c898b24a9018453468cd72a5f500d50125969d4c19ce71
4
+ data.tar.gz: 5cda7e665e1ec8b09ca758de1652c94cd42ab3e49fc27c447ba7d975cfe572be
5
5
  SHA512:
6
- metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
7
- data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
6
+ metadata.gz: 2aca9aff975e8b027b17ea8e8afc2ef2edc815c8c4ae4a10eec6f7e2ec3181024c1b73702dcc4ff0ec0b40464157e9233a07b2bcbf396c1649dbf53a0dbd006d
7
+ data.tar.gz: b019d925812e0c8209ba4b5d31c9737f9c72dedbff8e9b018ef0246e127afdbb0d6a1bd91ba997fcb1f473edaf76b383b3735432184f8ffd9543d3b209e71bac
@@ -12,13 +12,14 @@ module Aranha
12
12
  end
13
13
 
14
14
  def add_start_points
15
+ ::Rails.logger.info("Start points: #{start_points.count}")
15
16
  start_points.each do |url, processor|
16
17
  add(url, processor)
17
18
  end
18
19
  end
19
20
 
20
21
  def add(url, processor, extra_data = nil)
21
- a = find_or_initialize_by(url: url)
22
+ a = find_or_initialize_by(url: sanitize_url(url))
22
23
  a.processor = processor
23
24
  a.extra_data = extra_data.to_yaml
24
25
  a.save!
@@ -32,6 +33,14 @@ module Aranha
32
33
 
33
34
  private
34
35
 
36
+ def sanitize_url(url)
37
+ if url.is_a?(Hash)
38
+ url.to_yaml
39
+ else
40
+ url.to_s
41
+ end
42
+ end
43
+
35
44
  def start_points
36
45
  @start_points ||= {}
37
46
  end
@@ -60,14 +69,18 @@ module Aranha
60
69
 
61
70
  def instanciate_processor
62
71
  if processor_instancier_arity == 2 || processor_instancier_arity < 0
63
- processor_instancier.call(url, YAML.load(extra_data))
72
+ processor_instancier.call(url_to_process, YAML.load(extra_data))
64
73
  elsif processor_instancier_arity == 1
65
- processor_instancier.call(url)
74
+ processor_instancier.call(url_to_process)
66
75
  else
67
76
  raise("#{processor}.initialize should has 1 or 2 or * arguments")
68
77
  end
69
78
  end
70
79
 
80
+ def url_to_process
81
+ ::YAML.load(url)
82
+ end
83
+
71
84
  def processor_instancier
72
85
  processor.constantize.method(:new)
73
86
  end
@@ -9,6 +9,7 @@ module Aranha
9
9
  end
10
10
 
11
11
  require_dependency 'aranha/default_processor'
12
+ require_dependency 'aranha/fixtures/download'
12
13
  require_dependency 'aranha/processor'
13
14
  require_dependency 'aranha/parsers/base'
14
15
  require_dependency 'aranha/parsers/html/base'
@@ -4,11 +4,18 @@ module Aranha
4
4
  class DefaultProcessor
5
5
  attr_reader :source_uri, :extra_data
6
6
 
7
- def initialize(source_uri, extra_data)
8
- unless source_uri.is_a?(Addressable::URI)
9
- source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
7
+ class << self
8
+ def sanitize_uri(uri)
9
+ return uri if uri.is_a?(Hash)
10
+ unless uri.is_a?(Addressable::URI)
11
+ uri = uri.to_s.gsub(%r{\A/}, 'file:///')
12
+ end
13
+ Addressable::URI.parse(uri)
10
14
  end
11
- @source_uri = Addressable::URI.parse(source_uri)
15
+ end
16
+
17
+ def initialize(source_uri, extra_data)
18
+ @source_uri = self.class.sanitize_uri(source_uri)
12
19
  @extra_data = extra_data
13
20
  end
14
21
 
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Aranha
4
+ module Fixtures
5
+ class Download
6
+ def initialize(prefix, download)
7
+ @prefix = prefix
8
+ @prefix = '' if @prefix.blank?
9
+ @download = download
10
+ end
11
+
12
+ def run
13
+ url_files.each do |f|
14
+ Rails.logger.info(relative_path(f))
15
+ download(url(f), target(f)) if @download
16
+ end
17
+ end
18
+
19
+ private
20
+
21
+ def url_files
22
+ files = []
23
+ Dir["#{fixtures_root}/**/*.url"].map do |path|
24
+ files << path if match_pattern(path)
25
+ end
26
+ files
27
+ end
28
+
29
+ def match_pattern(path)
30
+ relative_path(path).start_with?(@prefix)
31
+ end
32
+
33
+ def fixtures_root
34
+ Rails.root.to_s
35
+ end
36
+
37
+ def download(url, target)
38
+ Rails.logger.info "Baixando \"#{url}\"..."
39
+ File.open(target, 'wb') { |file| file.write(::Aranha::Parsers::Base.new(url).content) }
40
+ end
41
+
42
+ def url(file)
43
+ File.read(file).strip
44
+ end
45
+
46
+ def target(file)
47
+ File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
48
+ end
49
+
50
+ def relative_path(path)
51
+ path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
52
+ end
53
+ end
54
+ end
55
+ end
@@ -11,7 +11,8 @@ module Aranha
11
11
  end
12
12
 
13
13
  def url
14
- @url.gsub(%r{/+$}, '')
14
+ r = (@url.is_a?(Hash) ? @url.fetch(:url) : @url)
15
+ r.to_s.gsub(%r{/+$}, '')
15
16
  end
16
17
 
17
18
  def content
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,4 +8,11 @@ namespace(:aranha) do
8
8
  task clear: :environment do
9
9
  Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
10
10
  end
11
+
12
+ namespace :fixtures do
13
+ desc 'Download remote content for fixtures.'
14
+ task download: :environment do
15
+ ::Aranha::Fixtures::Download.new(ENV['PREFIX'], ENV['DOWNLOAD'].present?).run
16
+ end
17
+ end
11
18
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-02 00:00:00.000000000 Z
11
+ date: 2019-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -107,6 +107,7 @@ files:
107
107
  - lib/aranha/dom_elements_traverser/cursor.rb
108
108
  - lib/aranha/dom_elements_traverser/data.rb
109
109
  - lib/aranha/engine.rb
110
+ - lib/aranha/fixtures/download.rb
110
111
  - lib/aranha/parsers/base.rb
111
112
  - lib/aranha/parsers/html/base.rb
112
113
  - lib/aranha/parsers/html/item_list.rb
@@ -180,42 +181,42 @@ signing_key:
180
181
  specification_version: 4
181
182
  summary: Rails utilities for web crawling.
182
183
  test_files:
183
- - test/aranha_test.rb
184
+ - test/dummy/Rakefile
185
+ - test/dummy/README.rdoc
186
+ - test/dummy/config.ru
187
+ - test/dummy/config/boot.rb
188
+ - test/dummy/config/database.yml
189
+ - test/dummy/config/secrets.yml
190
+ - test/dummy/config/locales/en.yml
191
+ - test/dummy/config/application.rb
192
+ - test/dummy/config/environments/development.rb
193
+ - test/dummy/config/environments/test.rb
194
+ - test/dummy/config/environments/production.rb
195
+ - test/dummy/config/environment.rb
184
196
  - test/dummy/config/routes.rb
185
197
  - test/dummy/config/initializers/assets.rb
186
198
  - test/dummy/config/initializers/cookies_serializer.rb
187
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
199
+ - test/dummy/config/initializers/inflections.rb
188
200
  - test/dummy/config/initializers/session_store.rb
189
- - test/dummy/config/initializers/backtrace_silencers.rb
190
201
  - test/dummy/config/initializers/wrap_parameters.rb
202
+ - test/dummy/config/initializers/to_time_preserves_timezone.rb
191
203
  - test/dummy/config/initializers/filter_parameter_logging.rb
192
- - test/dummy/config/initializers/inflections.rb
204
+ - test/dummy/config/initializers/backtrace_silencers.rb
193
205
  - test/dummy/config/initializers/mime_types.rb
194
- - test/dummy/config/database.yml
195
- - test/dummy/config/secrets.yml
196
- - test/dummy/config/locales/en.yml
197
- - test/dummy/config/environment.rb
198
- - test/dummy/config/boot.rb
199
- - test/dummy/config/application.rb
200
- - test/dummy/config/environments/production.rb
201
- - test/dummy/config/environments/test.rb
202
- - test/dummy/config/environments/development.rb
203
- - test/dummy/Rakefile
204
- - test/dummy/public/favicon.ico
205
- - test/dummy/public/404.html
206
- - test/dummy/public/500.html
207
- - test/dummy/public/422.html
208
- - test/dummy/config.ru
209
- - test/dummy/app/assets/stylesheets/application.css
210
- - test/dummy/app/assets/javascripts/application.js
211
- - test/dummy/app/helpers/application_helper.rb
206
+ - test/dummy/db/schema.rb
212
207
  - test/dummy/app/views/layouts/application.html.erb
213
208
  - test/dummy/app/controllers/application_controller.rb
214
- - test/dummy/bin/setup
209
+ - test/dummy/app/helpers/application_helper.rb
210
+ - test/dummy/app/assets/stylesheets/application.css
211
+ - test/dummy/app/assets/javascripts/application.js
212
+ - test/dummy/public/422.html
213
+ - test/dummy/public/404.html
214
+ - test/dummy/public/favicon.ico
215
+ - test/dummy/public/500.html
215
216
  - test/dummy/bin/bundle
217
+ - test/dummy/bin/setup
216
218
  - test/dummy/bin/rails
217
219
  - test/dummy/bin/rake
218
- - test/dummy/db/schema.rb
219
- - test/dummy/README.rdoc
220
- - test/integration/navigation_test.rb
220
+ - test/aranha_test.rb
221
221
  - test/test_helper.rb
222
+ - test/integration/navigation_test.rb