aranha 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b770597c6db06b5641e943c6b758e4bf03da504dddde3b53873042397ebd2d0f
4
- data.tar.gz: 97e2aa65591ba5ef13af0d3e121b5e81f6d30ab78d10c5d6b8654b96e40e55fc
3
+ metadata.gz: 92786506ca137fde90c898b24a9018453468cd72a5f500d50125969d4c19ce71
4
+ data.tar.gz: 5cda7e665e1ec8b09ca758de1652c94cd42ab3e49fc27c447ba7d975cfe572be
5
5
  SHA512:
6
- metadata.gz: c18119d14b4b101ed22289c4da45faa78949c2c7871a08c62a182722f7fbe42807f3101526f970f463657dcf6341e596f54eb32d8f040f6f529d3fe961b77db2
7
- data.tar.gz: 83f31a41541d7a700e448894b665109b43c2ee8f7a8adfe561292dc675ace9fa118cb59e6891a03d916ac03043c9eb36879eb1fa08e14a2c78ee693de91de3e4
6
+ metadata.gz: 2aca9aff975e8b027b17ea8e8afc2ef2edc815c8c4ae4a10eec6f7e2ec3181024c1b73702dcc4ff0ec0b40464157e9233a07b2bcbf396c1649dbf53a0dbd006d
7
+ data.tar.gz: b019d925812e0c8209ba4b5d31c9737f9c72dedbff8e9b018ef0246e127afdbb0d6a1bd91ba997fcb1f473edaf76b383b3735432184f8ffd9543d3b209e71bac
@@ -12,13 +12,14 @@ module Aranha
12
12
  end
13
13
 
14
14
  def add_start_points
15
+ ::Rails.logger.info("Start points: #{start_points.count}")
15
16
  start_points.each do |url, processor|
16
17
  add(url, processor)
17
18
  end
18
19
  end
19
20
 
20
21
  def add(url, processor, extra_data = nil)
21
- a = find_or_initialize_by(url: url)
22
+ a = find_or_initialize_by(url: sanitize_url(url))
22
23
  a.processor = processor
23
24
  a.extra_data = extra_data.to_yaml
24
25
  a.save!
@@ -32,6 +33,14 @@ module Aranha
32
33
 
33
34
  private
34
35
 
36
+ def sanitize_url(url)
37
+ if url.is_a?(Hash)
38
+ url.to_yaml
39
+ else
40
+ url.to_s
41
+ end
42
+ end
43
+
35
44
  def start_points
36
45
  @start_points ||= {}
37
46
  end
@@ -60,14 +69,18 @@ module Aranha
60
69
 
61
70
  def instanciate_processor
62
71
  if processor_instancier_arity == 2 || processor_instancier_arity < 0
63
- processor_instancier.call(url, YAML.load(extra_data))
72
+ processor_instancier.call(url_to_process, YAML.load(extra_data))
64
73
  elsif processor_instancier_arity == 1
65
- processor_instancier.call(url)
74
+ processor_instancier.call(url_to_process)
66
75
  else
67
76
  raise("#{processor}.initialize should has 1 or 2 or * arguments")
68
77
  end
69
78
  end
70
79
 
80
+ def url_to_process
81
+ ::YAML.load(url)
82
+ end
83
+
71
84
  def processor_instancier
72
85
  processor.constantize.method(:new)
73
86
  end
@@ -9,6 +9,7 @@ module Aranha
9
9
  end
10
10
 
11
11
  require_dependency 'aranha/default_processor'
12
+ require_dependency 'aranha/fixtures/download'
12
13
  require_dependency 'aranha/processor'
13
14
  require_dependency 'aranha/parsers/base'
14
15
  require_dependency 'aranha/parsers/html/base'
@@ -4,11 +4,18 @@ module Aranha
4
4
  class DefaultProcessor
5
5
  attr_reader :source_uri, :extra_data
6
6
 
7
- def initialize(source_uri, extra_data)
8
- unless source_uri.is_a?(Addressable::URI)
9
- source_uri = source_uri.to_s.gsub(%r{\A/}, 'file:///')
7
+ class << self
8
+ def sanitize_uri(uri)
9
+ return uri if uri.is_a?(Hash)
10
+ unless uri.is_a?(Addressable::URI)
11
+ uri = uri.to_s.gsub(%r{\A/}, 'file:///')
12
+ end
13
+ Addressable::URI.parse(uri)
10
14
  end
11
- @source_uri = Addressable::URI.parse(source_uri)
15
+ end
16
+
17
+ def initialize(source_uri, extra_data)
18
+ @source_uri = self.class.sanitize_uri(source_uri)
12
19
  @extra_data = extra_data
13
20
  end
14
21
 
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Aranha
4
+ module Fixtures
5
+ class Download
6
+ def initialize(prefix, download)
7
+ @prefix = prefix
8
+ @prefix = '' if @prefix.blank?
9
+ @download = download
10
+ end
11
+
12
+ def run
13
+ url_files.each do |f|
14
+ Rails.logger.info(relative_path(f))
15
+ download(url(f), target(f)) if @download
16
+ end
17
+ end
18
+
19
+ private
20
+
21
+ def url_files
22
+ files = []
23
+ Dir["#{fixtures_root}/**/*.url"].map do |path|
24
+ files << path if match_pattern(path)
25
+ end
26
+ files
27
+ end
28
+
29
+ def match_pattern(path)
30
+ relative_path(path).start_with?(@prefix)
31
+ end
32
+
33
+ def fixtures_root
34
+ Rails.root.to_s
35
+ end
36
+
37
+ def download(url, target)
38
+ Rails.logger.info "Baixando \"#{url}\"..."
39
+ File.open(target, 'wb') { |file| file.write(::Aranha::Parsers::Base.new(url).content) }
40
+ end
41
+
42
+ def url(file)
43
+ File.read(file).strip
44
+ end
45
+
46
+ def target(file)
47
+ File.expand_path(File.basename(file, '.url') + '.source.html', File.dirname(file))
48
+ end
49
+
50
+ def relative_path(path)
51
+ path.sub(%r{^#{Regexp.quote(fixtures_root)}/}, '')
52
+ end
53
+ end
54
+ end
55
+ end
@@ -11,7 +11,8 @@ module Aranha
11
11
  end
12
12
 
13
13
  def url
14
- @url.gsub(%r{/+$}, '')
14
+ r = (@url.is_a?(Hash) ? @url.fetch(:url) : @url)
15
+ r.to_s.gsub(%r{/+$}, '')
15
16
  end
16
17
 
17
18
  def content
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,4 +8,11 @@ namespace(:aranha) do
8
8
  task clear: :environment do
9
9
  Rails.logger.info("Addresses deleted: #{::Aranha::Address.destroy_all.count}")
10
10
  end
11
+
12
+ namespace :fixtures do
13
+ desc 'Download remote content for fixtures.'
14
+ task download: :environment do
15
+ ::Aranha::Fixtures::Download.new(ENV['PREFIX'], ENV['DOWNLOAD'].present?).run
16
+ end
17
+ end
11
18
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-02 00:00:00.000000000 Z
11
+ date: 2019-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -107,6 +107,7 @@ files:
107
107
  - lib/aranha/dom_elements_traverser/cursor.rb
108
108
  - lib/aranha/dom_elements_traverser/data.rb
109
109
  - lib/aranha/engine.rb
110
+ - lib/aranha/fixtures/download.rb
110
111
  - lib/aranha/parsers/base.rb
111
112
  - lib/aranha/parsers/html/base.rb
112
113
  - lib/aranha/parsers/html/item_list.rb
@@ -180,42 +181,42 @@ signing_key:
180
181
  specification_version: 4
181
182
  summary: Rails utilities for web crawling.
182
183
  test_files:
183
- - test/aranha_test.rb
184
+ - test/dummy/Rakefile
185
+ - test/dummy/README.rdoc
186
+ - test/dummy/config.ru
187
+ - test/dummy/config/boot.rb
188
+ - test/dummy/config/database.yml
189
+ - test/dummy/config/secrets.yml
190
+ - test/dummy/config/locales/en.yml
191
+ - test/dummy/config/application.rb
192
+ - test/dummy/config/environments/development.rb
193
+ - test/dummy/config/environments/test.rb
194
+ - test/dummy/config/environments/production.rb
195
+ - test/dummy/config/environment.rb
184
196
  - test/dummy/config/routes.rb
185
197
  - test/dummy/config/initializers/assets.rb
186
198
  - test/dummy/config/initializers/cookies_serializer.rb
187
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
199
+ - test/dummy/config/initializers/inflections.rb
188
200
  - test/dummy/config/initializers/session_store.rb
189
- - test/dummy/config/initializers/backtrace_silencers.rb
190
201
  - test/dummy/config/initializers/wrap_parameters.rb
202
+ - test/dummy/config/initializers/to_time_preserves_timezone.rb
191
203
  - test/dummy/config/initializers/filter_parameter_logging.rb
192
- - test/dummy/config/initializers/inflections.rb
204
+ - test/dummy/config/initializers/backtrace_silencers.rb
193
205
  - test/dummy/config/initializers/mime_types.rb
194
- - test/dummy/config/database.yml
195
- - test/dummy/config/secrets.yml
196
- - test/dummy/config/locales/en.yml
197
- - test/dummy/config/environment.rb
198
- - test/dummy/config/boot.rb
199
- - test/dummy/config/application.rb
200
- - test/dummy/config/environments/production.rb
201
- - test/dummy/config/environments/test.rb
202
- - test/dummy/config/environments/development.rb
203
- - test/dummy/Rakefile
204
- - test/dummy/public/favicon.ico
205
- - test/dummy/public/404.html
206
- - test/dummy/public/500.html
207
- - test/dummy/public/422.html
208
- - test/dummy/config.ru
209
- - test/dummy/app/assets/stylesheets/application.css
210
- - test/dummy/app/assets/javascripts/application.js
211
- - test/dummy/app/helpers/application_helper.rb
206
+ - test/dummy/db/schema.rb
212
207
  - test/dummy/app/views/layouts/application.html.erb
213
208
  - test/dummy/app/controllers/application_controller.rb
214
- - test/dummy/bin/setup
209
+ - test/dummy/app/helpers/application_helper.rb
210
+ - test/dummy/app/assets/stylesheets/application.css
211
+ - test/dummy/app/assets/javascripts/application.js
212
+ - test/dummy/public/422.html
213
+ - test/dummy/public/404.html
214
+ - test/dummy/public/favicon.ico
215
+ - test/dummy/public/500.html
215
216
  - test/dummy/bin/bundle
217
+ - test/dummy/bin/setup
216
218
  - test/dummy/bin/rails
217
219
  - test/dummy/bin/rake
218
- - test/dummy/db/schema.rb
219
- - test/dummy/README.rdoc
220
- - test/integration/navigation_test.rb
220
+ - test/aranha_test.rb
221
221
  - test/test_helper.rb
222
+ - test/integration/navigation_test.rb