aranha 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/Rakefile +7 -6
  3. data/lib/aranha/fixtures/download.rb +3 -1
  4. data/lib/aranha/parsers/base.rb +13 -55
  5. data/lib/aranha/parsers/source_address.rb +49 -0
  6. data/lib/aranha/parsers/source_address/file.rb +31 -0
  7. data/lib/aranha/parsers/source_address/hash_http_post.rb +44 -0
  8. data/lib/aranha/parsers/source_address/http_get.rb +56 -0
  9. data/lib/aranha/version.rb +1 -1
  10. data/test/test_helper.rb +2 -2
  11. metadata +20 -74
  12. data/test/dummy/README.rdoc +0 -28
  13. data/test/dummy/Rakefile +0 -8
  14. data/test/dummy/app/assets/javascripts/application.js +0 -13
  15. data/test/dummy/app/assets/stylesheets/application.css +0 -15
  16. data/test/dummy/app/controllers/application_controller.rb +0 -7
  17. data/test/dummy/app/helpers/application_helper.rb +0 -4
  18. data/test/dummy/app/views/layouts/application.html.erb +0 -12
  19. data/test/dummy/bin/bundle +0 -5
  20. data/test/dummy/bin/rails +0 -6
  21. data/test/dummy/bin/rake +0 -6
  22. data/test/dummy/bin/setup +0 -31
  23. data/test/dummy/config.ru +0 -6
  24. data/test/dummy/config/application.rb +0 -27
  25. data/test/dummy/config/boot.rb +0 -7
  26. data/test/dummy/config/database.yml +0 -25
  27. data/test/dummy/config/environment.rb +0 -7
  28. data/test/dummy/config/environments/development.rb +0 -43
  29. data/test/dummy/config/environments/production.rb +0 -82
  30. data/test/dummy/config/environments/test.rb +0 -44
  31. data/test/dummy/config/initializers/assets.rb +0 -13
  32. data/test/dummy/config/initializers/backtrace_silencers.rb +0 -11
  33. data/test/dummy/config/initializers/cookies_serializer.rb +0 -5
  34. data/test/dummy/config/initializers/filter_parameter_logging.rb +0 -6
  35. data/test/dummy/config/initializers/inflections.rb +0 -18
  36. data/test/dummy/config/initializers/mime_types.rb +0 -6
  37. data/test/dummy/config/initializers/session_store.rb +0 -5
  38. data/test/dummy/config/initializers/to_time_preserves_timezone.rb +0 -12
  39. data/test/dummy/config/initializers/wrap_parameters.rb +0 -16
  40. data/test/dummy/config/locales/en.yml +0 -23
  41. data/test/dummy/config/routes.rb +0 -5
  42. data/test/dummy/config/secrets.yml +0 -22
  43. data/test/dummy/db/schema.rb +0 -24
  44. data/test/dummy/public/404.html +0 -67
  45. data/test/dummy/public/422.html +0 -67
  46. data/test/dummy/public/500.html +0 -66
  47. data/test/dummy/public/favicon.ico +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d65a2ccecd09ab619dea2d76306dfc48c7aad83b0b111b4544afb8492a97ef04
4
- data.tar.gz: fd1d57c4d7ec4a22f8bd5829b8ad9a76838b729d54440b68c559cdf8ba5f9482
3
+ metadata.gz: d8acb62bae7e2cede5a9d658d705bfbc28229cc9a595207d9f6f7796a7681e1f
4
+ data.tar.gz: debd7afd142d1de8d2ff38f6f7cf35cf89b82174ce278f7f2247324cfeb53b41
5
5
  SHA512:
6
- metadata.gz: 3b34ee01cfc1f6be364017da680c64b4abf7a5bc303b40edfdedf7532cdfd69cbe17d57349fac4fbc8881dc8db336f8a5738aa1976e9e5a3f60a09b8c4808f77
7
- data.tar.gz: c5129c06f6ec81bfd02da79ccfa64b1187abf644e2271ac84ea30d24fc02b1608c13b74fbf3f962ecbe4a3d69955c08c51fd0ad1f692395e82ac15a78978f638
6
+ metadata.gz: 7ce9a11b1c49756f67dae806d065cdd0fbdbe8f9b2e132ec4cc2efad55f198fd2c8fbea1c21a27cef9e22398a0632d8efef03a02c590a57a9abc2d7024c7ef23
7
+ data.tar.gz: 3dc367711f35207d2f693f01fe4ae2f485eb79f7052fbccd75876e4a90c8a2e98624947980135dc764b95b7f30b791227002f7fee609638ce5a961ad83c04e7a
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ RDoc::Task.new(:rdoc) do |rdoc|
16
16
  rdoc.rdoc_files.include('lib/**/*.rb')
17
17
  end
18
18
 
19
- APP_RAKEFILE = File.expand_path('test/dummy/Rakefile', __dir__)
19
+ APP_RAKEFILE = File.expand_path('spec/support/rails_app/Rakefile', __dir__)
20
20
  load 'rails/tasks/engine.rake'
21
21
 
22
22
  load 'rails/tasks/statistics.rake'
@@ -25,11 +25,12 @@ Bundler::GemHelper.install_tasks
25
25
 
26
26
  require 'rake/testtask'
27
27
 
28
- Rake::TestTask.new(:test) do |t|
29
- t.libs << 'lib'
30
- t.libs << 'test'
31
- t.pattern = 'test/**/*_test.rb'
32
- t.verbose = false
28
+ begin
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec)
31
+ task test: :spec
32
+ rescue LoadError
33
+ # no rspec available
33
34
  end
34
35
 
35
36
  task default: :test
data/lib/aranha/fixtures/download.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'aranha/parsers/source_address'
4
+
3
5
  module Aranha
4
6
  module Fixtures
5
7
  class Download
@@ -44,7 +46,7 @@ module Aranha
44
46
  end
45
47
 
46
48
  def url(file)
47
- File.read(file).strip
49
+ ::Aranha::Parsers::SourceAddress.from_file(file)
48
50
  end
49
51
 
50
52
  def target(file)
data/lib/aranha/parsers/base.rb CHANGED
@@ -2,82 +2,41 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'fileutils'
5
+ require 'aranha/parsers/source_address'
5
6
 
6
7
  module Aranha
7
8
  module Parsers
8
9
  class Base
9
10
  LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
10
11
 
12
+ attr_reader :source_address
13
+
11
14
  def initialize(url)
12
- @url = url
15
+ @source_address = ::Aranha::Parsers::SourceAddress.new(url)
16
+ log_content(source_address.serialize, '-source-address')
13
17
  end
14
18
 
15
- def url
16
- r = (@url.is_a?(Hash) ? @url.fetch(:url) : @url)
17
- r.to_s.gsub(%r{/+$}, '')
18
- end
19
+ delegate :url, to: :source_address
19
20
 
20
21
  def content
21
- s = content_by_url_type
22
+ s = source_address.content
22
23
  log_content(s)
23
24
  s
24
25
  end
25
26
 
26
27
  private
27
28
 
28
- def content_by_url_type
29
- if @url.is_a?(Hash)
30
- content_hash
31
- elsif /^http/ =~ @url
32
- content_get
33
- else
34
- content_file
35
- end
36
- end
37
-
38
- def content_file
39
- ::File.open(@url.to_s.gsub(%r{\Afile://}, ''), &:read)
40
- end
41
-
42
- def content_get
43
- content_get_fetch(@url)
44
- end
45
-
46
- def content_get_fetch(uri, limit = 10)
47
- raise 'too many HTTP redirects' if limit.zero?
48
-
49
- response = Net::HTTP.get_response(URI(uri))
50
-
51
- case response
52
- when Net::HTTPSuccess then
53
- response.body
54
- when Net::HTTPRedirection then
55
- content_get_fetch(response['location'], limit - 1)
56
- else
57
- response.value
58
- end
59
- end
60
-
61
- def content_hash
62
- return content_post if @url[:method] == :post
63
-
64
- raise "Unknown URL format: #{@url}"
65
- end
66
-
67
- def content_post
68
- HTTPClient.new.post_content(@url[:url], @url[:params].merge(follow_redirect: true))
69
- end
29
+ def log_content(content, suffix = '')
30
+ path = log_file(suffix)
70
31
 
71
- def log_content(content)
72
- path = log_file
73
32
  return unless path
74
33
  File.open(path, 'wb') { |file| file.write(content) }
75
34
  end
76
35
 
77
- def log_file
36
+ def log_file(suffix)
78
37
  dir = log_parsers_dir
79
38
  return nil unless dir
80
- f = ::File.join(dir, "#{self.class.name.parameterize}.log")
39
+ f = ::File.join(dir, "#{self.class.name.parameterize}#{suffix}.log")
81
40
  FileUtils.mkdir_p(File.dirname(f))
82
41
  f
83
42
  end
@@ -89,9 +48,8 @@ module Aranha
89
48
  end
90
49
 
91
50
  def rails_root_exist?
92
- klass = Module.const_get('Rails')
93
- return false unless klass.is_a?(Class)
94
- klass.respond_to?(:root)
51
+ ::Rails.root
52
+ true
95
53
  rescue NameError
96
54
  return false
97
55
  end
data/lib/aranha/parsers/source_address.rb ADDED
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require 'active_support/core_ext/module/delegation'
5
+ require 'aranha/parsers/source_address/hash_http_post'
6
+ require 'aranha/parsers/source_address/http_get'
7
+ require 'aranha/parsers/source_address/file'
8
+
9
+ module Aranha
10
+ module Parsers
11
+ class SourceAddress
12
+ class << self
13
+ SUBS = [
14
+ ::Aranha::Parsers::SourceAddress::HashHttpPost,
15
+ ::Aranha::Parsers::SourceAddress::HttpGet,
16
+ ::Aranha::Parsers::SourceAddress::File
17
+ ].freeze
18
+
19
+ def detect_sub(source)
20
+ return source.sub if source.is_a?(self)
21
+ SUBS.each do |sub|
22
+ return sub.new(source) if sub.valid_source?(source)
23
+ end
24
+ raise "No content fetcher found for source \"#{source}\""
25
+ end
26
+
27
+ def deserialize(string)
28
+ new(string =~ %r{\A[a-z]+://} ? string.strip : ::YAML.load(string))
29
+ end
30
+
31
+ def from_file(path)
32
+ deserialize(::File.read(path))
33
+ end
34
+ end
35
+
36
+ attr_reader :sub
37
+
38
+ def initialize(source)
39
+ @sub = self.class.detect_sub(source)
40
+ end
41
+
42
+ delegate :content, :url, to: :sub
43
+
44
+ def serialize
45
+ sub.serialize.strip + "\n"
46
+ end
47
+ end
48
+ end
49
+ end
data/lib/aranha/parsers/source_address/file.rb ADDED
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/source_address/http_get'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class File < ::Aranha::Parsers::SourceAddress::HttpGet
9
+ SCHEME = 'file://'
10
+
11
+ class << self
12
+ def valid_source?(source)
13
+ source.to_s.start_with?(SCHEME + '/', '/')
14
+ end
15
+ end
16
+
17
+ def initialize(source)
18
+ super source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, '')
19
+ end
20
+
21
+ def url
22
+ "#{SCHEME}#{source}"
23
+ end
24
+
25
+ def content
26
+ ::File.open(source, &:read)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
data/lib/aranha/parsers/source_address/hash_http_post.rb ADDED
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'active_support/core_ext/hash/indifferent_access'
4
+ require 'httpclient'
5
+ require 'yaml'
6
+
7
+ module Aranha
8
+ module Parsers
9
+ class SourceAddress
10
+ class HashHttpPost
11
+ class << self
12
+ def valid_source?(source)
13
+ source.is_a?(::Hash) && source.with_indifferent_access.key?(:url)
14
+ end
15
+ end
16
+
17
+ attr_reader :source
18
+
19
+ def initialize(source)
20
+ @source = source.with_indifferent_access
21
+ end
22
+
23
+ def ==(other)
24
+ self.class == other.class && source == other.source
25
+ end
26
+
27
+ def url
28
+ source.fetch(:url)
29
+ end
30
+
31
+ def serialize
32
+ source.to_yaml
33
+ end
34
+
35
+ def content
36
+ HTTPClient.new.post_content(
37
+ source[:url],
38
+ source[:params].merge(follow_redirect: true)
39
+ )
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
data/lib/aranha/parsers/source_address/http_get.rb ADDED
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class HttpGet
9
+ class << self
10
+ def valid_source?(source)
11
+ source.to_s.start_with?('http://')
12
+ end
13
+ end
14
+
15
+ attr_reader :source
16
+
17
+ def initialize(source)
18
+ @source = source.to_s
19
+ end
20
+
21
+ def ==(other)
22
+ self.class == other.class && source == other.source
23
+ end
24
+
25
+ def url
26
+ source
27
+ end
28
+
29
+ def content
30
+ content_fetch(url)
31
+ end
32
+
33
+ def serialize
34
+ url
35
+ end
36
+
37
+ private
38
+
39
+ def content_fetch(uri, limit = 10)
40
+ raise 'too many HTTP redirects' if limit.zero?
41
+
42
+ response = Net::HTTP.get_response(URI(uri))
43
+
44
+ case response
45
+ when Net::HTTPSuccess then
46
+ response.body
47
+ when Net::HTTPRedirection then
48
+ content_fetch(response['location'], limit - 1)
49
+ else
50
+ response.value
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
data/lib/aranha/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.6.0'
4
+ VERSION = '0.7.0'
5
5
  end
data/test/test_helper.rb CHANGED
@@ -3,9 +3,9 @@
3
3
  # Configure Rails Environment
4
4
  ENV['RAILS_ENV'] = 'test'
5
5
 
6
- require File.expand_path('../test/dummy/config/environment.rb', __dir__)
6
+ require File.expand_path('../spec/support/rails_apps/config/environment.rb', __dir__)
7
7
  ActiveRecord::Migrator.migrations_paths = [
8
- File.expand_path('../test/dummy/db/migrate', __dir__)
8
+ File.expand_path('../spec/support/rails_app/db/migrate', __dir__)
9
9
  ]
10
10
  require 'rails/test_help'
11
11
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-04 00:00:00.000000000 Z
11
+ date: 2019-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -86,6 +86,20 @@ dependencies:
86
86
  - - ">="
87
87
  - !ruby/object:Gem::Version
88
88
  version: 3.142.3
89
+ - !ruby/object:Gem::Dependency
90
+ name: rspec
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '3.8'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '3.8'
89
103
  - !ruby/object:Gem::Dependency
90
104
  name: sqlite3
91
105
  requirement: !ruby/object:Gem::Requirement
@@ -135,6 +149,10 @@ files:
135
149
  - lib/aranha/parsers/html/node/base.rb
136
150
  - lib/aranha/parsers/html/node/default.rb
137
151
  - lib/aranha/parsers/invalid_state_exception.rb
152
+ - lib/aranha/parsers/source_address.rb
153
+ - lib/aranha/parsers/source_address/file.rb
154
+ - lib/aranha/parsers/source_address/hash_http_post.rb
155
+ - lib/aranha/parsers/source_address/http_get.rb
138
156
  - lib/aranha/parsers/spec/source_target_fixtures.rb
139
157
  - lib/aranha/parsers/spec/source_target_fixtures_example.rb
140
158
  - lib/aranha/processor.rb
@@ -144,42 +162,6 @@ files:
144
162
  - lib/aranha/version.rb
145
163
  - lib/tasks/aranha_tasks.rake
146
164
  - test/aranha_test.rb
147
- - test/dummy/README.rdoc
148
- - test/dummy/Rakefile
149
- - test/dummy/app/assets/javascripts/application.js
150
- - test/dummy/app/assets/stylesheets/application.css
151
- - test/dummy/app/controllers/application_controller.rb
152
- - test/dummy/app/helpers/application_helper.rb
153
- - test/dummy/app/views/layouts/application.html.erb
154
- - test/dummy/bin/bundle
155
- - test/dummy/bin/rails
156
- - test/dummy/bin/rake
157
- - test/dummy/bin/setup
158
- - test/dummy/config.ru
159
- - test/dummy/config/application.rb
160
- - test/dummy/config/boot.rb
161
- - test/dummy/config/database.yml
162
- - test/dummy/config/environment.rb
163
- - test/dummy/config/environments/development.rb
164
- - test/dummy/config/environments/production.rb
165
- - test/dummy/config/environments/test.rb
166
- - test/dummy/config/initializers/assets.rb
167
- - test/dummy/config/initializers/backtrace_silencers.rb
168
- - test/dummy/config/initializers/cookies_serializer.rb
169
- - test/dummy/config/initializers/filter_parameter_logging.rb
170
- - test/dummy/config/initializers/inflections.rb
171
- - test/dummy/config/initializers/mime_types.rb
172
- - test/dummy/config/initializers/session_store.rb
173
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
174
- - test/dummy/config/initializers/wrap_parameters.rb
175
- - test/dummy/config/locales/en.yml
176
- - test/dummy/config/routes.rb
177
- - test/dummy/config/secrets.yml
178
- - test/dummy/db/schema.rb
179
- - test/dummy/public/404.html
180
- - test/dummy/public/422.html
181
- - test/dummy/public/500.html
182
- - test/dummy/public/favicon.ico
183
165
  - test/integration/navigation_test.rb
184
166
  - test/test_helper.rb
185
167
  homepage:
@@ -207,42 +189,6 @@ signing_key:
207
189
  specification_version: 4
208
190
  summary: Rails utilities for web crawling.
209
191
  test_files:
210
- - test/dummy/Rakefile
211
- - test/dummy/README.rdoc
212
- - test/dummy/config.ru
213
- - test/dummy/config/boot.rb
214
- - test/dummy/config/database.yml
215
- - test/dummy/config/secrets.yml
216
- - test/dummy/config/locales/en.yml
217
- - test/dummy/config/application.rb
218
- - test/dummy/config/environments/development.rb
219
- - test/dummy/config/environments/test.rb
220
- - test/dummy/config/environments/production.rb
221
- - test/dummy/config/environment.rb
222
- - test/dummy/config/routes.rb
223
- - test/dummy/config/initializers/assets.rb
224
- - test/dummy/config/initializers/cookies_serializer.rb
225
- - test/dummy/config/initializers/inflections.rb
226
- - test/dummy/config/initializers/session_store.rb
227
- - test/dummy/config/initializers/wrap_parameters.rb
228
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
229
- - test/dummy/config/initializers/filter_parameter_logging.rb
230
- - test/dummy/config/initializers/backtrace_silencers.rb
231
- - test/dummy/config/initializers/mime_types.rb
232
- - test/dummy/db/schema.rb
233
- - test/dummy/app/views/layouts/application.html.erb
234
- - test/dummy/app/controllers/application_controller.rb
235
- - test/dummy/app/helpers/application_helper.rb
236
- - test/dummy/app/assets/stylesheets/application.css
237
- - test/dummy/app/assets/javascripts/application.js
238
- - test/dummy/public/422.html
239
- - test/dummy/public/404.html
240
- - test/dummy/public/favicon.ico
241
- - test/dummy/public/500.html
242
- - test/dummy/bin/bundle
243
- - test/dummy/bin/setup
244
- - test/dummy/bin/rails
245
- - test/dummy/bin/rake
246
192
  - test/aranha_test.rb
247
193
  - test/test_helper.rb
248
194
  - test/integration/navigation_test.rb