aranha 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/Rakefile +7 -6
  3. data/lib/aranha/fixtures/download.rb +3 -1
  4. data/lib/aranha/parsers/base.rb +13 -55
  5. data/lib/aranha/parsers/source_address.rb +49 -0
  6. data/lib/aranha/parsers/source_address/file.rb +31 -0
  7. data/lib/aranha/parsers/source_address/hash_http_post.rb +44 -0
  8. data/lib/aranha/parsers/source_address/http_get.rb +56 -0
  9. data/lib/aranha/version.rb +1 -1
  10. data/test/test_helper.rb +2 -2
  11. metadata +20 -74
  12. data/test/dummy/README.rdoc +0 -28
  13. data/test/dummy/Rakefile +0 -8
  14. data/test/dummy/app/assets/javascripts/application.js +0 -13
  15. data/test/dummy/app/assets/stylesheets/application.css +0 -15
  16. data/test/dummy/app/controllers/application_controller.rb +0 -7
  17. data/test/dummy/app/helpers/application_helper.rb +0 -4
  18. data/test/dummy/app/views/layouts/application.html.erb +0 -12
  19. data/test/dummy/bin/bundle +0 -5
  20. data/test/dummy/bin/rails +0 -6
  21. data/test/dummy/bin/rake +0 -6
  22. data/test/dummy/bin/setup +0 -31
  23. data/test/dummy/config.ru +0 -6
  24. data/test/dummy/config/application.rb +0 -27
  25. data/test/dummy/config/boot.rb +0 -7
  26. data/test/dummy/config/database.yml +0 -25
  27. data/test/dummy/config/environment.rb +0 -7
  28. data/test/dummy/config/environments/development.rb +0 -43
  29. data/test/dummy/config/environments/production.rb +0 -82
  30. data/test/dummy/config/environments/test.rb +0 -44
  31. data/test/dummy/config/initializers/assets.rb +0 -13
  32. data/test/dummy/config/initializers/backtrace_silencers.rb +0 -11
  33. data/test/dummy/config/initializers/cookies_serializer.rb +0 -5
  34. data/test/dummy/config/initializers/filter_parameter_logging.rb +0 -6
  35. data/test/dummy/config/initializers/inflections.rb +0 -18
  36. data/test/dummy/config/initializers/mime_types.rb +0 -6
  37. data/test/dummy/config/initializers/session_store.rb +0 -5
  38. data/test/dummy/config/initializers/to_time_preserves_timezone.rb +0 -12
  39. data/test/dummy/config/initializers/wrap_parameters.rb +0 -16
  40. data/test/dummy/config/locales/en.yml +0 -23
  41. data/test/dummy/config/routes.rb +0 -5
  42. data/test/dummy/config/secrets.yml +0 -22
  43. data/test/dummy/db/schema.rb +0 -24
  44. data/test/dummy/public/404.html +0 -67
  45. data/test/dummy/public/422.html +0 -67
  46. data/test/dummy/public/500.html +0 -66
  47. data/test/dummy/public/favicon.ico +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d65a2ccecd09ab619dea2d76306dfc48c7aad83b0b111b4544afb8492a97ef04
4
- data.tar.gz: fd1d57c4d7ec4a22f8bd5829b8ad9a76838b729d54440b68c559cdf8ba5f9482
3
+ metadata.gz: d8acb62bae7e2cede5a9d658d705bfbc28229cc9a595207d9f6f7796a7681e1f
4
+ data.tar.gz: debd7afd142d1de8d2ff38f6f7cf35cf89b82174ce278f7f2247324cfeb53b41
5
5
  SHA512:
6
- metadata.gz: 3b34ee01cfc1f6be364017da680c64b4abf7a5bc303b40edfdedf7532cdfd69cbe17d57349fac4fbc8881dc8db336f8a5738aa1976e9e5a3f60a09b8c4808f77
7
- data.tar.gz: c5129c06f6ec81bfd02da79ccfa64b1187abf644e2271ac84ea30d24fc02b1608c13b74fbf3f962ecbe4a3d69955c08c51fd0ad1f692395e82ac15a78978f638
6
+ metadata.gz: 7ce9a11b1c49756f67dae806d065cdd0fbdbe8f9b2e132ec4cc2efad55f198fd2c8fbea1c21a27cef9e22398a0632d8efef03a02c590a57a9abc2d7024c7ef23
7
+ data.tar.gz: 3dc367711f35207d2f693f01fe4ae2f485eb79f7052fbccd75876e4a90c8a2e98624947980135dc764b95b7f30b791227002f7fee609638ce5a961ad83c04e7a
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ RDoc::Task.new(:rdoc) do |rdoc|
16
16
  rdoc.rdoc_files.include('lib/**/*.rb')
17
17
  end
18
18
 
19
- APP_RAKEFILE = File.expand_path('test/dummy/Rakefile', __dir__)
19
+ APP_RAKEFILE = File.expand_path('spec/support/rails_app/Rakefile', __dir__)
20
20
  load 'rails/tasks/engine.rake'
21
21
 
22
22
  load 'rails/tasks/statistics.rake'
@@ -25,11 +25,12 @@ Bundler::GemHelper.install_tasks
25
25
 
26
26
  require 'rake/testtask'
27
27
 
28
- Rake::TestTask.new(:test) do |t|
29
- t.libs << 'lib'
30
- t.libs << 'test'
31
- t.pattern = 'test/**/*_test.rb'
32
- t.verbose = false
28
+ begin
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec)
31
+ task test: :spec
32
+ rescue LoadError
33
+ # no rspec available
33
34
  end
34
35
 
35
36
  task default: :test
data/lib/aranha/fixtures/download.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'aranha/parsers/source_address'
4
+
3
5
  module Aranha
4
6
  module Fixtures
5
7
  class Download
@@ -44,7 +46,7 @@ module Aranha
44
46
  end
45
47
 
46
48
  def url(file)
47
- File.read(file).strip
49
+ ::Aranha::Parsers::SourceAddress.from_file(file)
48
50
  end
49
51
 
50
52
  def target(file)
data/lib/aranha/parsers/base.rb CHANGED
@@ -2,82 +2,41 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'fileutils'
5
+ require 'aranha/parsers/source_address'
5
6
 
6
7
  module Aranha
7
8
  module Parsers
8
9
  class Base
9
10
  LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
10
11
 
12
+ attr_reader :source_address
13
+
11
14
  def initialize(url)
12
- @url = url
15
+ @source_address = ::Aranha::Parsers::SourceAddress.new(url)
16
+ log_content(source_address.serialize, '-source-address')
13
17
  end
14
18
 
15
- def url
16
- r = (@url.is_a?(Hash) ? @url.fetch(:url) : @url)
17
- r.to_s.gsub(%r{/+$}, '')
18
- end
19
+ delegate :url, to: :source_address
19
20
 
20
21
  def content
21
- s = content_by_url_type
22
+ s = source_address.content
22
23
  log_content(s)
23
24
  s
24
25
  end
25
26
 
26
27
  private
27
28
 
28
- def content_by_url_type
29
- if @url.is_a?(Hash)
30
- content_hash
31
- elsif /^http/ =~ @url
32
- content_get
33
- else
34
- content_file
35
- end
36
- end
37
-
38
- def content_file
39
- ::File.open(@url.to_s.gsub(%r{\Afile://}, ''), &:read)
40
- end
41
-
42
- def content_get
43
- content_get_fetch(@url)
44
- end
45
-
46
- def content_get_fetch(uri, limit = 10)
47
- raise 'too many HTTP redirects' if limit.zero?
48
-
49
- response = Net::HTTP.get_response(URI(uri))
50
-
51
- case response
52
- when Net::HTTPSuccess then
53
- response.body
54
- when Net::HTTPRedirection then
55
- content_get_fetch(response['location'], limit - 1)
56
- else
57
- response.value
58
- end
59
- end
60
-
61
- def content_hash
62
- return content_post if @url[:method] == :post
63
-
64
- raise "Unknown URL format: #{@url}"
65
- end
66
-
67
- def content_post
68
- HTTPClient.new.post_content(@url[:url], @url[:params].merge(follow_redirect: true))
69
- end
29
+ def log_content(content, suffix = '')
30
+ path = log_file(suffix)
70
31
 
71
- def log_content(content)
72
- path = log_file
73
32
  return unless path
74
33
  File.open(path, 'wb') { |file| file.write(content) }
75
34
  end
76
35
 
77
- def log_file
36
+ def log_file(suffix)
78
37
  dir = log_parsers_dir
79
38
  return nil unless dir
80
- f = ::File.join(dir, "#{self.class.name.parameterize}.log")
39
+ f = ::File.join(dir, "#{self.class.name.parameterize}#{suffix}.log")
81
40
  FileUtils.mkdir_p(File.dirname(f))
82
41
  f
83
42
  end
@@ -89,9 +48,8 @@ module Aranha
89
48
  end
90
49
 
91
50
  def rails_root_exist?
92
- klass = Module.const_get('Rails')
93
- return false unless klass.is_a?(Class)
94
- klass.respond_to?(:root)
51
+ ::Rails.root
52
+ true
95
53
  rescue NameError
96
54
  return false
97
55
  end
data/lib/aranha/parsers/source_address.rb ADDED
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require 'active_support/core_ext/module/delegation'
5
+ require 'aranha/parsers/source_address/hash_http_post'
6
+ require 'aranha/parsers/source_address/http_get'
7
+ require 'aranha/parsers/source_address/file'
8
+
9
+ module Aranha
10
+ module Parsers
11
+ class SourceAddress
12
+ class << self
13
+ SUBS = [
14
+ ::Aranha::Parsers::SourceAddress::HashHttpPost,
15
+ ::Aranha::Parsers::SourceAddress::HttpGet,
16
+ ::Aranha::Parsers::SourceAddress::File
17
+ ].freeze
18
+
19
+ def detect_sub(source)
20
+ return source.sub if source.is_a?(self)
21
+ SUBS.each do |sub|
22
+ return sub.new(source) if sub.valid_source?(source)
23
+ end
24
+ raise "No content fetcher found for source \"#{source}\""
25
+ end
26
+
27
+ def deserialize(string)
28
+ new(string =~ %r{\A[a-z]+://} ? string.strip : ::YAML.load(string))
29
+ end
30
+
31
+ def from_file(path)
32
+ deserialize(::File.read(path))
33
+ end
34
+ end
35
+
36
+ attr_reader :sub
37
+
38
+ def initialize(source)
39
+ @sub = self.class.detect_sub(source)
40
+ end
41
+
42
+ delegate :content, :url, to: :sub
43
+
44
+ def serialize
45
+ sub.serialize.strip + "\n"
46
+ end
47
+ end
48
+ end
49
+ end
data/lib/aranha/parsers/source_address/file.rb ADDED
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/source_address/http_get'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class File < ::Aranha::Parsers::SourceAddress::HttpGet
9
+ SCHEME = 'file://'
10
+
11
+ class << self
12
+ def valid_source?(source)
13
+ source.to_s.start_with?(SCHEME + '/', '/')
14
+ end
15
+ end
16
+
17
+ def initialize(source)
18
+ super source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, '')
19
+ end
20
+
21
+ def url
22
+ "#{SCHEME}#{source}"
23
+ end
24
+
25
+ def content
26
+ ::File.open(source, &:read)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
data/lib/aranha/parsers/source_address/hash_http_post.rb ADDED
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'active_support/core_ext/hash/indifferent_access'
4
+ require 'httpclient'
5
+ require 'yaml'
6
+
7
+ module Aranha
8
+ module Parsers
9
+ class SourceAddress
10
+ class HashHttpPost
11
+ class << self
12
+ def valid_source?(source)
13
+ source.is_a?(::Hash) && source.with_indifferent_access.key?(:url)
14
+ end
15
+ end
16
+
17
+ attr_reader :source
18
+
19
+ def initialize(source)
20
+ @source = source.with_indifferent_access
21
+ end
22
+
23
+ def ==(other)
24
+ self.class == other.class && source == other.source
25
+ end
26
+
27
+ def url
28
+ source.fetch(:url)
29
+ end
30
+
31
+ def serialize
32
+ source.to_yaml
33
+ end
34
+
35
+ def content
36
+ HTTPClient.new.post_content(
37
+ source[:url],
38
+ source[:params].merge(follow_redirect: true)
39
+ )
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
data/lib/aranha/parsers/source_address/http_get.rb ADDED
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class HttpGet
9
+ class << self
10
+ def valid_source?(source)
11
+ source.to_s.start_with?('http://')
12
+ end
13
+ end
14
+
15
+ attr_reader :source
16
+
17
+ def initialize(source)
18
+ @source = source.to_s
19
+ end
20
+
21
+ def ==(other)
22
+ self.class == other.class && source == other.source
23
+ end
24
+
25
+ def url
26
+ source
27
+ end
28
+
29
+ def content
30
+ content_fetch(url)
31
+ end
32
+
33
+ def serialize
34
+ url
35
+ end
36
+
37
+ private
38
+
39
+ def content_fetch(uri, limit = 10)
40
+ raise 'too many HTTP redirects' if limit.zero?
41
+
42
+ response = Net::HTTP.get_response(URI(uri))
43
+
44
+ case response
45
+ when Net::HTTPSuccess then
46
+ response.body
47
+ when Net::HTTPRedirection then
48
+ content_fetch(response['location'], limit - 1)
49
+ else
50
+ response.value
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
data/lib/aranha/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.6.0'
4
+ VERSION = '0.7.0'
5
5
  end
data/test/test_helper.rb CHANGED
@@ -3,9 +3,9 @@
3
3
  # Configure Rails Environment
4
4
  ENV['RAILS_ENV'] = 'test'
5
5
 
6
- require File.expand_path('../test/dummy/config/environment.rb', __dir__)
6
+ require File.expand_path('../spec/support/rails_apps/config/environment.rb', __dir__)
7
7
  ActiveRecord::Migrator.migrations_paths = [
8
- File.expand_path('../test/dummy/db/migrate', __dir__)
8
+ File.expand_path('../spec/support/rails_app/db/migrate', __dir__)
9
9
  ]
10
10
  require 'rails/test_help'
11
11
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-04 00:00:00.000000000 Z
11
+ date: 2019-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -86,6 +86,20 @@ dependencies:
86
86
  - - ">="
87
87
  - !ruby/object:Gem::Version
88
88
  version: 3.142.3
89
+ - !ruby/object:Gem::Dependency
90
+ name: rspec
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '3.8'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '3.8'
89
103
  - !ruby/object:Gem::Dependency
90
104
  name: sqlite3
91
105
  requirement: !ruby/object:Gem::Requirement
@@ -135,6 +149,10 @@ files:
135
149
  - lib/aranha/parsers/html/node/base.rb
136
150
  - lib/aranha/parsers/html/node/default.rb
137
151
  - lib/aranha/parsers/invalid_state_exception.rb
152
+ - lib/aranha/parsers/source_address.rb
153
+ - lib/aranha/parsers/source_address/file.rb
154
+ - lib/aranha/parsers/source_address/hash_http_post.rb
155
+ - lib/aranha/parsers/source_address/http_get.rb
138
156
  - lib/aranha/parsers/spec/source_target_fixtures.rb
139
157
  - lib/aranha/parsers/spec/source_target_fixtures_example.rb
140
158
  - lib/aranha/processor.rb
@@ -144,42 +162,6 @@ files:
144
162
  - lib/aranha/version.rb
145
163
  - lib/tasks/aranha_tasks.rake
146
164
  - test/aranha_test.rb
147
- - test/dummy/README.rdoc
148
- - test/dummy/Rakefile
149
- - test/dummy/app/assets/javascripts/application.js
150
- - test/dummy/app/assets/stylesheets/application.css
151
- - test/dummy/app/controllers/application_controller.rb
152
- - test/dummy/app/helpers/application_helper.rb
153
- - test/dummy/app/views/layouts/application.html.erb
154
- - test/dummy/bin/bundle
155
- - test/dummy/bin/rails
156
- - test/dummy/bin/rake
157
- - test/dummy/bin/setup
158
- - test/dummy/config.ru
159
- - test/dummy/config/application.rb
160
- - test/dummy/config/boot.rb
161
- - test/dummy/config/database.yml
162
- - test/dummy/config/environment.rb
163
- - test/dummy/config/environments/development.rb
164
- - test/dummy/config/environments/production.rb
165
- - test/dummy/config/environments/test.rb
166
- - test/dummy/config/initializers/assets.rb
167
- - test/dummy/config/initializers/backtrace_silencers.rb
168
- - test/dummy/config/initializers/cookies_serializer.rb
169
- - test/dummy/config/initializers/filter_parameter_logging.rb
170
- - test/dummy/config/initializers/inflections.rb
171
- - test/dummy/config/initializers/mime_types.rb
172
- - test/dummy/config/initializers/session_store.rb
173
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
174
- - test/dummy/config/initializers/wrap_parameters.rb
175
- - test/dummy/config/locales/en.yml
176
- - test/dummy/config/routes.rb
177
- - test/dummy/config/secrets.yml
178
- - test/dummy/db/schema.rb
179
- - test/dummy/public/404.html
180
- - test/dummy/public/422.html
181
- - test/dummy/public/500.html
182
- - test/dummy/public/favicon.ico
183
165
  - test/integration/navigation_test.rb
184
166
  - test/test_helper.rb
185
167
  homepage:
@@ -207,42 +189,6 @@ signing_key:
207
189
  specification_version: 4
208
190
  summary: Rails utilities for web crawling.
209
191
  test_files:
210
- - test/dummy/Rakefile
211
- - test/dummy/README.rdoc
212
- - test/dummy/config.ru
213
- - test/dummy/config/boot.rb
214
- - test/dummy/config/database.yml
215
- - test/dummy/config/secrets.yml
216
- - test/dummy/config/locales/en.yml
217
- - test/dummy/config/application.rb
218
- - test/dummy/config/environments/development.rb
219
- - test/dummy/config/environments/test.rb
220
- - test/dummy/config/environments/production.rb
221
- - test/dummy/config/environment.rb
222
- - test/dummy/config/routes.rb
223
- - test/dummy/config/initializers/assets.rb
224
- - test/dummy/config/initializers/cookies_serializer.rb
225
- - test/dummy/config/initializers/inflections.rb
226
- - test/dummy/config/initializers/session_store.rb
227
- - test/dummy/config/initializers/wrap_parameters.rb
228
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
229
- - test/dummy/config/initializers/filter_parameter_logging.rb
230
- - test/dummy/config/initializers/backtrace_silencers.rb
231
- - test/dummy/config/initializers/mime_types.rb
232
- - test/dummy/db/schema.rb
233
- - test/dummy/app/views/layouts/application.html.erb
234
- - test/dummy/app/controllers/application_controller.rb
235
- - test/dummy/app/helpers/application_helper.rb
236
- - test/dummy/app/assets/stylesheets/application.css
237
- - test/dummy/app/assets/javascripts/application.js
238
- - test/dummy/public/422.html
239
- - test/dummy/public/404.html
240
- - test/dummy/public/favicon.ico
241
- - test/dummy/public/500.html
242
- - test/dummy/bin/bundle
243
- - test/dummy/bin/setup
244
- - test/dummy/bin/rails
245
- - test/dummy/bin/rake
246
192
  - test/aranha_test.rb
247
193
  - test/test_helper.rb
248
194
  - test/integration/navigation_test.rb