aranha 0.10.1 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77b72be05123a10faead5d82b596c579668f00716754447a025feb6635c76916
4
- data.tar.gz: 5d5f6ccf53c73becdc225060175303259ec11c257dacc6c28eaff1426a0946c5
3
+ metadata.gz: dea7f9194a220cad9e2f168f4fa9a32b17353f45233eef8d0698788535317228
4
+ data.tar.gz: 40fa4399386bd6b1f2e52fc914cbb8fb436099f29c5493f404aa219b40cfb586
5
5
  SHA512:
6
- metadata.gz: bcec515206067ecb5a7a1d749c76d594cce289dc33df4dcb57b8065b390ffa763c4c1838ef1bb9e587f318208256f9ba18c667de64a66415029b841ec965adb7
7
- data.tar.gz: 0fdf294b6ee26c63a039c77cfa79ec5d16136b61ca89159817c885a31bb2bd36f8fa5a9e25c47a0c33dbd2a2cee6db8302087d671701ffea55c5a1fd32cc3399
6
+ metadata.gz: 1955d0bbaa9fffbe63c24814bad2e7f97a1c960e1b4e5041ca1775fc10a12e8eb3653c9d73c706011805de7af4608cad3a2f5a10548ae3434d91a199eb0b9255
7
+ data.tar.gz: 532b2cd21a97ae55873aa996815947cf3759212b0279081dac374a23d2f9f390b059b422dd9ee5e2a2627e0e66de524702ffd817c0910063b8c376b22e47b1f3
@@ -49,6 +49,10 @@ module Aranha
49
49
  option_value(:headless)
50
50
  end
51
51
 
52
+ def user_agent
53
+ options.fetch(:user_agent)
54
+ end
55
+
52
56
  private
53
57
 
54
58
  def option_value(key)
@@ -19,6 +19,7 @@ module Aranha
19
19
  r.add_argument('--headless') if headless?
20
20
  r.add_argument('--disable-popup-blocking')
21
21
  r.add_argument('--disable-translate')
22
+ r.add_argument("user-agent=#{user_agent}") if user_agent.present?
22
23
  r.add_preference(:download, prompt_for_download: false, default_directory: downloads_dir)
23
24
  r
24
25
  end
@@ -40,6 +40,7 @@ module Aranha
40
40
  r['browser.download.folderList'] = 2
41
41
  r['browser.helperApps.neverAsk.saveToDisk'] = auto_download_mime_types.join(';')
42
42
  r['pdfjs.disabled'] = true
43
+ r['general.useragent.override'] = user_agent if user_agent.present?
43
44
  r
44
45
  end
45
46
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.10.1'
4
+ VERSION = '0.11.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-17 00:00:00.000000000 Z
11
+ date: 2019-09-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 3.4.41.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: aranha-parsers
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.1'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: eac_ruby_utils
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -149,23 +163,6 @@ files:
149
163
  - lib/aranha/engine.rb
150
164
  - lib/aranha/fixtures.rb
151
165
  - lib/aranha/fixtures/download.rb
152
- - lib/aranha/parsers.rb
153
- - lib/aranha/parsers/base.rb
154
- - lib/aranha/parsers/html.rb
155
- - lib/aranha/parsers/html/base.rb
156
- - lib/aranha/parsers/html/item.rb
157
- - lib/aranha/parsers/html/item_list.rb
158
- - lib/aranha/parsers/html/node.rb
159
- - lib/aranha/parsers/html/node/base.rb
160
- - lib/aranha/parsers/html/node/default.rb
161
- - lib/aranha/parsers/invalid_state_exception.rb
162
- - lib/aranha/parsers/source_address.rb
163
- - lib/aranha/parsers/source_address/file.rb
164
- - lib/aranha/parsers/source_address/hash_http_get.rb
165
- - lib/aranha/parsers/source_address/hash_http_post.rb
166
- - lib/aranha/parsers/source_address/http_get.rb
167
- - lib/aranha/parsers/spec/source_target_fixtures.rb
168
- - lib/aranha/parsers/spec/source_target_fixtures_example.rb
169
166
  - lib/aranha/processor.rb
170
167
  - lib/aranha/selenium.rb
171
168
  - lib/aranha/selenium/driver_factory.rb
@@ -205,5 +202,5 @@ specification_version: 4
205
202
  summary: Rails utilities for web crawling.
206
203
  test_files:
207
204
  - test/aranha_test.rb
208
- - test/test_helper.rb
209
205
  - test/integration/navigation_test.rb
206
+ - test/test_helper.rb
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
- module Aranha
3
- module Parsers
4
- require 'aranha/parsers/base'
5
- require 'aranha/parsers/html'
6
- require 'aranha/parsers/invalid_state_exception'
7
- require 'aranha/parsers/source_address'
8
- end
9
- end
@@ -1,58 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'open-uri'
4
- require 'fileutils'
5
- require 'aranha/parsers/source_address'
6
-
7
- module Aranha
8
- module Parsers
9
- class Base
10
- LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
11
-
12
- attr_reader :source_address
13
-
14
- def initialize(url)
15
- @source_address = ::Aranha::Parsers::SourceAddress.new(url)
16
- log_content(source_address.serialize, '-source-address')
17
- end
18
-
19
- delegate :url, to: :source_address
20
-
21
- def content
22
- s = source_address.content
23
- log_content(s)
24
- s
25
- end
26
-
27
- private
28
-
29
- def log_content(content, suffix = '')
30
- path = log_file(suffix)
31
-
32
- return unless path
33
- File.open(path, 'wb') { |file| file.write(content) }
34
- end
35
-
36
- def log_file(suffix)
37
- dir = log_parsers_dir
38
- return nil unless dir
39
- f = ::File.join(dir, "#{self.class.name.parameterize}#{suffix}.log")
40
- FileUtils.mkdir_p(File.dirname(f))
41
- f
42
- end
43
-
44
- def log_parsers_dir
45
- return ENV[LOG_DIR_ENVVAR] if ENV[LOG_DIR_ENVVAR]
46
- return ::Rails.root.join('log', 'parsers') if rails_root_exist?
47
- nil
48
- end
49
-
50
- def rails_root_exist?
51
- ::Rails.root
52
- true
53
- rescue NameError
54
- return false
55
- end
56
- end
57
- end
58
- end
@@ -1,11 +0,0 @@
1
- # frozen_string_literal: true
2
- module Aranha
3
- module Parsers
4
- module Html
5
- require 'aranha/parsers/html/base'
6
- require 'aranha/parsers/html/item'
7
- require 'aranha/parsers/html/item_list'
8
- require 'aranha/parsers/html/node'
9
- end
10
- end
11
- end
@@ -1,47 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
- require 'aranha/parsers/base'
5
- require 'aranha/parsers/html/node/default'
6
-
7
- module Aranha
8
- module Parsers
9
- module Html
10
- class Base < ::Aranha::Parsers::Base
11
- class << self
12
- def fields
13
- @fields ||= []
14
- @fields.dup
15
- end
16
-
17
- def field(name, type, xpath)
18
- @fields ||= []
19
- @fields << Field.new(name, type, xpath)
20
- end
21
-
22
- Field = Struct.new(:name, :type, :xpath)
23
- end
24
-
25
- def nokogiri
26
- @nokogiri ||= Nokogiri::HTML(content, &:noblanks)
27
- end
28
-
29
- protected
30
-
31
- def node_parser_class
32
- ::Aranha::Parsers::Html::Node::Default
33
- end
34
-
35
- private
36
-
37
- def node_parser
38
- @node_parser ||= node_parser_class.new(fields)
39
- end
40
-
41
- def fields
42
- self.class.fields.map { |f| [f.name, f.type, f.xpath] }
43
- end
44
- end
45
- end
46
- end
47
- end
@@ -1,23 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/html/base'
4
-
5
- module Aranha
6
- module Parsers
7
- module Html
8
- class Item < Base
9
- def data
10
- @data ||= node_parser.parse(item_node)
11
- end
12
-
13
- def item_node
14
- @item_node ||= begin
15
- r = item_xpath ? nokogiri.at_xpath(item_xpath) : nokogiri
16
- raise "Item node not found (Item xpath: #{item_xpath})" unless r
17
- r
18
- end
19
- end
20
- end
21
- end
22
- end
23
- end
@@ -1,25 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/html/base'
4
-
5
- module Aranha
6
- module Parsers
7
- module Html
8
- class ItemList < Base
9
- def data
10
- count = 0
11
- @data ||= nokogiri.xpath(items_xpath).map do |m|
12
- count += 1
13
- node_parser.parse(m)
14
- end
15
- rescue StandardError => e
16
- raise StandardError, "#{e.message} (Count: #{count})"
17
- end
18
-
19
- def items_xpath
20
- raise "Class #{self.class} has no method \"item_xpath\". Implement it"
21
- end
22
- end
23
- end
24
- end
25
- end
@@ -1,11 +0,0 @@
1
- # frozen_string_literal: true
2
- module Aranha
3
- module Parsers
4
- module Html
5
- module Node
6
- require 'aranha/parsers/html/node/base'
7
- require 'aranha/parsers/html/node/default'
8
- end
9
- end
10
- end
11
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module Parsers
5
- module Html
6
- module Node
7
- class Base
8
- attr_reader :fields
9
-
10
- def initialize(fields)
11
- @fields = fields
12
- end
13
-
14
- def parse(node)
15
- Hash[fields.map { |f| [f[0], parse_field(node, f[2], f[1])] }]
16
- end
17
-
18
- private
19
-
20
- def parse_field(node, xpath, parser_method)
21
- value_method = "#{parser_method}_value"
22
- return send(value_method, node, xpath) if respond_to?(value_method)
23
-
24
- raise "Method \"#{value_method}\" not found in #{self.class}"
25
- end
26
- end
27
- end
28
- end
29
- end
30
- end
@@ -1,93 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/html/node/base'
4
-
5
- module Aranha
6
- module Parsers
7
- module Html
8
- module Node
9
- class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def quoted_value(node, xpath)
19
- s = string_value(node, xpath)
20
- return '' unless s
21
-
22
- m = /\"([^\"]+)\"/.match(s)
23
- return m[1] if m
24
-
25
- ''
26
- end
27
-
28
- def integer_value(node, xpath)
29
- r = string_value(node, xpath)
30
- return nil if r.blank?
31
-
32
- m = /\d+/.match(r)
33
- raise "Integer not found in \"#{r}\"" unless m
34
-
35
- m[0].to_i
36
- end
37
-
38
- def integer_optional_value(node, xpath)
39
- r = string_value(node, xpath)
40
- m = /\d+/.match(r)
41
- m ? m[0].to_i : nil
42
- end
43
-
44
- def float_value(node, xpath)
45
- parse_float(node, xpath, true)
46
- end
47
-
48
- def float_optional_value(node, xpath)
49
- parse_float(node, xpath, false)
50
- end
51
-
52
- def array_value(node, xpath)
53
- r = node.xpath(xpath).map { |n| n.text.strip }
54
- r.join('|')
55
- end
56
-
57
- def join_value(node, xpath)
58
- m = ''
59
- node.xpath(xpath).each do |n|
60
- m << n.text.strip
61
- end
62
- m
63
- end
64
-
65
- def duration_value(node, xpath)
66
- m = /(\d+) m/.match(join_value(node, xpath))
67
- m ? m[1].to_i : nil
68
- end
69
-
70
- def regxep(node, xpath, pattern)
71
- s = string_value(node, xpath)
72
- m = pattern.match(s)
73
- return m if m
74
-
75
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
76
- end
77
-
78
- private
79
-
80
- def parse_float(node, xpath, required)
81
- s = string_value(node, xpath)
82
- m = /\d+(?:[\.\,](\d+))?/.match(s)
83
- if m
84
- m[0].sub(',', '.').to_f
85
- elsif required
86
- raise "Float value not found in \"#{s}\""
87
- end
88
- end
89
- end
90
- end
91
- end
92
- end
93
- end
@@ -1,8 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Aranha
4
- module Parsers
5
- class InvalidStateException < StandardError
6
- end
7
- end
8
- end
@@ -1,55 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'yaml'
4
- require 'active_support/core_ext/module/delegation'
5
- require 'aranha/parsers/source_address/hash_http_get'
6
- require 'aranha/parsers/source_address/hash_http_post'
7
- require 'aranha/parsers/source_address/http_get'
8
- require 'aranha/parsers/source_address/file'
9
-
10
- module Aranha
11
- module Parsers
12
- class SourceAddress
13
- class << self
14
- SUBS = [
15
- ::Aranha::Parsers::SourceAddress::HashHttpGet,
16
- ::Aranha::Parsers::SourceAddress::HashHttpPost,
17
- ::Aranha::Parsers::SourceAddress::HttpGet,
18
- ::Aranha::Parsers::SourceAddress::File
19
- ].freeze
20
-
21
- def detect_sub(source)
22
- return source.sub if source.is_a?(self)
23
- SUBS.each do |sub|
24
- return sub.new(source) if sub.valid_source?(source)
25
- end
26
- raise "No content fetcher found for source \"#{source}\""
27
- end
28
-
29
- def deserialize(string)
30
- new(string =~ %r{\A[a-z]+://} ? string.strip : ::YAML.load(string))
31
- end
32
-
33
- def from_file(path)
34
- deserialize(::File.read(path))
35
- end
36
- end
37
-
38
- attr_reader :sub
39
-
40
- def initialize(source)
41
- @sub = self.class.detect_sub(source)
42
- end
43
-
44
- delegate :content, :url, to: :sub
45
-
46
- def to_s
47
- sub.url
48
- end
49
-
50
- def serialize
51
- sub.serialize.strip + "\n"
52
- end
53
- end
54
- end
55
- end
@@ -1,31 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/source_address/http_get'
4
-
5
- module Aranha
6
- module Parsers
7
- class SourceAddress
8
- class File < ::Aranha::Parsers::SourceAddress::HttpGet
9
- SCHEME = 'file://'
10
-
11
- class << self
12
- def valid_source?(source)
13
- source.to_s.start_with?(SCHEME + '/', '/')
14
- end
15
- end
16
-
17
- def initialize(source)
18
- super source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, '')
19
- end
20
-
21
- def url
22
- "#{SCHEME}#{source}"
23
- end
24
-
25
- def content
26
- ::File.open(source, &:read)
27
- end
28
- end
29
- end
30
- end
31
- end
@@ -1,25 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'aranha/parsers/source_address/hash_http_post'
4
-
5
- module Aranha
6
- module Parsers
7
- class SourceAddress
8
- class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpPost
9
- class << self
10
- def valid_source?(source)
11
- source.is_a?(::Hash) &&
12
- source.with_indifferent_access[:method].to_s.downcase.strip == 'get'
13
- end
14
- end
15
-
16
- def content
17
- HTTPClient.new.get_content(
18
- source[:url],
19
- source[:params]
20
- )
21
- end
22
- end
23
- end
24
- end
25
- end
@@ -1,45 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'active_support/core_ext/hash/indifferent_access'
4
- require 'httpclient'
5
- require 'yaml'
6
-
7
- module Aranha
8
- module Parsers
9
- class SourceAddress
10
- class HashHttpPost
11
- class << self
12
- def valid_source?(source)
13
- source.is_a?(::Hash) &&
14
- source.with_indifferent_access[:method].to_s.downcase.strip == 'post'
15
- end
16
- end
17
-
18
- attr_reader :source
19
-
20
- def initialize(source)
21
- @source = source.with_indifferent_access
22
- end
23
-
24
- def ==(other)
25
- self.class == other.class && source == other.source
26
- end
27
-
28
- def url
29
- source.fetch(:url)
30
- end
31
-
32
- def serialize
33
- source.to_yaml
34
- end
35
-
36
- def content
37
- HTTPClient.new.post_content(
38
- source[:url],
39
- source[:params].merge(follow_redirect: true)
40
- )
41
- end
42
- end
43
- end
44
- end
45
- end
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'addressable'
4
- require 'net/http'
5
-
6
- module Aranha
7
- module Parsers
8
- class SourceAddress
9
- class HttpGet
10
- class << self
11
- def location_uri(source_uri, location)
12
- ::Addressable::URI.join(source_uri, location).to_s
13
- end
14
-
15
- def valid_source?(source)
16
- source.to_s =~ %r{\Ahttps?://}
17
- end
18
- end
19
-
20
- attr_reader :source
21
-
22
- def initialize(source)
23
- @source = source.to_s
24
- end
25
-
26
- def ==(other)
27
- self.class == other.class && source == other.source
28
- end
29
-
30
- def url
31
- source
32
- end
33
-
34
- def content
35
- content_fetch(url)
36
- end
37
-
38
- def serialize
39
- url
40
- end
41
-
42
- private
43
-
44
- def content_fetch(uri, limit = 10)
45
- raise 'too many HTTP redirects' if limit.zero?
46
-
47
- response = Net::HTTP.get_response(URI(uri))
48
-
49
- case response
50
- when Net::HTTPSuccess then
51
- response.body
52
- when Net::HTTPRedirection then
53
- content_fetch(self.class.location_uri(uri, response['location']), limit - 1)
54
- else
55
- response.value
56
- end
57
- end
58
- end
59
- end
60
- end
61
- end
@@ -1,67 +0,0 @@
1
- # encoding: UTF-8
2
- # frozen_string_literal: true
3
-
4
- require 'yaml'
5
-
6
- module Aranha
7
- module Spec
8
- # Lists pairs of source/target files in a directory.
9
- class SourceTargetFixtures
10
- class << self
11
- def source_target_basename(file)
12
- m = /^(.+)\.(?:source|target)(?:\..+)?$/.match(File.basename(file))
13
- m ? m[1] : nil
14
- end
15
- end
16
-
17
- attr_reader :fixtures_directory
18
-
19
- def initialize(fixtures_directory)
20
- @fixtures_directory = fixtures_directory
21
- end
22
-
23
- def source_target_files
24
- sources_targets_basenames.map do |basename|
25
- OpenStruct.new(source: source_file(basename), target: target_file(basename))
26
- end
27
- end
28
-
29
- def source_files
30
- r = []
31
- source_target_files.each do |st|
32
- r << st.source if st.source
33
- end
34
- r
35
- end
36
-
37
- def target_file(basename)
38
- fixture_file(basename, 'target')
39
- end
40
-
41
- def source_file(basename)
42
- fixture_file(basename, 'source')
43
- end
44
-
45
- private
46
-
47
- def fixture_file(basename, suffix)
48
- prefix = "#{basename}.#{suffix}"
49
- Dir.foreach(fixtures_directory) do |item|
50
- next if item == '.' || item == '..'
51
- return File.expand_path(item, fixtures_directory) if item.starts_with?(prefix)
52
- end
53
- nil
54
- end
55
-
56
- def sources_targets_basenames
57
- basenames = Set.new
58
- Dir.foreach(fixtures_directory) do |item|
59
- next if item == '.' || item == '..'
60
- b = self.class.source_target_basename(item)
61
- basenames << b if b.present?
62
- end
63
- basenames
64
- end
65
- end
66
- end
67
- end
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'source_target_fixtures'
4
-
5
- RSpec.shared_examples 'source_target_fixtures' do |spec_file| # rubocop:disable Metrics/BlockLength
6
- let(:spec_file) { spec_file }
7
-
8
- it 'fixtures directory should exist' do
9
- expect(::File.directory?(fixtures_dir)).to be true
10
- end
11
-
12
- context 'in fixtures directory' do
13
- it 'should have at least one file' do
14
- expect(source_target_fixtures.source_target_files.count).to be > 0
15
- end
16
-
17
- if ENV['WRITE_TARGET_FIXTURES']
18
- it 'should write target data for all files' do
19
- source_target_fixtures.source_files.each do |source_file|
20
- sd = sort_results(source_data(source_file))
21
- basename = ::Aranha::Spec::SourceTargetFixtures.source_target_basename(source_file)
22
- target_file = File.expand_path("../#{basename}.target.yaml", source_file)
23
- File.write(target_file, sd.to_yaml)
24
- end
25
- end
26
- else
27
- it 'should parse data for all files' do
28
- source_target_fixtures.source_target_files.each do |st|
29
- assert_source_target_complete(st)
30
- sd = source_data(st.source)
31
- td = YAML.load_file(st.target)
32
- expect(sort_results(sd)).to eq(sort_results(td))
33
- end
34
- end
35
- end
36
- end
37
-
38
- def source_target_fixtures
39
- @source_target_fixtures ||= ::Aranha::Spec::SourceTargetFixtures.new(fixtures_dir)
40
- end
41
-
42
- def assert_source_target_complete(st)
43
- expect(st.source).to(be_truthy, "Source not found (Target: #{st.target})")
44
- expect(st.target).to(be_truthy, "Target not found (Source: #{st.source})")
45
- end
46
-
47
- def source_data(source_file)
48
- described_class.new(source_file).data
49
- end
50
-
51
- def fixtures_dir
52
- ::File.join(
53
- ::File.dirname(spec_file),
54
- ::File.basename(spec_file, '.*') + '_files'
55
- )
56
- end
57
-
58
- def sort_results(r)
59
- r
60
- end
61
- end