aranha-parsers 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +5 -0
- data/lib/aranha/parsers.rb +9 -0
- data/lib/aranha/parsers/base.rb +58 -0
- data/lib/aranha/parsers/html.rb +11 -0
- data/lib/aranha/parsers/html/base.rb +47 -0
- data/lib/aranha/parsers/html/item.rb +23 -0
- data/lib/aranha/parsers/html/item_list.rb +25 -0
- data/lib/aranha/parsers/html/node.rb +11 -0
- data/lib/aranha/parsers/html/node/base.rb +30 -0
- data/lib/aranha/parsers/html/node/default.rb +93 -0
- data/lib/aranha/parsers/invalid_state_exception.rb +8 -0
- data/lib/aranha/parsers/source_address.rb +55 -0
- data/lib/aranha/parsers/source_address/file.rb +31 -0
- data/lib/aranha/parsers/source_address/hash_http_get.rb +25 -0
- data/lib/aranha/parsers/source_address/hash_http_post.rb +45 -0
- data/lib/aranha/parsers/source_address/http_get.rb +61 -0
- data/lib/aranha/parsers/spec/source_target_fixtures.rb +67 -0
- data/lib/aranha/parsers/spec/source_target_fixtures_example.rb +61 -0
- data/lib/aranha/parsers/version.rb +5 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5685f5cd07ae6f7ff6adb9f03c7ebd286a0279f508157706460617feb377937e
|
4
|
+
data.tar.gz: 3e02eb80a205bd5b26397728d6a680a3ac4bf65afdb9cd45a3e831e75b5193e2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 468f57861a6182901b9aca6361115ae0e9bbdb192e91f9c10d34b99e4af11338b3f727a17734a70c85493495ddde7f633078560d3f52e0b4677a013fc58cd234
|
7
|
+
data.tar.gz: 56fafe12e79122cc78dafc92d8bde7721b3f1daad69ca959fc36c58e7c314af326f8cf017516ce102fb3109a1c652c86102a2d37859192849f39b104ba3037e9
|
data/Gemfile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'aranha/parsers/source_address'
|
6
|
+
|
7
|
+
module Aranha
|
8
|
+
module Parsers
|
9
|
+
class Base
|
10
|
+
LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
|
11
|
+
|
12
|
+
attr_reader :source_address
|
13
|
+
|
14
|
+
# Wraps +url+ in a source address and logs that address in serialized form.
#
# @param url the source handed to ::Aranha::Parsers::SourceAddress.new
def initialize(url)
  @source_address = ::Aranha::Parsers::SourceAddress.new(url)
  serialized_address = source_address.serialize
  log_content(serialized_address, '-source-address')
end
|
18
|
+
|
19
|
+
delegate :url, to: :source_address
|
20
|
+
|
21
|
+
# Reads the content from the source address, logs it, and returns it.
#
# @return the value returned by +source_address.content+
def content
  source_address.content.tap { |fetched| log_content(fetched) }
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# Writes +content+ (binary mode) to the log file selected by +suffix+.
# Nothing is written when +log_file+ returns nil.
#
# @param content the data to write
# @param suffix [String] appended to the log file base name
# @return [Integer, nil] bytes written, or nil when logging is disabled
def log_content(content, suffix = '')
  target = log_file(suffix)
  ::File.binwrite(target, content) if target
end
|
35
|
+
|
36
|
+
# Builds the log file path for this parser class and makes sure its
# directory exists.
#
# @param suffix [String] appended to the parameterized class name
# @return [String, nil] the log path, or nil when no log directory is set
def log_file(suffix)
  base_dir = log_parsers_dir
  return nil unless base_dir

  file_name = "#{self.class.name.parameterize}#{suffix}.log"
  ::File.join(base_dir, file_name).tap do |log_path|
    FileUtils.mkdir_p(File.dirname(log_path))
  end
end
|
43
|
+
|
44
|
+
# Picks the directory that receives parser logs: the environment variable
# named by LOG_DIR_ENVVAR wins, then <Rails.root>/log/parsers when a Rails
# root is available, otherwise nil.
def log_parsers_dir
  from_env = ENV[LOG_DIR_ENVVAR]
  return from_env if from_env

  rails_root_exist? ? ::Rails.root.join('log', 'parsers') : nil
end
|
49
|
+
|
50
|
+
# Tells whether a usable Rails root is available.
#
# Returns false when the Rails constant is missing, when it has no +root+
# method (NoMethodError is a NameError), and also when +Rails.root+ is nil,
# so callers can safely call +::Rails.root.join+ after a true result.
#
# @return [Boolean]
def rails_root_exist?
  !::Rails.root.nil?
rescue NameError
  false
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'aranha/parsers/base'
|
5
|
+
require 'aranha/parsers/html/node/default'
|
6
|
+
|
7
|
+
module Aranha
|
8
|
+
module Parsers
|
9
|
+
module Html
|
10
|
+
# HTML parser base: subclasses declare fields (name, type, xpath) at class
# level; instances expose the parsed Nokogiri document and a node parser
# built from those fields.
class Base < ::Aranha::Parsers::Base
  class << self
    # One declared field.
    Field = Struct.new(:name, :type, :xpath)

    # @return [Array<Field>] a copy of the fields declared on this class
    def fields
      (@fields ||= []).dup
    end

    # Declares a field on this class.
    #
    # @param name field key in the parsed result
    # @param type prefix of the node parser's "<type>_value" method
    # @param xpath XPath expression used to locate the value
    def field(name, type, xpath)
      (@fields ||= []) << Field.new(name, type, xpath)
    end
  end

  # @return the content parsed by Nokogiri::HTML (memoized)
  def nokogiri
    @nokogiri ||= Nokogiri::HTML(content, &:noblanks)
  end

  protected

  # Class used to parse nodes; subclasses may override.
  def node_parser_class
    ::Aranha::Parsers::Html::Node::Default
  end

  private

  # Memoized node parser built from this parser's fields.
  def node_parser
    @node_parser ||= node_parser_class.new(fields)
  end

  # Declared fields as [name, type, xpath] triples.
  def fields
    self.class.fields.map(&:to_a)
  end
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/base'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Html
|
8
|
+
# Parser for a document holding a single item.
class Item < Base
  # @return the result of parsing +item_node+ with the node parser (memoized)
  def data
    return @data if @data

    @data = node_parser.parse(item_node)
  end

  # Node the fields are read from: the node matched by +item_xpath+, or the
  # whole document when +item_xpath+ is falsy.
  #
  # @raise [RuntimeError] when +item_xpath+ matches nothing
  def item_node
    return @item_node if @item_node

    found = item_xpath ? nokogiri.at_xpath(item_xpath) : nokogiri
    raise "Item node not found (Item xpath: #{item_xpath})" unless found

    @item_node = found
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/base'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Html
|
8
|
+
class ItemList < Base
|
9
|
+
# Parses every node matched by +items_xpath+ (memoized).
#
# @return [Array] one parsed result per matched node
# @raise [StandardError] any failure is re-raised with the number of nodes
#   started so far appended to the message
def data
  started = 0
  @data ||= begin
    matched_nodes = nokogiri.xpath(items_xpath)
    matched_nodes.map do |item|
      started += 1
      node_parser.parse(item)
    end
  end
rescue StandardError => e
  raise StandardError, "#{e.message} (Count: #{started})"
end
|
18
|
+
|
19
|
+
# Placeholder for the XPath that selects the item nodes; this default always
# raises, so concrete list parsers need to provide their own.
#
# @raise [RuntimeError] always
def items_xpath
  raise "Class #{self.class} has no method \"items_xpath\". Implement it"
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Html
|
6
|
+
module Node
|
7
|
+
# Turns a node into a hash by running each declared field through a
# "<type>_value" method provided by a subclass.
class Base
  attr_reader :fields

  # @param fields list of [name, type, xpath] entries
  def initialize(fields)
    @fields = fields
  end

  # @param node the node passed to every value method
  # @return [Hash] field name => parsed value
  def parse(node)
    fields.each_with_object({}) do |field, result|
      result[field[0]] = parse_field(node, field[2], field[1])
    end
  end

  private

  # Dispatches to "<parser_method>_value".
  #
  # @raise [RuntimeError] when this object does not respond to that method
  def parse_field(node, xpath, parser_method)
    value_method = "#{parser_method}_value"
    unless respond_to?(value_method)
      raise "Method \"#{value_method}\" not found in #{self.class}"
    end

    send(value_method, node, xpath)
  end
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Html
|
8
|
+
module Node
|
9
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
10
|
+
# Text of the first node matching +xpath+, with non-breaking spaces turned
# into plain spaces and surrounding whitespace stripped.
#
# The XPath is evaluated once and the result reused.
#
# @return [String] the cleaned text, or '' when nothing matches
def string_value(node, xpath)
  found = node.at_xpath(xpath)
  return '' unless found

  found.text.to_s.tr("\u00A0", ' ').strip
end
|
17
|
+
|
18
|
+
# First non-empty double-quoted fragment of the string value at +xpath+.
#
# @return [String] the text between the quotes, or '' when there is none
def quoted_value(node, xpath)
  text = string_value(node, xpath)
  quoted = text && /\"([^\"]+)\"/.match(text)
  quoted ? quoted[1] : ''
end
|
27
|
+
|
28
|
+
# First run of digits in the string value at +xpath+, as an Integer.
#
# The blank check is done with core Ruby only, so the method does not
# depend on ActiveSupport's +blank?+ being loaded.
#
# @return [Integer, nil] nil when the string value is nil or whitespace-only
# @raise [RuntimeError] when the text is not blank but holds no digits
def integer_value(node, xpath)
  text = string_value(node, xpath)
  return nil if text.nil? || text !~ /[^[:space:]]/

  digits = /\d+/.match(text)
  raise "Integer not found in \"#{text}\"" unless digits

  digits[0].to_i
end
|
37
|
+
|
38
|
+
# Like integer_value, but returns nil instead of raising when the string
# value holds no digits.
#
# @return [Integer, nil]
def integer_optional_value(node, xpath)
  digits = /\d+/.match(string_value(node, xpath))
  digits && digits[0].to_i
end
|
43
|
+
|
44
|
+
# Required float: delegates to parse_float with +required+ set to true.
def float_value(node, xpath)
  must_be_present = true
  parse_float(node, xpath, must_be_present)
end
|
47
|
+
|
48
|
+
# Optional float: delegates to parse_float with +required+ set to false.
def float_optional_value(node, xpath)
  must_be_present = false
  parse_float(node, xpath, must_be_present)
end
|
51
|
+
|
52
|
+
# Stripped texts of every node matching +xpath+, joined with "|".
#
# @return [String]
def array_value(node, xpath)
  node.xpath(xpath).map { |match| match.text.strip }.join('|')
end
|
56
|
+
|
57
|
+
# Stripped texts of every node matching +xpath+, concatenated with no
# separator.
#
# Built with map/join rather than appending to a string literal: under
# "# frozen_string_literal: true" appending to a literal raises FrozenError.
#
# @return [String]
def join_value(node, xpath)
  node.xpath(xpath).map { |match| match.text.strip }.join
end
|
64
|
+
|
65
|
+
# Integer that directly precedes " m" in the joined text at +xpath+.
#
# @return [Integer, nil] nil when the pattern is absent
def duration_value(node, xpath)
  minutes = /(\d+) m/.match(join_value(node, xpath))
  minutes && minutes[1].to_i
end
|
69
|
+
|
70
|
+
# Matches +pattern+ against the string value at +xpath+.
# NOTE(review): the method name looks like a misspelling of "regexp"; it is
# kept because it is part of the public interface.
#
# @return [MatchData]
# @raise [RuntimeError] when the pattern does not match
def regxep(node, xpath, pattern)
  text = string_value(node, xpath)
  pattern.match(text) || raise("Pattern \"#{pattern}\" not found in string \"#{text}\"")
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
# First number in the string value at +xpath+ as a Float; "." or "," is
# accepted as the decimal separator.
#
# @param required when truthy, a missing number raises instead of giving nil
# @return [Float, nil]
# @raise [RuntimeError] when +required+ and no number is found
def parse_float(node, xpath, required)
  text = string_value(node, xpath)
  number = /\d+(?:[\.\,](\d+))?/.match(text)
  return number[0].sub(',', '.').to_f if number

  raise "Float value not found in \"#{text}\"" if required
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'active_support/core_ext/module/delegation'
|
5
|
+
require 'aranha/parsers/source_address/hash_http_get'
|
6
|
+
require 'aranha/parsers/source_address/hash_http_post'
|
7
|
+
require 'aranha/parsers/source_address/http_get'
|
8
|
+
require 'aranha/parsers/source_address/file'
|
9
|
+
|
10
|
+
module Aranha
|
11
|
+
module Parsers
|
12
|
+
# Wraps a source (URL string, file path or request hash) in the first
# fetcher class from SUBS that accepts it, and forwards content/url to it.
class SourceAddress
  class << self
    # Candidate fetcher classes, tried in this order.
    SUBS = [
      ::Aranha::Parsers::SourceAddress::HashHttpGet,
      ::Aranha::Parsers::SourceAddress::HashHttpPost,
      ::Aranha::Parsers::SourceAddress::HttpGet,
      ::Aranha::Parsers::SourceAddress::File
    ].freeze

    # Picks the fetcher for +source+; an existing SourceAddress hands over
    # the fetcher it already holds.
    #
    # @raise [RuntimeError] when no class in SUBS accepts the source
    def detect_sub(source)
      return source.sub if source.is_a?(self)

      fetcher_class = SUBS.find { |candidate| candidate.valid_source?(source) }
      return fetcher_class.new(source) if fetcher_class

      raise "No content fetcher found for source \"#{source}\""
    end

    # Rebuilds an address from serialized text: text starting with a
    # "scheme://" prefix is used as a URL, anything else is read as YAML.
    def deserialize(string)
      # NOTE(review): YAML.load on external text can instantiate arbitrary
      # objects on older Psych versions; confirm callers only pass trusted data.
      source = string =~ %r{\A[a-z]+://} ? string.strip : ::YAML.load(string)
      new(source)
    end

    # Deserializes the content of the file at +path+.
    def from_file(path)
      deserialize(::File.read(path))
    end
  end

  attr_reader :sub

  # @param source anything accepted by detect_sub
  def initialize(source)
    @sub = self.class.detect_sub(source)
  end

  delegate :content, :url, to: :sub

  # @return the fetcher's URL
  def to_s
    sub.url
  end

  # @return [String] the fetcher's serialized form, stripped, ending in one newline
  def serialize
    sub.serialize.strip + "\n"
  end
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/source_address/http_get'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
class SourceAddress
|
8
|
+
# Source address for local files, given as "file:///path" or "/path".
class File < ::Aranha::Parsers::SourceAddress::HttpGet
  SCHEME = 'file://'

  class << self
    # True for sources starting with "file:///" or "/".
    def valid_source?(source)
      text = source.to_s
      text.start_with?(SCHEME + '/') || text.start_with?('/')
    end
  end

  # Stores the path with any leading "file://" removed.
  def initialize(source)
    super(source.to_s.sub(/\A#{Regexp.quote(SCHEME)}/, ''))
  end

  # @return [String] the path prefixed with "file://"
  def url
    "#{SCHEME}#{source}"
  end

  # @return [String] the file's content
  def content
    ::File.read(source)
  end
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/source_address/hash_http_post'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
class SourceAddress
|
8
|
+
# Hash-described source fetched with an HTTP GET.
class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpPost
  class << self
    # True when +source+ is a Hash whose :method entry is "get"
    # (case and surrounding whitespace ignored).
    def valid_source?(source)
      return false unless source.is_a?(::Hash)

      requested_method = source.with_indifferent_access[:method]
      requested_method.to_s.downcase.strip == 'get'
    end
  end

  # Fetches the URL through HTTPClient#get_content, passing the params.
  def content
    client = HTTPClient.new
    client.get_content(source[:url], source[:params])
  end
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
4
|
+
require 'httpclient'
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
module Aranha
|
8
|
+
module Parsers
|
9
|
+
class SourceAddress
|
10
|
+
# Hash-described source fetched with an HTTP POST.
class HashHttpPost
  class << self
    # True when +source+ is a Hash whose :method entry is "post"
    # (case and surrounding whitespace ignored).
    def valid_source?(source)
      return false unless source.is_a?(::Hash)

      requested_method = source.with_indifferent_access[:method]
      requested_method.to_s.downcase.strip == 'post'
    end
  end

  attr_reader :source

  # @param source [Hash] stored with indifferent access
  def initialize(source)
    @source = source.with_indifferent_access
  end

  # Equal when both the class and the source hash match.
  def ==(other)
    same_class = self.class == other.class
    same_class && source == other.source
  end

  # @raise [KeyError] when the source has no :url entry
  def url
    source.fetch(:url)
  end

  # @return [String] the source hash as YAML
  def serialize
    source.to_yaml
  end

  # Posts to the URL through HTTPClient#post_content, with
  # follow_redirect: true merged into the params.
  def content
    request_params = source[:params].merge(follow_redirect: true)
    HTTPClient.new.post_content(source[:url], request_params)
  end
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable'
|
4
|
+
require 'net/http'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
class SourceAddress
|
9
|
+
# Source address for plain http(s) URLs, fetched with Net::HTTP.
class HttpGet
  class << self
    # Resolves a redirect +location+ against the URI that produced it.
    def location_uri(source_uri, location)
      joined = ::Addressable::URI.join(source_uri, location)
      joined.to_s
    end

    # Truthy (match offset) when +source+ starts with http:// or https://.
    def valid_source?(source)
      source.to_s =~ %r{\Ahttps?://}
    end
  end

  attr_reader :source

  # @param source converted to String
  def initialize(source)
    @source = source.to_s
  end

  # Equal when both the class and the source string match.
  def ==(other)
    same_class = self.class == other.class
    same_class && source == other.source
  end

  def url
    source
  end

  # @return [String] the response body for +url+
  def content
    content_fetch(url)
  end

  def serialize
    url
  end

  private

  # GETs +uri+, following redirects while +limit+ allows.
  #
  # @raise [RuntimeError] when +limit+ reaches zero
  # @raise whatever Net::HTTPResponse#value raises for other responses
  def content_fetch(uri, limit = 10)
    raise 'too many HTTP redirects' if limit.zero?

    response = Net::HTTP.get_response(URI(uri))
    return response.body if response.is_a?(Net::HTTPSuccess)
    return response.value unless response.is_a?(Net::HTTPRedirection)

    content_fetch(self.class.location_uri(uri, response['location']), limit - 1)
  end
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'ostruct'
require 'yaml'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Spec
|
8
|
+
# Lists pairs of source/target files in a directory.
|
9
|
+
# Pairs "<name>.source*" and "<name>.target*" files found in one directory.
# Uses only core/stdlib methods (no ActiveSupport helpers) and scans the
# directory in sorted order, so results do not depend on OS listing order.
class SourceTargetFixtures
  class << self
    # Base name shared by a source/target pair.
    #
    # @param file [String] file name or path
    # @return [String, nil] the part before ".source"/".target", or nil when
    #   the file name does not follow that pattern
    def source_target_basename(file)
      m = /^(.+)\.(?:source|target)(?:\..+)?$/.match(File.basename(file))
      m ? m[1] : nil
    end
  end

  attr_reader :fixtures_directory

  # @param fixtures_directory [String] directory that holds the fixtures
  def initialize(fixtures_directory)
    @fixtures_directory = fixtures_directory
  end

  # @return [Array<OpenStruct>] one entry per base name, with +source+ and
  #   +target+ set to absolute paths (nil for a missing side)
  def source_target_files
    sources_targets_basenames.map do |basename|
      OpenStruct.new(source: source_file(basename), target: target_file(basename))
    end
  end

  # @return [Array<String>] the source paths of all pairs that have one
  def source_files
    source_target_files.map(&:source).compact
  end

  # @return [String, nil] absolute path of the target file for +basename+
  def target_file(basename)
    fixture_file(basename, 'target')
  end

  # @return [String, nil] absolute path of the source file for +basename+
  def source_file(basename)
    fixture_file(basename, 'source')
  end

  private

  # Directory entries without "." and "..", sorted by name.
  def fixture_entries
    Dir.entries(fixtures_directory).reject { |item| item == '.' || item == '..' }.sort
  end

  # First entry whose name starts with "<basename>.<suffix>", as an
  # absolute path; nil when there is none.
  def fixture_file(basename, suffix)
    prefix = "#{basename}.#{suffix}"
    found = fixture_entries.find { |item| item.start_with?(prefix) }
    found && File.expand_path(found, fixtures_directory)
  end

  # Distinct base names of all source/target files; nil and
  # whitespace-only names are skipped.
  def sources_targets_basenames
    names = fixture_entries.map { |item| self.class.source_target_basename(item) }
    names.reject { |name| name.nil? || name !~ /[^[:space:]]/ }.uniq
  end
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'source_target_fixtures'
|
4
|
+
|
5
|
+
# Shared examples that run the described parser class against every fixture
# in "<spec file name>_files" next to +spec_file+. With WRITE_TARGET_FIXTURES
# set, target files are (re)generated instead of compared.
RSpec.shared_examples 'source_target_fixtures' do |spec_file| # rubocop:disable Metrics/BlockLength
  let(:spec_file) { spec_file }

  it 'fixtures directory should exist' do
    expect(::File.directory?(fixtures_dir)).to be true
  end

  context 'in fixtures directory' do
    it 'should have at least one file' do
      expect(source_target_fixtures.source_target_files.count).to be > 0
    end

    if ENV['WRITE_TARGET_FIXTURES']
      it 'should write target data for all files' do
        source_target_fixtures.source_files.each do |source_file|
          parsed = sort_results(source_data(source_file))
          basename = ::Aranha::Spec::SourceTargetFixtures.source_target_basename(source_file)
          # The target is written beside the source file.
          target_file = File.expand_path("../#{basename}.target.yaml", source_file)
          File.write(target_file, parsed.to_yaml)
        end
      end
    else
      it 'should parse data for all files' do
        source_target_fixtures.source_target_files.each do |st|
          assert_source_target_complete(st)
          actual = source_data(st.source)
          expected = YAML.load_file(st.target)
          expect(sort_results(actual)).to eq(sort_results(expected))
        end
      end
    end
  end

  # Memoized fixture lister for the fixtures directory.
  def source_target_fixtures
    @source_target_fixtures ||= ::Aranha::Spec::SourceTargetFixtures.new(fixtures_dir)
  end

  # Fails when either side of the pair is missing.
  def assert_source_target_complete(st)
    expect(st.source).to(be_truthy, "Source not found (Target: #{st.target})")
    expect(st.target).to(be_truthy, "Target not found (Source: #{st.source})")
  end

  # Data produced by the described class for +source_file+.
  def source_data(source_file)
    described_class.new(source_file).data
  end

  # Directory named after the spec file, with "_files" appended.
  def fixtures_dir
    spec_directory = ::File.dirname(spec_file)
    ::File.join(spec_directory, ::File.basename(spec_file, '.*') + '_files')
  end

  # Hook for specs that need order-independent comparison; identity here.
  def sort_results(r)
    r
  end
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aranha-parsers
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Esquilo Azul Company
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-09-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 4.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: addressable
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.7'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.7'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: httpclient
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.8'
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: 2.8.3
|
51
|
+
type: :runtime
|
52
|
+
prerelease: false
|
53
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - "~>"
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '2.8'
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.8.3
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: rspec
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '3.8'
|
68
|
+
type: :development
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '3.8'
|
75
|
+
description:
|
76
|
+
email:
|
77
|
+
executables: []
|
78
|
+
extensions: []
|
79
|
+
extra_rdoc_files: []
|
80
|
+
files:
|
81
|
+
- Gemfile
|
82
|
+
- lib/aranha/parsers.rb
|
83
|
+
- lib/aranha/parsers/base.rb
|
84
|
+
- lib/aranha/parsers/html.rb
|
85
|
+
- lib/aranha/parsers/html/base.rb
|
86
|
+
- lib/aranha/parsers/html/item.rb
|
87
|
+
- lib/aranha/parsers/html/item_list.rb
|
88
|
+
- lib/aranha/parsers/html/node.rb
|
89
|
+
- lib/aranha/parsers/html/node/base.rb
|
90
|
+
- lib/aranha/parsers/html/node/default.rb
|
91
|
+
- lib/aranha/parsers/invalid_state_exception.rb
|
92
|
+
- lib/aranha/parsers/source_address.rb
|
93
|
+
- lib/aranha/parsers/source_address/file.rb
|
94
|
+
- lib/aranha/parsers/source_address/hash_http_get.rb
|
95
|
+
- lib/aranha/parsers/source_address/hash_http_post.rb
|
96
|
+
- lib/aranha/parsers/source_address/http_get.rb
|
97
|
+
- lib/aranha/parsers/spec/source_target_fixtures.rb
|
98
|
+
- lib/aranha/parsers/spec/source_target_fixtures_example.rb
|
99
|
+
- lib/aranha/parsers/version.rb
|
100
|
+
homepage:
|
101
|
+
licenses: []
|
102
|
+
metadata: {}
|
103
|
+
post_install_message:
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 2.7.7
|
120
|
+
signing_key:
|
121
|
+
specification_version: 4
|
122
|
+
summary: Parsers' utilities for Ruby.
|
123
|
+
test_files: []
|