aranha-parsers 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +5 -0
- data/lib/aranha/parsers.rb +9 -0
- data/lib/aranha/parsers/base.rb +58 -0
- data/lib/aranha/parsers/html.rb +11 -0
- data/lib/aranha/parsers/html/base.rb +47 -0
- data/lib/aranha/parsers/html/item.rb +23 -0
- data/lib/aranha/parsers/html/item_list.rb +25 -0
- data/lib/aranha/parsers/html/node.rb +11 -0
- data/lib/aranha/parsers/html/node/base.rb +30 -0
- data/lib/aranha/parsers/html/node/default.rb +93 -0
- data/lib/aranha/parsers/invalid_state_exception.rb +8 -0
- data/lib/aranha/parsers/source_address.rb +55 -0
- data/lib/aranha/parsers/source_address/file.rb +31 -0
- data/lib/aranha/parsers/source_address/hash_http_get.rb +25 -0
- data/lib/aranha/parsers/source_address/hash_http_post.rb +45 -0
- data/lib/aranha/parsers/source_address/http_get.rb +61 -0
- data/lib/aranha/parsers/spec/source_target_fixtures.rb +67 -0
- data/lib/aranha/parsers/spec/source_target_fixtures_example.rb +61 -0
- data/lib/aranha/parsers/version.rb +5 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5685f5cd07ae6f7ff6adb9f03c7ebd286a0279f508157706460617feb377937e
|
4
|
+
data.tar.gz: 3e02eb80a205bd5b26397728d6a680a3ac4bf65afdb9cd45a3e831e75b5193e2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 468f57861a6182901b9aca6361115ae0e9bbdb192e91f9c10d34b99e4af11338b3f727a17734a70c85493495ddde7f633078560d3f52e0b4677a013fc58cd234
|
7
|
+
data.tar.gz: 56fafe12e79122cc78dafc92d8bde7721b3f1daad69ca959fc36c58e7c314af326f8cf017516ce102fb3109a1c652c86102a2d37859192849f39b104ba3037e9
|
data/lib/aranha/parsers/base.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'aranha/parsers/source_address'
|
6
|
+
|
7
|
+
module Aranha
  module Parsers
    # Base class for parsers. Wraps the given source in a
    # ::Aranha::Parsers::SourceAddress, exposes its content and, when a log
    # directory is available, writes both the serialized address and the
    # fetched content to log files.
    class Base
      # Environment variable that, when set to a non-empty value, selects the
      # directory where log files are written.
      LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'

      attr_reader :source_address

      # @param url [Object] any source accepted by
      #   ::Aranha::Parsers::SourceAddress.new
      def initialize(url)
        @source_address = ::Aranha::Parsers::SourceAddress.new(url)
        log_content(source_address.serialize, '-source-address')
      end

      delegate :url, to: :source_address

      # Fetches the content of the source address and logs a copy of it.
      #
      # @return [Object] whatever the source address returns as content
      def content
        s = source_address.content
        log_content(s)
        s
      end

      private

      # Writes +content+ to this parser's log file; does nothing when no log
      # directory is configured.
      def log_content(content, suffix = '')
        path = log_file(suffix)
        return unless path

        ::File.open(path, 'wb') { |file| file.write(content) }
      end

      # Builds the log file path for this parser class, creating its parent
      # directory. Returns nil when logging is disabled.
      def log_file(suffix)
        dir = log_parsers_dir
        return nil unless dir

        # String#parameterize is provided by ActiveSupport's string inflections.
        f = ::File.join(dir, "#{self.class.name.parameterize}#{suffix}.log")
        FileUtils.mkdir_p(::File.dirname(f))
        f
      end

      # Resolves the log directory: the environment variable wins, then
      # "<Rails.root>/log/parsers" when Rails is available, otherwise nil.
      def log_parsers_dir
        env_dir = ENV[LOG_DIR_ENVVAR]
        # An empty variable is treated as unset; otherwise the log files would
        # be created relative to the filesystem root.
        return env_dir unless env_dir.nil? || env_dir.strip.empty?
        return ::Rails.root.join('log', 'parsers') if rails_root_exist?

        nil
      end

      # True only when Rails is loaded AND has a root. Rails.root can be nil,
      # in which case calling #join on it would raise.
      def rails_root_exist?
        !::Rails.root.nil?
      rescue NameError
        false
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'aranha/parsers/base'
|
5
|
+
require 'aranha/parsers/html/node/default'
|
6
|
+
|
7
|
+
module Aranha
  module Parsers
    module Html
      # Base for HTML parsers. Subclasses declare fields with the class-level
      # +field+ macro; instances expose the parsed Nokogiri document and a node
      # parser built from the declared fields.
      class Base < ::Aranha::Parsers::Base
        class << self
          # One declared field: its name, its parser type and its XPath.
          Field = Struct.new(:name, :type, :xpath)

          # @return [Array<Field>] a copy of the fields declared on this class
          def fields
            (@fields ||= []).dup
          end

          # Declares a field on this class.
          def field(name, type, xpath)
            (@fields ||= []) << Field.new(name, type, xpath)
          end
        end

        # @return [Object] the document built by Nokogiri::HTML from #content,
        #   memoized
        def nokogiri
          @nokogiri ||= Nokogiri::HTML(content) { |config| config.noblanks }
        end

        protected

        # Class used to build the node parser.
        def node_parser_class
          ::Aranha::Parsers::Html::Node::Default
        end

        private

        def node_parser
          @node_parser ||= node_parser_class.new(fields)
        end

        # Declared fields as [name, type, xpath] triples.
        def fields
          self.class.fields.map(&:to_a)
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/base'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    module Html
      # HTML parser for a single item: parses the declared fields against one
      # node of the document.
      class Item < Base
        # @return [Object] the result of the node parser for #item_node, memoized
        def data
          @data ||= node_parser.parse(item_node)
        end

        # Node the fields are evaluated against: the first match of
        # #item_xpath, or the whole document when #item_xpath is nil.
        #
        # @raise [RuntimeError] if #item_xpath matches nothing
        def item_node
          @item_node ||= begin
            r = item_xpath ? nokogiri.at_xpath(item_xpath) : nokogiri
            raise "Item node not found (Item xpath: #{item_xpath})" unless r

            r
          end
        end

        # XPath of the item node. #item_node already handles a nil value by
        # using the whole document, so nil is the default; subclasses override
        # this to restrict parsing to a sub-node.
        #
        # @return [String, nil]
        def item_xpath
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/base'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    module Html
      # HTML parser for a list of items: parses the declared fields against
      # every node matched by #items_xpath.
      class ItemList < Base
        # @return [Array] one node-parser result per node matched by #items_xpath
        # @raise [StandardError] any failure, re-raised with the number of items
        #   visited so far appended to the message
        def data
          count = 0
          @data ||= nokogiri.xpath(items_xpath).map do |m|
            count += 1
            node_parser.parse(m)
          end
        rescue StandardError => e
          raise StandardError, "#{e.message} (Count: #{count})"
        end

        # XPath selecting the item nodes. Must be implemented by subclasses.
        #
        # @raise [RuntimeError] always, in this base implementation
        def items_xpath
          raise "Class #{self.class} has no method \"items_xpath\". Implement it"
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
  module Parsers
    module Html
      module Node
        # Parses a node into a Hash using a list of field definitions. Each
        # field is a triple [name, type, xpath]; the value is produced by the
        # instance method named "<type>_value", called with (node, xpath).
        class Base
          attr_reader :fields

          # @param fields [Array<Array>] triples of [name, type, xpath]
          def initialize(fields)
            @fields = fields
          end

          # @param node [Object] node handed to every "<type>_value" method
          # @return [Hash] field name => parsed value
          # @raise [RuntimeError] if a field type has no "<type>_value" method
          def parse(node)
            fields.map { |f| [f[0], parse_field(node, f[2], f[1])] }.to_h
          end

          private

          # Dispatches to "<parser_method>_value". Private methods are included
          # in the lookup because +send+ can call them as well.
          def parse_field(node, xpath, parser_method)
            value_method = "#{parser_method}_value"
            return send(value_method, node, xpath) if respond_to?(value_method, true)

            raise "Method \"#{value_method}\" not found in #{self.class}"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    module Html
      module Node
        # Default node parser: provides the "<type>_value" extractors for
        # strings, quoted strings, integers, floats, arrays, joined text and
        # durations.
        class Default < ::Aranha::Parsers::Html::Node::Base
          # Text of the first node matching +xpath+, with non-breaking spaces
          # turned into regular spaces and surrounding whitespace stripped.
          #
          # @return [String] the text, or '' when nothing matches
          def string_value(node, xpath)
            found = node.at_xpath(xpath)
            return '' unless found

            found.text.to_s.tr("\u00A0", ' ').strip
          end

          # First double-quoted, non-empty fragment of the string value.
          #
          # @return [String] the fragment without quotes, or '' when absent
          def quoted_value(node, xpath)
            m = /\"([^\"]+)\"/.match(string_value(node, xpath))
            m ? m[1] : ''
          end

          # First run of digits in the string value.
          #
          # @return [Integer, nil] nil when the string value is blank
          # @raise [RuntimeError] if the value is not blank but has no digits
          def integer_value(node, xpath)
            r = string_value(node, xpath)
            return nil if r.blank?

            m = /\d+/.match(r)
            raise "Integer not found in \"#{r}\"" unless m

            m[0].to_i
          end

          # Like #integer_value, but returns nil instead of raising when no
          # digits are found.
          #
          # @return [Integer, nil]
          def integer_optional_value(node, xpath)
            r = string_value(node, xpath)
            m = /\d+/.match(r)
            m ? m[0].to_i : nil
          end

          # @return [Float]
          # @raise [RuntimeError] if no number is found
          def float_value(node, xpath)
            parse_float(node, xpath, true)
          end

          # @return [Float, nil] nil when no number is found
          def float_optional_value(node, xpath)
            parse_float(node, xpath, false)
          end

          # Stripped texts of all nodes matching +xpath+, joined with "|".
          #
          # @return [String]
          def array_value(node, xpath)
            node.xpath(xpath).map { |n| n.text.strip }.join('|')
          end

          # Stripped texts of all nodes matching +xpath+, concatenated.
          # Built with map/join: appending to a '' literal would raise under
          # "frozen_string_literal: true".
          #
          # @return [String]
          def join_value(node, xpath)
            node.xpath(xpath).map { |n| n.text.strip }.join
          end

          # Integer that precedes " m" in the joined text.
          #
          # @return [Integer, nil] nil when the pattern is not found
          def duration_value(node, xpath)
            m = /(\d+) m/.match(join_value(node, xpath))
            m ? m[1].to_i : nil
          end

          # Matches +pattern+ against the string value. The method name is
          # misspelled; it is kept as is because it is part of the public API.
          #
          # @return [MatchData]
          # @raise [RuntimeError] if the pattern does not match
          def regxep(node, xpath, pattern)
            s = string_value(node, xpath)
            m = pattern.match(s)
            return m if m

            raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
          end

          private

          # Finds the first number (digits with an optional "." or "," decimal
          # part) in the string value and converts it to Float.
          #
          # @return [Float, nil] nil when nothing is found and +required+ is false
          # @raise [RuntimeError] if nothing is found and +required+ is true
          def parse_float(node, xpath, required)
            s = string_value(node, xpath)
            m = /\d+(?:[\.\,](\d+))?/.match(s)
            if m
              m[0].sub(',', '.').to_f
            elsif required
              raise "Float value not found in \"#{s}\""
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'active_support/core_ext/module/delegation'
|
5
|
+
require 'aranha/parsers/source_address/hash_http_get'
|
6
|
+
require 'aranha/parsers/source_address/hash_http_post'
|
7
|
+
require 'aranha/parsers/source_address/http_get'
|
8
|
+
require 'aranha/parsers/source_address/file'
|
9
|
+
|
10
|
+
module Aranha
  module Parsers
    # Facade over the concrete source address kinds. Picks the first kind
    # that accepts the given source and forwards content/url to it.
    class SourceAddress
      class << self
        # Candidate kinds, tried in this order.
        SUBS = [
          ::Aranha::Parsers::SourceAddress::HashHttpGet,
          ::Aranha::Parsers::SourceAddress::HashHttpPost,
          ::Aranha::Parsers::SourceAddress::HttpGet,
          ::Aranha::Parsers::SourceAddress::File
        ].freeze

        # Returns the concrete address for +source+. A SourceAddress instance
        # yields its own sub; anything else is offered to each kind in SUBS.
        #
        # @raise [RuntimeError] if no kind accepts the source
        def detect_sub(source)
          return source.sub if source.is_a?(self)

          sub_class = SUBS.find { |candidate| candidate.valid_source?(source) }
          return sub_class.new(source) if sub_class

          raise "No content fetcher found for source \"#{source}\""
        end

        # Builds an instance from serialized text: text starting with a URL
        # scheme is used as is (stripped), anything else is parsed as YAML.
        def deserialize(string)
          # NOTE(review): YAML.load on externally supplied text can instantiate
          # arbitrary objects on older Psych versions -- confirm inputs are trusted.
          source = if string =~ %r{\A[a-z]+://}
                     string.strip
                   else
                     ::YAML.load(string)
                   end
          new(source)
        end

        # Builds an instance from the serialized text stored in +path+.
        def from_file(path)
          deserialize(::File.read(path))
        end
      end

      attr_reader :sub

      # @param source [Object] a source accepted by one of the kinds in SUBS,
      #   or another SourceAddress
      def initialize(source)
        @sub = self.class.detect_sub(source)
      end

      delegate :content, :url, to: :sub

      # @return [Object] the URL of the underlying address
      def to_s
        sub.url
      end

      # @return [String] the sub's serialization, stripped and terminated by
      #   exactly one newline
      def serialize
        "#{sub.serialize.strip}\n"
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/source_address/http_get'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    class SourceAddress
      # Source address for local files, given either as "file:///path" or as
      # a path starting with "/".
      class File < ::Aranha::Parsers::SourceAddress::HttpGet
        SCHEME = 'file://'

        class << self
          # Accepts sources starting with "file:///" or with "/".
          def valid_source?(source)
            source.to_s.start_with?("#{SCHEME}/", '/')
          end
        end

        # Stores the source with any leading "file://" removed.
        def initialize(source)
          super(source.to_s.sub(/\A#{Regexp.quote(SCHEME)}/, ''))
        end

        # @return [String] the stored path prefixed with "file://"
        def url
          "#{SCHEME}#{source}"
        end

        # @return [String] the content read from the file at the stored path
        def content
          ::File.open(source) { |io| io.read }
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/source_address/hash_http_post'
|
4
|
+
|
5
|
+
module Aranha
  module Parsers
    class SourceAddress
      # Source address described by a Hash whose :method is "get".
      class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpPost
        class << self
          # Accepts Hash sources whose :method, ignoring case and surrounding
          # whitespace, is "get".
          def valid_source?(source)
            return false unless source.is_a?(::Hash)

            source.with_indifferent_access[:method].to_s.downcase.strip == 'get'
          end
        end

        # Fetches the content with HTTPClient#get_content, passing the
        # source's :url and :params.
        def content
          client = HTTPClient.new
          client.get_content(source[:url], source[:params])
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
4
|
+
require 'httpclient'
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
module Aranha
  module Parsers
    class SourceAddress
      # Source address described by a Hash whose :method is "post".
      class HashHttpPost
        class << self
          # Accepts Hash sources whose :method, ignoring case and surrounding
          # whitespace, is "post".
          def valid_source?(source)
            return false unless source.is_a?(::Hash)

            source.with_indifferent_access[:method].to_s.downcase.strip == 'post'
          end
        end

        attr_reader :source

        # @param source [Hash] stored with indifferent access
        def initialize(source)
          @source = source.with_indifferent_access
        end

        # Equal when both the class and the source hash are equal.
        def ==(other)
          return false unless self.class == other.class

          source == other.source
        end

        # @return [Object] the source's :url
        # @raise [KeyError] if the source has no :url
        def url
          source.fetch(:url)
        end

        # @return [String] the source hash as YAML
        def serialize
          source.to_yaml
        end

        # Fetches the content with HTTPClient#post_content, passing the
        # source's :url and its :params merged with follow_redirect: true.
        def content
          client = HTTPClient.new
          client.post_content(source[:url], source[:params].merge(follow_redirect: true))
        end
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable'
|
4
|
+
require 'net/http'
|
5
|
+
|
6
|
+
module Aranha
  module Parsers
    class SourceAddress
      # Source address for plain "http://" / "https://" URLs, fetched with GET
      # and following redirects.
      class HttpGet
        class << self
          # Resolves a redirect +location+ against the URI that produced it.
          #
          # @return [String]
          def location_uri(source_uri, location)
            ::Addressable::URI.join(source_uri, location).to_s
          end

          # @return [Boolean] true when the source starts with "http://" or
          #   "https://"
          def valid_source?(source)
            source.to_s.match?(%r{\Ahttps?://})
          end
        end

        attr_reader :source

        # @param source [#to_s] stored as a String
        def initialize(source)
          @source = source.to_s
        end

        # Equal when both the class and the source string are equal.
        def ==(other)
          self.class == other.class && source == other.source
        end

        # Kept consistent with #== so instances behave correctly as Hash keys.
        def eql?(other)
          self == other
        end

        # Hash code derived from the same values compared by #==.
        def hash
          [self.class, source].hash
        end

        # @return [String] the source itself
        def url
          source
        end

        # @return [String] the body of the response, after following redirects
        def content
          content_fetch(url)
        end

        # @return [String] the URL
        def serialize
          url
        end

        private

        # Performs the GET request, following up to +limit+ redirects.
        #
        # @raise [RuntimeError] if the redirect limit is exhausted or a
        #   redirect carries no Location header
        def content_fetch(uri, limit = 10)
          raise 'too many HTTP redirects' if limit.zero?

          response = Net::HTTP.get_response(URI(uri))

          case response
          when Net::HTTPSuccess
            response.body
          when Net::HTTPRedirection
            location = response['location']
            raise "HTTP redirect without Location header (URI: #{uri})" unless location

            content_fetch(self.class.location_uri(uri, location), limit - 1)
          else
            # Net::HTTPResponse#value raises for non-2xx responses.
            response.value
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'ostruct'
require 'set'
require 'yaml'
|
5
|
+
|
6
|
+
module Aranha
  module Spec
    # Lists pairs of source/target files in a directory. A fixture file is
    # named "<basename>.source[.ext]" or "<basename>.target[.ext]".
    class SourceTargetFixtures
      class << self
        # Extracts the fixture basename from a source or target file name.
        #
        # @param file [String] file name or path
        # @return [String, nil] nil when the name is not a source/target file
        def source_target_basename(file)
          m = /\A(.+)\.(?:source|target)(?:\..+)?\z/.match(File.basename(file))
          m ? m[1] : nil
        end
      end

      attr_reader :fixtures_directory

      # @param fixtures_directory [String] directory holding the fixture files
      def initialize(fixtures_directory)
        @fixtures_directory = fixtures_directory
      end

      # @return [Array<OpenStruct>] one entry per basename, responding to
      #   +source+ and +target+ (each a path, or nil when the file is missing)
      def source_target_files
        sources_targets_basenames.map do |basename|
          OpenStruct.new(source: source_file(basename), target: target_file(basename))
        end
      end

      # @return [Array<String>] paths of all existing source files
      def source_files
        source_target_files.map(&:source).compact
      end

      # @return [String, nil] path of the target file for +basename+
      def target_file(basename)
        fixture_file(basename, 'target')
      end

      # @return [String, nil] path of the source file for +basename+
      def source_file(basename)
        fixture_file(basename, 'source')
      end

      private

      # Absolute path of the first entry named "<basename>.<suffix>...", or nil.
      def fixture_file(basename, suffix)
        prefix = "#{basename}.#{suffix}"
        item = fixture_entries.find { |entry| entry.start_with?(prefix) }
        item && File.expand_path(item, fixtures_directory)
      end

      # Set of distinct basenames found in the directory; blank basenames are
      # skipped.
      def sources_targets_basenames
        fixture_entries.each_with_object(Set.new) do |item, basenames|
          b = self.class.source_target_basename(item)
          basenames << b unless b.nil? || b.match?(/\A[[:space:]]*\z/)
        end
      end

      # Directory entries without "." and "..", sorted so that results do not
      # depend on the order in which the filesystem lists them.
      def fixture_entries
        Dir.entries(fixtures_directory).reject { |entry| %w[. ..].include?(entry) }.sort
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'source_target_fixtures'
|
4
|
+
|
5
|
+
# Shared examples that run the described parser class against every source
# fixture in "<spec file without extension>_files" and compare the result with
# the matching target fixture. When WRITE_TARGET_FIXTURES is set, the target
# fixtures are (re)written instead of being compared.
RSpec.shared_examples 'source_target_fixtures' do |spec_file| # rubocop:disable Metrics/BlockLength
  let(:spec_file) { spec_file }

  it 'fixtures directory should exist' do
    expect(::File.directory?(fixtures_dir)).to be true
  end

  context 'in fixtures directory' do
    it 'should have at least one file' do
      expect(source_target_fixtures.source_target_files.count).to be > 0
    end

    if ENV['WRITE_TARGET_FIXTURES']
      it 'should write target data for all files' do
        source_target_fixtures.source_files.each do |source_file|
          sorted = sort_results(source_data(source_file))
          basename = ::Aranha::Spec::SourceTargetFixtures.source_target_basename(source_file)
          # The target is written next to its source file.
          target_path = File.expand_path("../#{basename}.target.yaml", source_file)
          File.write(target_path, sorted.to_yaml)
        end
      end
    else
      it 'should parse data for all files' do
        source_target_fixtures.source_target_files.each do |st|
          assert_source_target_complete(st)
          actual = source_data(st.source)
          expected = YAML.load_file(st.target)
          expect(sort_results(actual)).to eq(sort_results(expected))
        end
      end
    end
  end

  # Fixture pairs found in the fixtures directory (memoized).
  def source_target_fixtures
    @source_target_fixtures ||= ::Aranha::Spec::SourceTargetFixtures.new(fixtures_dir)
  end

  # Fails when either side of a source/target pair is missing.
  def assert_source_target_complete(st)
    expect(st.source).to(be_truthy, "Source not found (Target: #{st.target})")
    expect(st.target).to(be_truthy, "Target not found (Source: #{st.source})")
  end

  # Data produced by the described class for the given source file.
  def source_data(source_file)
    described_class.new(source_file).data
  end

  # Directory next to the spec file, named after it with a "_files" suffix.
  def fixtures_dir
    ::File.join(::File.dirname(spec_file), "#{::File.basename(spec_file, '.*')}_files")
  end

  # Hook for normalizing results before comparison; returns them unchanged here.
  def sort_results(r)
    r
  end
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aranha-parsers
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Esquilo Azul Company
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-09-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 4.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: addressable
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.7'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.7'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: httpclient
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.8'
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: 2.8.3
|
51
|
+
type: :runtime
|
52
|
+
prerelease: false
|
53
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - "~>"
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '2.8'
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.8.3
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: rspec
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '3.8'
|
68
|
+
type: :development
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '3.8'
|
75
|
+
description:
|
76
|
+
email:
|
77
|
+
executables: []
|
78
|
+
extensions: []
|
79
|
+
extra_rdoc_files: []
|
80
|
+
files:
|
81
|
+
- Gemfile
|
82
|
+
- lib/aranha/parsers.rb
|
83
|
+
- lib/aranha/parsers/base.rb
|
84
|
+
- lib/aranha/parsers/html.rb
|
85
|
+
- lib/aranha/parsers/html/base.rb
|
86
|
+
- lib/aranha/parsers/html/item.rb
|
87
|
+
- lib/aranha/parsers/html/item_list.rb
|
88
|
+
- lib/aranha/parsers/html/node.rb
|
89
|
+
- lib/aranha/parsers/html/node/base.rb
|
90
|
+
- lib/aranha/parsers/html/node/default.rb
|
91
|
+
- lib/aranha/parsers/invalid_state_exception.rb
|
92
|
+
- lib/aranha/parsers/source_address.rb
|
93
|
+
- lib/aranha/parsers/source_address/file.rb
|
94
|
+
- lib/aranha/parsers/source_address/hash_http_get.rb
|
95
|
+
- lib/aranha/parsers/source_address/hash_http_post.rb
|
96
|
+
- lib/aranha/parsers/source_address/http_get.rb
|
97
|
+
- lib/aranha/parsers/spec/source_target_fixtures.rb
|
98
|
+
- lib/aranha/parsers/spec/source_target_fixtures_example.rb
|
99
|
+
- lib/aranha/parsers/version.rb
|
100
|
+
homepage:
|
101
|
+
licenses: []
|
102
|
+
metadata: {}
|
103
|
+
post_install_message:
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 2.7.7
|
120
|
+
signing_key:
|
121
|
+
specification_version: 4
|
122
|
+
summary: Parsers' utilities for Ruby.
|
123
|
+
test_files: []
|