aranha-parsers 0.8.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/base.rb +3 -1
- data/lib/aranha/parsers/html/node/default/numeric_support.rb +70 -0
- data/lib/aranha/parsers/html/node/default/string_support.rb +77 -0
- data/lib/aranha/parsers/html/node/default.rb +2 -94
- data/lib/aranha/parsers/json/base.rb +20 -0
- data/lib/aranha/parsers/json.rb +11 -0
- data/lib/aranha/parsers/rspec/setup.rb +1 -1
- data/lib/aranha/parsers/rspec/{source_target_fixtures_example.rb → shared_examples/source_target_fixtures.rb} +0 -0
- data/lib/aranha/parsers/source_address/fetch_content_error.rb +10 -0
- data/lib/aranha/parsers/source_address/http_get.rb +15 -5
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +42 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6ec27959167dfbae56e1fb00bfdbedad5bfe022a0708e493c623ca259df5c5d
|
4
|
+
data.tar.gz: c87db56022bfdf05f3f5cc99f58fbba4f0e73a5e8ddb7683d2569aabbf7bf28a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9116a522005c0a01b650bc889ce1a4c70f7096b478febc2e1b24f6596a4d94c77e750d81e616bd70b6ca557a1c61e56d55fcfe2066ff4f62aea3a9a14237ed01
|
7
|
+
data.tar.gz: 5080f643d1da251f05afb094a1c7ac9c5cbbe140a15f0d05b77ec87591e700b148ba4766615ee1853284d24674e2720149f25e6858b485e0123ebd3add74c4db
|
data/lib/aranha/parsers/base.rb
CHANGED
@@ -11,7 +11,9 @@ module Aranha
|
|
11
11
|
class << self
|
12
12
|
def from_content(content)
|
13
13
|
::EacRubyUtils::Fs::Temp.on_file do |path|
|
14
|
-
path.
|
14
|
+
::File.open(path.to_s, 'w:UTF-8') do |f|
|
15
|
+
f.write content.force_encoding('UTF-8')
|
16
|
+
end
|
15
17
|
r = new(path.to_path)
|
16
18
|
r.content
|
17
19
|
r
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Html
|
9
|
+
module Node
|
10
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
11
|
+
module NumericSupport
|
12
|
+
def integer_value(node, xpath)
|
13
|
+
r = string_value(node, xpath)
|
14
|
+
return nil if r.blank?
|
15
|
+
|
16
|
+
m = /\d+/.match(r)
|
17
|
+
raise "Integer not found in \"#{r}\"" unless m
|
18
|
+
|
19
|
+
m[0].to_i
|
20
|
+
end
|
21
|
+
|
22
|
+
def integer_optional_value(node, xpath)
|
23
|
+
r = string_value(node, xpath)
|
24
|
+
m = /\d+/.match(r)
|
25
|
+
m ? m[0].to_i : nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def float_value(node, xpath)
|
29
|
+
parse_float(node, xpath, true)
|
30
|
+
end
|
31
|
+
|
32
|
+
def float_optional_value(node, xpath)
|
33
|
+
parse_float(node, xpath, false)
|
34
|
+
end
|
35
|
+
|
36
|
+
def us_decimal_value(node, xpath)
|
37
|
+
parse_us_decimal(node, xpath, true)
|
38
|
+
end
|
39
|
+
|
40
|
+
def us_decimal_optional_value(node, xpath)
|
41
|
+
parse_us_decimal(node, xpath, false)
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def parse_float(node, xpath, required)
|
47
|
+
s = string_value(node, xpath)
|
48
|
+
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
49
|
+
if m
|
50
|
+
m[0].delete('.').tr(',', '.').to_f
|
51
|
+
elsif required
|
52
|
+
raise "Float value not found in \"#{s}\""
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def parse_us_decimal(node, xpath, required)
|
57
|
+
s = string_value(node, xpath)
|
58
|
+
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
59
|
+
if m
|
60
|
+
m[0].delete(',').to_f
|
61
|
+
elsif required
|
62
|
+
raise "US decimal value not found in \"#{s}\""
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Html
|
9
|
+
module Node
|
10
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
11
|
+
module StringSupport
|
12
|
+
def quoted_value(node, xpath)
|
13
|
+
s = string_value(node, xpath)
|
14
|
+
return '' unless s
|
15
|
+
|
16
|
+
m = /\"([^\"]+)\"/.match(s)
|
17
|
+
return m[1] if m
|
18
|
+
|
19
|
+
''
|
20
|
+
end
|
21
|
+
|
22
|
+
def regxep(node, xpath, pattern)
|
23
|
+
s = string_value(node, xpath)
|
24
|
+
m = pattern.match(s)
|
25
|
+
return m if m
|
26
|
+
|
27
|
+
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
28
|
+
end
|
29
|
+
|
30
|
+
def string_value(node, xpath)
|
31
|
+
if node.at_xpath(xpath)
|
32
|
+
sanitize_string(node.at_xpath(xpath).text)
|
33
|
+
else
|
34
|
+
''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def string_recursive_value(node, xpath, required = true)
|
39
|
+
root = node.at_xpath(xpath)
|
40
|
+
if root.blank?
|
41
|
+
return nil unless required
|
42
|
+
|
43
|
+
raise "No node found (Xpath: #{xpath})"
|
44
|
+
end
|
45
|
+
result = string_recursive(root)
|
46
|
+
return result if result.present?
|
47
|
+
return nil unless required
|
48
|
+
|
49
|
+
raise "String blank (Xpath: #{xpath})"
|
50
|
+
end
|
51
|
+
|
52
|
+
def string_recursive_optional_value(node, xpath)
|
53
|
+
string_recursive_value(node, xpath, false)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def sanitize_string(obj)
|
59
|
+
obj.to_s.tr("\u00A0", ' ').strip
|
60
|
+
end
|
61
|
+
|
62
|
+
def string_recursive(node)
|
63
|
+
return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
|
64
|
+
|
65
|
+
s = ''
|
66
|
+
node.children.each do |child|
|
67
|
+
child_s = string_recursive(child)
|
68
|
+
s += ' ' + child_s if child_s.present?
|
69
|
+
end
|
70
|
+
sanitize_string(s)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
@@ -1,71 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
module Parsers
|
7
8
|
module Html
|
8
9
|
module Node
|
9
10
|
class Default < ::Aranha::Parsers::Html::Node::Base
|
10
|
-
|
11
|
-
if node.at_xpath(xpath)
|
12
|
-
sanitize_string(node.at_xpath(xpath).text)
|
13
|
-
else
|
14
|
-
''
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def string_recursive_value(node, xpath, required = true)
|
19
|
-
root = node.at_xpath(xpath)
|
20
|
-
if root.blank?
|
21
|
-
return nil unless required
|
22
|
-
|
23
|
-
raise "No node found (Xpath: #{xpath})"
|
24
|
-
end
|
25
|
-
result = string_recursive(root)
|
26
|
-
return result if result.present?
|
27
|
-
return nil unless required
|
28
|
-
|
29
|
-
raise "String blank (Xpath: #{xpath})"
|
30
|
-
end
|
31
|
-
|
32
|
-
def string_recursive_optional_value(node, xpath)
|
33
|
-
string_recursive_value(node, xpath, false)
|
34
|
-
end
|
35
|
-
|
36
|
-
def quoted_value(node, xpath)
|
37
|
-
s = string_value(node, xpath)
|
38
|
-
return '' unless s
|
39
|
-
|
40
|
-
m = /\"([^\"]+)\"/.match(s)
|
41
|
-
return m[1] if m
|
42
|
-
|
43
|
-
''
|
44
|
-
end
|
45
|
-
|
46
|
-
def integer_value(node, xpath)
|
47
|
-
r = string_value(node, xpath)
|
48
|
-
return nil if r.blank?
|
49
|
-
|
50
|
-
m = /\d+/.match(r)
|
51
|
-
raise "Integer not found in \"#{r}\"" unless m
|
52
|
-
|
53
|
-
m[0].to_i
|
54
|
-
end
|
55
|
-
|
56
|
-
def integer_optional_value(node, xpath)
|
57
|
-
r = string_value(node, xpath)
|
58
|
-
m = /\d+/.match(r)
|
59
|
-
m ? m[0].to_i : nil
|
60
|
-
end
|
61
|
-
|
62
|
-
def float_value(node, xpath)
|
63
|
-
parse_float(node, xpath, true)
|
64
|
-
end
|
65
|
-
|
66
|
-
def float_optional_value(node, xpath)
|
67
|
-
parse_float(node, xpath, false)
|
68
|
-
end
|
11
|
+
require_sub __FILE__, include_modules: true
|
69
12
|
|
70
13
|
def array_value(node, xpath)
|
71
14
|
r = node.xpath(xpath).map { |n| n.text.strip }
|
@@ -84,41 +27,6 @@ module Aranha
|
|
84
27
|
m = /(\d+) m/.match(join_value(node, xpath))
|
85
28
|
m ? m[1].to_i : nil
|
86
29
|
end
|
87
|
-
|
88
|
-
def regxep(node, xpath, pattern)
|
89
|
-
s = string_value(node, xpath)
|
90
|
-
m = pattern.match(s)
|
91
|
-
return m if m
|
92
|
-
|
93
|
-
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
94
|
-
end
|
95
|
-
|
96
|
-
private
|
97
|
-
|
98
|
-
def parse_float(node, xpath, required)
|
99
|
-
s = string_value(node, xpath)
|
100
|
-
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
101
|
-
if m
|
102
|
-
m[0].delete('.').tr(',', '.').to_f
|
103
|
-
elsif required
|
104
|
-
raise "Float value not found in \"#{s}\""
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def sanitize_string(obj)
|
109
|
-
obj.to_s.tr("\u00A0", ' ').strip
|
110
|
-
end
|
111
|
-
|
112
|
-
def string_recursive(node)
|
113
|
-
return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
|
114
|
-
|
115
|
-
s = ''
|
116
|
-
node.children.each do |child|
|
117
|
-
child_s = string_recursive(child)
|
118
|
-
s += ' ' + child_s if child_s.present?
|
119
|
-
end
|
120
|
-
sanitize_string(s)
|
121
|
-
end
|
122
30
|
end
|
123
31
|
end
|
124
32
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/base'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Json
|
9
|
+
class Base < ::Aranha::Parsers::Base
|
10
|
+
def data
|
11
|
+
default_data
|
12
|
+
end
|
13
|
+
|
14
|
+
def default_data
|
15
|
+
::JSON.parse(content)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
File without changes
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
require 'addressable'
|
4
4
|
require 'curb'
|
5
|
+
require 'aranha/parsers/source_address/fetch_content_error'
|
6
|
+
require 'faraday_middleware'
|
5
7
|
|
6
8
|
module Aranha
|
7
9
|
module Parsers
|
@@ -31,13 +33,21 @@ module Aranha
|
|
31
33
|
source
|
32
34
|
end
|
33
35
|
|
36
|
+
def final_url
|
37
|
+
content unless @final_url
|
38
|
+
@final_url
|
39
|
+
end
|
40
|
+
|
34
41
|
def content
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
42
|
+
conn = ::Faraday.new do |f|
|
43
|
+
f.request :retry # retry transient failures
|
44
|
+
f.response :follow_redirects # follow redirects
|
45
|
+
end
|
46
|
+
c = conn.get(url)
|
47
|
+
return c.body if c.status == 200
|
39
48
|
|
40
|
-
raise
|
49
|
+
raise ::Aranha::Parsers::SourceAddress::FetchContentError,
|
50
|
+
"Get #{url} returned #{c.status.to_i}"
|
41
51
|
end
|
42
52
|
|
43
53
|
def serialize
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.3
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.74'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: faraday_middleware
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: httpclient
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,6 +100,26 @@ dependencies:
|
|
86
100
|
- - ">="
|
87
101
|
- !ruby/object:Gem::Version
|
88
102
|
version: 2.8.3
|
103
|
+
- !ruby/object:Gem::Dependency
|
104
|
+
name: nokogiri
|
105
|
+
requirement: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '1.12'
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: 1.12.4
|
113
|
+
type: :runtime
|
114
|
+
prerelease: false
|
115
|
+
version_requirements: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '1.12'
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 1.12.4
|
89
123
|
- !ruby/object:Gem::Dependency
|
90
124
|
name: ofx-parser
|
91
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,13 +170,18 @@ files:
|
|
136
170
|
- lib/aranha/parsers/html/node.rb
|
137
171
|
- lib/aranha/parsers/html/node/base.rb
|
138
172
|
- lib/aranha/parsers/html/node/default.rb
|
173
|
+
- lib/aranha/parsers/html/node/default/numeric_support.rb
|
174
|
+
- lib/aranha/parsers/html/node/default/string_support.rb
|
139
175
|
- lib/aranha/parsers/invalid_state_exception.rb
|
176
|
+
- lib/aranha/parsers/json.rb
|
177
|
+
- lib/aranha/parsers/json/base.rb
|
140
178
|
- lib/aranha/parsers/patches.rb
|
141
179
|
- lib/aranha/parsers/patches/ofx_parser.rb
|
142
180
|
- lib/aranha/parsers/rspec.rb
|
143
181
|
- lib/aranha/parsers/rspec/setup.rb
|
144
|
-
- lib/aranha/parsers/rspec/source_target_fixtures_example.rb
|
182
|
+
- lib/aranha/parsers/rspec/shared_examples/source_target_fixtures.rb
|
145
183
|
- lib/aranha/parsers/source_address.rb
|
184
|
+
- lib/aranha/parsers/source_address/fetch_content_error.rb
|
146
185
|
- lib/aranha/parsers/source_address/file.rb
|
147
186
|
- lib/aranha/parsers/source_address/hash_http_get.rb
|
148
187
|
- lib/aranha/parsers/source_address/hash_http_post.rb
|