aranha-parsers 0.8.5 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/base.rb +10 -8
- data/lib/aranha/parsers/html/node/default/numeric_support.rb +70 -0
- data/lib/aranha/parsers/html/node/default/string_support.rb +77 -0
- data/lib/aranha/parsers/html/node/default.rb +2 -94
- data/lib/aranha/parsers/json/base.rb +20 -0
- data/lib/aranha/parsers/json.rb +11 -0
- data/lib/aranha/parsers/source_address/http_get.rb +12 -15
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea4e3ac094b66f5e1b02e6af4f102c752a869e0a234354b0badb3b2d666368c7
|
4
|
+
data.tar.gz: 95de53aed0f9e7157894515f3b3c881c740c6a9d4e2e787d4c5bbb2bb4432294
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 176470a9a8163f44654485f96285c254ecf8d6661f3b435f7af0bc6ccf076c8081cb0554f7b31d77a751f2ce9caa4592454d7b3b5de4cd522a008c2798ecdc7d
|
7
|
+
data.tar.gz: ebc4b32f65dc83d6ba681a0af28babcd8375f4b4c9585985080fc5bcdb5f7bae653979bf21377f10075862c1ab7fa589cca4c1ce0b254b134f1d8d9e7c1f9d3e
|
data/lib/aranha/parsers/base.rb
CHANGED
@@ -11,7 +11,9 @@ module Aranha
|
|
11
11
|
class << self
|
12
12
|
def from_content(content)
|
13
13
|
::EacRubyUtils::Fs::Temp.on_file do |path|
|
14
|
-
path.
|
14
|
+
::File.open(path.to_s, 'w:UTF-8') do |f|
|
15
|
+
f.write content.force_encoding('UTF-8')
|
16
|
+
end
|
15
17
|
r = new(path.to_path)
|
16
18
|
r.content
|
17
19
|
r
|
@@ -35,21 +37,21 @@ module Aranha
|
|
35
37
|
delegate :url, to: :source_address
|
36
38
|
|
37
39
|
def content
|
38
|
-
@content ||=
|
39
|
-
s = source_address.content
|
40
|
-
log_content(s)
|
41
|
-
s
|
42
|
-
end
|
40
|
+
@content ||= log_content(source_address_content)
|
43
41
|
end
|
44
42
|
|
43
|
+
# @return [String]
|
44
|
+
delegate :content, to: :source_address, prefix: true
|
45
|
+
|
45
46
|
private
|
46
47
|
|
48
|
+
# @return [String]
|
47
49
|
def log_content(content, suffix = '')
|
48
50
|
path = log_file(suffix)
|
49
51
|
|
50
|
-
|
52
|
+
File.open(path, 'wb') { |file| file.write(content) } if path
|
51
53
|
|
52
|
-
|
54
|
+
content
|
53
55
|
end
|
54
56
|
|
55
57
|
def log_file(suffix)
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Html
|
9
|
+
module Node
|
10
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
11
|
+
module NumericSupport
|
12
|
+
def integer_value(node, xpath)
|
13
|
+
r = string_value(node, xpath)
|
14
|
+
return nil if r.blank?
|
15
|
+
|
16
|
+
m = /\d+/.match(r)
|
17
|
+
raise "Integer not found in \"#{r}\"" unless m
|
18
|
+
|
19
|
+
m[0].to_i
|
20
|
+
end
|
21
|
+
|
22
|
+
def integer_optional_value(node, xpath)
|
23
|
+
r = string_value(node, xpath)
|
24
|
+
m = /\d+/.match(r)
|
25
|
+
m ? m[0].to_i : nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def float_value(node, xpath)
|
29
|
+
parse_float(node, xpath, true)
|
30
|
+
end
|
31
|
+
|
32
|
+
def float_optional_value(node, xpath)
|
33
|
+
parse_float(node, xpath, false)
|
34
|
+
end
|
35
|
+
|
36
|
+
def us_decimal_value(node, xpath)
|
37
|
+
parse_us_decimal(node, xpath, true)
|
38
|
+
end
|
39
|
+
|
40
|
+
def us_decimal_optional_value(node, xpath)
|
41
|
+
parse_us_decimal(node, xpath, false)
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def parse_float(node, xpath, required)
|
47
|
+
s = string_value(node, xpath)
|
48
|
+
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
49
|
+
if m
|
50
|
+
m[0].delete('.').tr(',', '.').to_f
|
51
|
+
elsif required
|
52
|
+
raise "Float value not found in \"#{s}\""
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def parse_us_decimal(node, xpath, required)
|
57
|
+
s = string_value(node, xpath)
|
58
|
+
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
59
|
+
if m
|
60
|
+
m[0].delete(',').to_f
|
61
|
+
elsif required
|
62
|
+
raise "US decimal value not found in \"#{s}\""
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Html
|
9
|
+
module Node
|
10
|
+
class Default < ::Aranha::Parsers::Html::Node::Base
|
11
|
+
module StringSupport
|
12
|
+
def quoted_value(node, xpath)
|
13
|
+
s = string_value(node, xpath)
|
14
|
+
return '' unless s
|
15
|
+
|
16
|
+
m = /\"([^\"]+)\"/.match(s)
|
17
|
+
return m[1] if m
|
18
|
+
|
19
|
+
''
|
20
|
+
end
|
21
|
+
|
22
|
+
def regxep(node, xpath, pattern)
|
23
|
+
s = string_value(node, xpath)
|
24
|
+
m = pattern.match(s)
|
25
|
+
return m if m
|
26
|
+
|
27
|
+
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
28
|
+
end
|
29
|
+
|
30
|
+
def string_value(node, xpath)
|
31
|
+
if node.at_xpath(xpath)
|
32
|
+
sanitize_string(node.at_xpath(xpath).text)
|
33
|
+
else
|
34
|
+
''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def string_recursive_value(node, xpath, required = true)
|
39
|
+
root = node.at_xpath(xpath)
|
40
|
+
if root.blank?
|
41
|
+
return nil unless required
|
42
|
+
|
43
|
+
raise "No node found (Xpath: #{xpath})"
|
44
|
+
end
|
45
|
+
result = string_recursive(root)
|
46
|
+
return result if result.present?
|
47
|
+
return nil unless required
|
48
|
+
|
49
|
+
raise "String blank (Xpath: #{xpath})"
|
50
|
+
end
|
51
|
+
|
52
|
+
def string_recursive_optional_value(node, xpath)
|
53
|
+
string_recursive_value(node, xpath, false)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def sanitize_string(obj)
|
59
|
+
obj.to_s.tr("\u00A0", ' ').strip
|
60
|
+
end
|
61
|
+
|
62
|
+
def string_recursive(node)
|
63
|
+
return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
|
64
|
+
|
65
|
+
s = ''
|
66
|
+
node.children.each do |child|
|
67
|
+
child_s = string_recursive(child)
|
68
|
+
s += ' ' + child_s if child_s.present?
|
69
|
+
end
|
70
|
+
sanitize_string(s)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
@@ -1,71 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/html/node/base'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
module Parsers
|
7
8
|
module Html
|
8
9
|
module Node
|
9
10
|
class Default < ::Aranha::Parsers::Html::Node::Base
|
10
|
-
|
11
|
-
if node.at_xpath(xpath)
|
12
|
-
sanitize_string(node.at_xpath(xpath).text)
|
13
|
-
else
|
14
|
-
''
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def string_recursive_value(node, xpath, required = true)
|
19
|
-
root = node.at_xpath(xpath)
|
20
|
-
if root.blank?
|
21
|
-
return nil unless required
|
22
|
-
|
23
|
-
raise "No node found (Xpath: #{xpath})"
|
24
|
-
end
|
25
|
-
result = string_recursive(root)
|
26
|
-
return result if result.present?
|
27
|
-
return nil unless required
|
28
|
-
|
29
|
-
raise "String blank (Xpath: #{xpath})"
|
30
|
-
end
|
31
|
-
|
32
|
-
def string_recursive_optional_value(node, xpath)
|
33
|
-
string_recursive_value(node, xpath, false)
|
34
|
-
end
|
35
|
-
|
36
|
-
def quoted_value(node, xpath)
|
37
|
-
s = string_value(node, xpath)
|
38
|
-
return '' unless s
|
39
|
-
|
40
|
-
m = /\"([^\"]+)\"/.match(s)
|
41
|
-
return m[1] if m
|
42
|
-
|
43
|
-
''
|
44
|
-
end
|
45
|
-
|
46
|
-
def integer_value(node, xpath)
|
47
|
-
r = string_value(node, xpath)
|
48
|
-
return nil if r.blank?
|
49
|
-
|
50
|
-
m = /\d+/.match(r)
|
51
|
-
raise "Integer not found in \"#{r}\"" unless m
|
52
|
-
|
53
|
-
m[0].to_i
|
54
|
-
end
|
55
|
-
|
56
|
-
def integer_optional_value(node, xpath)
|
57
|
-
r = string_value(node, xpath)
|
58
|
-
m = /\d+/.match(r)
|
59
|
-
m ? m[0].to_i : nil
|
60
|
-
end
|
61
|
-
|
62
|
-
def float_value(node, xpath)
|
63
|
-
parse_float(node, xpath, true)
|
64
|
-
end
|
65
|
-
|
66
|
-
def float_optional_value(node, xpath)
|
67
|
-
parse_float(node, xpath, false)
|
68
|
-
end
|
11
|
+
require_sub __FILE__, include_modules: true
|
69
12
|
|
70
13
|
def array_value(node, xpath)
|
71
14
|
r = node.xpath(xpath).map { |n| n.text.strip }
|
@@ -84,41 +27,6 @@ module Aranha
|
|
84
27
|
m = /(\d+) m/.match(join_value(node, xpath))
|
85
28
|
m ? m[1].to_i : nil
|
86
29
|
end
|
87
|
-
|
88
|
-
def regxep(node, xpath, pattern)
|
89
|
-
s = string_value(node, xpath)
|
90
|
-
m = pattern.match(s)
|
91
|
-
return m if m
|
92
|
-
|
93
|
-
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
94
|
-
end
|
95
|
-
|
96
|
-
private
|
97
|
-
|
98
|
-
def parse_float(node, xpath, required)
|
99
|
-
s = string_value(node, xpath)
|
100
|
-
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
101
|
-
if m
|
102
|
-
m[0].delete('.').tr(',', '.').to_f
|
103
|
-
elsif required
|
104
|
-
raise "Float value not found in \"#{s}\""
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def sanitize_string(obj)
|
109
|
-
obj.to_s.tr("\u00A0", ' ').strip
|
110
|
-
end
|
111
|
-
|
112
|
-
def string_recursive(node)
|
113
|
-
return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
|
114
|
-
|
115
|
-
s = ''
|
116
|
-
node.children.each do |child|
|
117
|
-
child_s = string_recursive(child)
|
118
|
-
s += ' ' + child_s if child_s.present?
|
119
|
-
end
|
120
|
-
sanitize_string(s)
|
121
|
-
end
|
122
30
|
end
|
123
31
|
end
|
124
32
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'aranha/parsers/base'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Aranha
|
7
|
+
module Parsers
|
8
|
+
module Json
|
9
|
+
class Base < ::Aranha::Parsers::Base
|
10
|
+
def data
|
11
|
+
default_data
|
12
|
+
end
|
13
|
+
|
14
|
+
def default_data
|
15
|
+
::JSON.parse(content)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -3,6 +3,7 @@
|
|
3
3
|
require 'addressable'
|
4
4
|
require 'curb'
|
5
5
|
require 'aranha/parsers/source_address/fetch_content_error'
|
6
|
+
require 'faraday_middleware'
|
6
7
|
|
7
8
|
module Aranha
|
8
9
|
module Parsers
|
@@ -32,11 +33,18 @@ module Aranha
|
|
32
33
|
source
|
33
34
|
end
|
34
35
|
|
36
|
+
def final_url
|
37
|
+
content unless @final_url
|
38
|
+
@final_url
|
39
|
+
end
|
40
|
+
|
35
41
|
def content
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
42
|
+
conn = ::Faraday.new do |f|
|
43
|
+
f.request :retry # retry transient failures
|
44
|
+
f.response :follow_redirects # follow redirects
|
45
|
+
end
|
46
|
+
c = conn.get(url)
|
47
|
+
return c.body if c.status == 200
|
40
48
|
|
41
49
|
raise ::Aranha::Parsers::SourceAddress::FetchContentError,
|
42
50
|
"Get #{url} returned #{c.status.to_i}"
|
@@ -45,17 +53,6 @@ module Aranha
|
|
45
53
|
def serialize
|
46
54
|
url
|
47
55
|
end
|
48
|
-
|
49
|
-
private
|
50
|
-
|
51
|
-
def curl_perform(curl)
|
52
|
-
unless curl.perform
|
53
|
-
raise(::Aranha::Parsers::SourceAddress::FetchContentError,
|
54
|
-
"Curl perform failed (URL: #{url})")
|
55
|
-
end
|
56
|
-
rescue Curl::Err::CurlError => e
|
57
|
-
raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
|
58
|
-
end
|
59
56
|
end
|
60
57
|
end
|
61
58
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.5
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.74'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: faraday_middleware
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: httpclient
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -156,7 +170,11 @@ files:
|
|
156
170
|
- lib/aranha/parsers/html/node.rb
|
157
171
|
- lib/aranha/parsers/html/node/base.rb
|
158
172
|
- lib/aranha/parsers/html/node/default.rb
|
173
|
+
- lib/aranha/parsers/html/node/default/numeric_support.rb
|
174
|
+
- lib/aranha/parsers/html/node/default/string_support.rb
|
159
175
|
- lib/aranha/parsers/invalid_state_exception.rb
|
176
|
+
- lib/aranha/parsers/json.rb
|
177
|
+
- lib/aranha/parsers/json/base.rb
|
160
178
|
- lib/aranha/parsers/patches.rb
|
161
179
|
- lib/aranha/parsers/patches/ofx_parser.rb
|
162
180
|
- lib/aranha/parsers/rspec.rb
|