aranha-parsers 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/base.rb +10 -5
- data/lib/aranha/parsers/html/base.rb +14 -0
- data/lib/aranha/parsers/html/node/default/string_support.rb +6 -6
- data/lib/aranha/parsers/html/node/default.rb +16 -2
- data/lib/aranha/parsers/source_address/hash_http_base.rb +13 -22
- data/lib/aranha/parsers/source_address/http_get.rb +6 -12
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +14 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb5cd7c64c21a8805a583f01c75efd58268003a8c8cc695ef809f938e79dc3ed
|
4
|
+
data.tar.gz: '078397e90586fe403b39dc8821e30f36efcf86fa55d31058bd94d18a42cb8eb8'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d017b16cb135ad2968fb35e83c086d3e5bdfbc59a8be5281303f26d8fbdbdff6697d9d1f029d949c69bb6341ae7eb25509d924e82a99c26e0391df7322a6a095
|
7
|
+
data.tar.gz: ada16700b30a9456f1140debf88f17020699d27ecc1dc6bcafbac43a1569d3bfe19415109b6f74688214bb94033dcb14ec2fc251fb3ae8060f5a45e62f8902c0
|
data/lib/aranha/parsers/base.rb
CHANGED
@@ -9,20 +9,25 @@ module Aranha
|
|
9
9
|
module Parsers
|
10
10
|
class Base
|
11
11
|
class << self
|
12
|
+
# @deprecated Use {#from_string} instead.
|
13
|
+
# @param content [String]
|
14
|
+
# @return [Aranha::Parsers::Base]
|
12
15
|
def from_content(content)
|
16
|
+
from_string(content)
|
17
|
+
end
|
18
|
+
|
19
|
+
# @param string [String]
|
20
|
+
# @return [Aranha::Parsers::Base]
|
21
|
+
def from_string(string)
|
13
22
|
::EacRubyUtils::Fs::Temp.on_file do |path|
|
14
23
|
::File.open(path.to_s, 'w:UTF-8') do |f|
|
15
|
-
f.write
|
24
|
+
f.write string.dup.force_encoding('UTF-8')
|
16
25
|
end
|
17
26
|
r = new(path.to_path)
|
18
27
|
r.content
|
19
28
|
r
|
20
29
|
end
|
21
30
|
end
|
22
|
-
|
23
|
-
def parse_content(content)
|
24
|
-
from_content(content).data
|
25
|
-
end
|
26
31
|
end
|
27
32
|
|
28
33
|
LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
|
@@ -19,6 +19,20 @@ module Aranha
|
|
19
19
|
@fields << Field.new(name, type, xpath)
|
20
20
|
end
|
21
21
|
|
22
|
+
# @param node [Nokogiri::XML::Node]
|
23
|
+
# @return [Aranha::Parsers::Html::Base]
|
24
|
+
def from_node(node)
|
25
|
+
from_string(node.to_html)
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param haystack [String]
|
29
|
+
# @param needle [String]
|
30
|
+
# @return [String]
|
31
|
+
def xpath_ends_with(haystack, needle)
|
32
|
+
"substring(#{haystack}, string-length(#{haystack}) - string-length(#{needle}) + 1) " \
|
33
|
+
"= #{needle}"
|
34
|
+
end
|
35
|
+
|
22
36
|
Field = Struct.new(:name, :type, :xpath)
|
23
37
|
end
|
24
38
|
|
@@ -27,16 +27,16 @@ module Aranha
|
|
27
27
|
raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
|
28
28
|
end
|
29
29
|
|
30
|
+
# @param node [Nokogiri::XML::Node]
|
31
|
+
# @param xpath [String]
|
32
|
+
# @return [String]
|
30
33
|
def string_value(node, xpath)
|
31
|
-
|
32
|
-
|
33
|
-
else
|
34
|
-
''
|
35
|
-
end
|
34
|
+
found = node_value(node, xpath)
|
35
|
+
found ? sanitize_string(found.text) : ''
|
36
36
|
end
|
37
37
|
|
38
38
|
def string_recursive_value(node, xpath, required = true)
|
39
|
-
root = node
|
39
|
+
root = node_value(node, xpath)
|
40
40
|
if root.blank?
|
41
41
|
return nil unless required
|
42
42
|
|
@@ -11,13 +11,13 @@ module Aranha
|
|
11
11
|
require_sub __FILE__, include_modules: true
|
12
12
|
|
13
13
|
def array_value(node, xpath)
|
14
|
-
r = node
|
14
|
+
r = node_set_value(node, xpath).map { |n| n.text.strip }
|
15
15
|
r.join('|')
|
16
16
|
end
|
17
17
|
|
18
18
|
def join_value(node, xpath)
|
19
19
|
m = ''
|
20
|
-
node
|
20
|
+
node_set_value(node, xpath).each do |n|
|
21
21
|
m << n.text.strip
|
22
22
|
end
|
23
23
|
m
|
@@ -27,6 +27,20 @@ module Aranha
|
|
27
27
|
m = /(\d+) m/.match(join_value(node, xpath))
|
28
28
|
m ? m[1].to_i : nil
|
29
29
|
end
|
30
|
+
|
31
|
+
# @param node [Nokogiri::XML::Node]
|
32
|
+
# @param xpath [String]
|
33
|
+
# @return [Nokogiri::XML::NodeSet]
|
34
|
+
def node_set_value(node, xpath)
|
35
|
+
node.xpath(xpath)
|
36
|
+
end
|
37
|
+
|
38
|
+
# @param node [Nokogiri::XML::Node]
|
39
|
+
# @param xpath [String]
|
40
|
+
# @return [Nokogiri::XML::Node]
|
41
|
+
def node_value(node, xpath)
|
42
|
+
node.at_xpath(xpath)
|
43
|
+
end
|
30
44
|
end
|
31
45
|
end
|
32
46
|
end
|
@@ -2,10 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/fetch_content_error'
|
4
4
|
require 'aranha/parsers/source_address/hash_http_base'
|
5
|
+
require 'eac_envs/http/error'
|
6
|
+
require 'eac_envs/http/request'
|
5
7
|
require 'eac_ruby_utils/core_ext'
|
6
|
-
require 'faraday'
|
7
|
-
require 'faraday/follow_redirects'
|
8
|
-
require 'faraday/gzip'
|
9
8
|
require 'yaml'
|
10
9
|
|
11
10
|
module Aranha
|
@@ -55,21 +54,11 @@ module Aranha
|
|
55
54
|
source.to_yaml
|
56
55
|
end
|
57
56
|
|
58
|
-
# @return [Faraday]
|
59
|
-
def faraday_connection
|
60
|
-
::Faraday.new do |f|
|
61
|
-
f.request :gzip
|
62
|
-
f.response :follow_redirects if follow_redirect?
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
57
|
def content
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
raise ::Aranha::Parsers::SourceAddress::FetchContentError.
|
71
|
-
"Get #{url} returned #{req.status.to_i}", req
|
72
|
-
)
|
58
|
+
request = http_request
|
59
|
+
request.response.body_str
|
60
|
+
rescue ::EacEnvs::Http::Error => e
|
61
|
+
raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
|
73
62
|
end
|
74
63
|
|
75
64
|
def param(key, default_value)
|
@@ -82,11 +71,13 @@ module Aranha
|
|
82
71
|
|
83
72
|
private
|
84
73
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
74
|
+
# @return [EacEnvs::Http::Request]
|
75
|
+
def http_request
|
76
|
+
r = ::EacEnvs::Http::Request.new.verb(self.class.http_method).url(url)
|
77
|
+
r = headers.if_present(r) { |v| r.headers(v) }
|
78
|
+
r = body.if_present(r) { |v| r.body(v) }
|
79
|
+
r = r.follow_redirect(true) if follow_redirect?
|
80
|
+
r
|
90
81
|
end
|
91
82
|
end
|
92
83
|
end
|
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
require 'addressable'
|
4
4
|
require 'aranha/parsers/source_address/fetch_content_error'
|
5
|
-
require '
|
6
|
-
require '
|
5
|
+
require 'eac_envs/http/error'
|
6
|
+
require 'eac_envs/http/request'
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
module Parsers
|
@@ -39,16 +39,10 @@ module Aranha
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def content
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
c = conn.get(url)
|
47
|
-
return c.body if c.status == 200
|
48
|
-
|
49
|
-
raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(
|
50
|
-
"Get #{url} returned #{c.status.to_i}", c
|
51
|
-
)
|
42
|
+
request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
|
43
|
+
request.response.body_str
|
44
|
+
rescue ::EacEnvs::Http::Error => e
|
45
|
+
raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
|
52
46
|
end
|
53
47
|
|
54
48
|
def serialize
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.18.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -33,7 +33,7 @@ dependencies:
|
|
33
33
|
version: '2.8'
|
34
34
|
- - ">="
|
35
35
|
- !ruby/object:Gem::Version
|
36
|
-
version: 2.8.
|
36
|
+
version: 2.8.4
|
37
37
|
type: :runtime
|
38
38
|
prerelease: false
|
39
39
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -43,48 +43,17 @@ dependencies:
|
|
43
43
|
version: '2.8'
|
44
44
|
- - ">="
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: 2.8.
|
46
|
+
version: 2.8.4
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
49
|
-
requirement: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
51
|
-
- - "~>"
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0.112'
|
54
|
-
type: :runtime
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
requirements:
|
58
|
-
- - "~>"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '0.112'
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: faraday
|
48
|
+
name: eac_envs-http
|
63
49
|
requirement: !ruby/object:Gem::Requirement
|
64
50
|
requirements:
|
65
51
|
- - "~>"
|
66
52
|
- !ruby/object:Gem::Version
|
67
|
-
version: '
|
68
|
-
- - ">="
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
version: 2.7.4
|
71
|
-
type: :runtime
|
72
|
-
prerelease: false
|
73
|
-
version_requirements: !ruby/object:Gem::Requirement
|
74
|
-
requirements:
|
75
|
-
- - "~>"
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '2.7'
|
53
|
+
version: '0.3'
|
78
54
|
- - ">="
|
79
55
|
- !ruby/object:Gem::Version
|
80
|
-
version:
|
81
|
-
- !ruby/object:Gem::Dependency
|
82
|
-
name: faraday-follow_redirects
|
83
|
-
requirement: !ruby/object:Gem::Requirement
|
84
|
-
requirements:
|
85
|
-
- - "~>"
|
86
|
-
- !ruby/object:Gem::Version
|
87
|
-
version: '0.3'
|
56
|
+
version: 0.3.1
|
88
57
|
type: :runtime
|
89
58
|
prerelease: false
|
90
59
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -92,34 +61,23 @@ dependencies:
|
|
92
61
|
- - "~>"
|
93
62
|
- !ruby/object:Gem::Version
|
94
63
|
version: '0.3'
|
95
|
-
-
|
96
|
-
name: faraday-gzip
|
97
|
-
requirement: !ruby/object:Gem::Requirement
|
98
|
-
requirements:
|
99
|
-
- - "~>"
|
100
|
-
- !ruby/object:Gem::Version
|
101
|
-
version: '0.1'
|
102
|
-
type: :runtime
|
103
|
-
prerelease: false
|
104
|
-
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
requirements:
|
106
|
-
- - "~>"
|
64
|
+
- - ">="
|
107
65
|
- !ruby/object:Gem::Version
|
108
|
-
version:
|
66
|
+
version: 0.3.1
|
109
67
|
- !ruby/object:Gem::Dependency
|
110
|
-
name:
|
68
|
+
name: eac_ruby_utils
|
111
69
|
requirement: !ruby/object:Gem::Requirement
|
112
70
|
requirements:
|
113
71
|
- - "~>"
|
114
72
|
- !ruby/object:Gem::Version
|
115
|
-
version: '
|
73
|
+
version: '0.116'
|
116
74
|
type: :runtime
|
117
75
|
prerelease: false
|
118
76
|
version_requirements: !ruby/object:Gem::Requirement
|
119
77
|
requirements:
|
120
78
|
- - "~>"
|
121
79
|
- !ruby/object:Gem::Version
|
122
|
-
version: '
|
80
|
+
version: '0.116'
|
123
81
|
- !ruby/object:Gem::Dependency
|
124
82
|
name: nokogiri
|
125
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -129,7 +87,7 @@ dependencies:
|
|
129
87
|
version: '1.14'
|
130
88
|
- - ">="
|
131
89
|
- !ruby/object:Gem::Version
|
132
|
-
version: 1.14.
|
90
|
+
version: 1.14.4
|
133
91
|
type: :runtime
|
134
92
|
prerelease: false
|
135
93
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -139,7 +97,7 @@ dependencies:
|
|
139
97
|
version: '1.14'
|
140
98
|
- - ">="
|
141
99
|
- !ruby/object:Gem::Version
|
142
|
-
version: 1.14.
|
100
|
+
version: 1.14.4
|
143
101
|
- !ruby/object:Gem::Dependency
|
144
102
|
name: ofx-parser
|
145
103
|
requirement: !ruby/object:Gem::Requirement
|