aranha-parsers 0.8.3 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee60e5eb7ad164eaac294af5fe274611cc72adb14645c59f12db7fe483dd551f
4
- data.tar.gz: 280ba737d37a5cd22d741b92961d3ee15c72b15a43f76604f4c9f2bc92e4731e
3
+ metadata.gz: f6ec27959167dfbae56e1fb00bfdbedad5bfe022a0708e493c623ca259df5c5d
4
+ data.tar.gz: c87db56022bfdf05f3f5cc99f58fbba4f0e73a5e8ddb7683d2569aabbf7bf28a
5
5
  SHA512:
6
- metadata.gz: 743b98993e76af5d18f0c58b4a197b0bf62a18771be883a6b29ce8551c3d236ebaa81acda3bd4200d4c3e8f6ab9f9db434d047bc80ae2ecd040fbf7cf706826f
7
- data.tar.gz: 1086dbe158f755c8716dbee97058c5cd6a72673b9769f92e9c11206bc072637131a0d698907ce292ac6fbe864e33d5702abc0aceb044d16b8f034d648199570e
6
+ metadata.gz: 9116a522005c0a01b650bc889ce1a4c70f7096b478febc2e1b24f6596a4d94c77e750d81e616bd70b6ca557a1c61e56d55fcfe2066ff4f62aea3a9a14237ed01
7
+ data.tar.gz: 5080f643d1da251f05afb094a1c7ac9c5cbbe140a15f0d05b77ec87591e700b148ba4766615ee1853284d24674e2720149f25e6858b485e0123ebd3add74c4db
@@ -11,7 +11,9 @@ module Aranha
11
11
  class << self
12
12
  def from_content(content)
13
13
  ::EacRubyUtils::Fs::Temp.on_file do |path|
14
- path.write(content)
14
+ ::File.open(path.to_s, 'w:UTF-8') do |f|
15
+ f.write content.force_encoding('UTF-8')
16
+ end
15
17
  r = new(path.to_path)
16
18
  r.content
17
19
  r
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module NumericSupport
12
+ def integer_value(node, xpath)
13
+ r = string_value(node, xpath)
14
+ return nil if r.blank?
15
+
16
+ m = /\d+/.match(r)
17
+ raise "Integer not found in \"#{r}\"" unless m
18
+
19
+ m[0].to_i
20
+ end
21
+
22
+ def integer_optional_value(node, xpath)
23
+ r = string_value(node, xpath)
24
+ m = /\d+/.match(r)
25
+ m ? m[0].to_i : nil
26
+ end
27
+
28
+ def float_value(node, xpath)
29
+ parse_float(node, xpath, true)
30
+ end
31
+
32
+ def float_optional_value(node, xpath)
33
+ parse_float(node, xpath, false)
34
+ end
35
+
36
+ def us_decimal_value(node, xpath)
37
+ parse_us_decimal(node, xpath, true)
38
+ end
39
+
40
+ def us_decimal_optional_value(node, xpath)
41
+ parse_us_decimal(node, xpath, false)
42
+ end
43
+
44
+ private
45
+
46
+ def parse_float(node, xpath, required)
47
+ s = string_value(node, xpath)
48
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
49
+ if m
50
+ m[0].delete('.').tr(',', '.').to_f
51
+ elsif required
52
+ raise "Float value not found in \"#{s}\""
53
+ end
54
+ end
55
+
56
+ def parse_us_decimal(node, xpath, required)
57
+ s = string_value(node, xpath)
58
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
59
+ if m
60
+ m[0].delete(',').to_f
61
+ elsif required
62
+ raise "US decimal value not found in \"#{s}\""
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module StringSupport
12
+ def quoted_value(node, xpath)
13
+ s = string_value(node, xpath)
14
+ return '' unless s
15
+
16
+ m = /\"([^\"]+)\"/.match(s)
17
+ return m[1] if m
18
+
19
+ ''
20
+ end
21
+
22
+ def regxep(node, xpath, pattern)
23
+ s = string_value(node, xpath)
24
+ m = pattern.match(s)
25
+ return m if m
26
+
27
+ raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
28
+ end
29
+
30
+ def string_value(node, xpath)
31
+ if node.at_xpath(xpath)
32
+ sanitize_string(node.at_xpath(xpath).text)
33
+ else
34
+ ''
35
+ end
36
+ end
37
+
38
+ def string_recursive_value(node, xpath, required = true)
39
+ root = node.at_xpath(xpath)
40
+ if root.blank?
41
+ return nil unless required
42
+
43
+ raise "No node found (Xpath: #{xpath})"
44
+ end
45
+ result = string_recursive(root)
46
+ return result if result.present?
47
+ return nil unless required
48
+
49
+ raise "String blank (Xpath: #{xpath})"
50
+ end
51
+
52
+ def string_recursive_optional_value(node, xpath)
53
+ string_recursive_value(node, xpath, false)
54
+ end
55
+
56
+ private
57
+
58
+ def sanitize_string(obj)
59
+ obj.to_s.tr("\u00A0", ' ').strip
60
+ end
61
+
62
+ def string_recursive(node)
63
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
64
+
65
+ s = ''
66
+ node.children.each do |child|
67
+ child_s = string_recursive(child)
68
+ s += ' ' + child_s if child_s.present?
69
+ end
70
+ sanitize_string(s)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/base'
4
+ require 'json'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Json
9
+ class Base < ::Aranha::Parsers::Base
10
+ def data
11
+ default_data
12
+ end
13
+
14
+ def default_data
15
+ ::JSON.parse(content)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Json
8
+ require_sub __FILE__
9
+ end
10
+ end
11
+ end
@@ -7,7 +7,7 @@ module Aranha
7
7
  module Rspec
8
8
  module Setup
9
9
  def self.extended(_setup_obj)
10
- require 'aranha/parsers/rspec/source_target_fixtures_example'
10
+ require 'aranha/parsers/rspec/shared_examples/source_target_fixtures'
11
11
  end
12
12
  end
13
13
  end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Aranha
4
+ module Parsers
5
+ class SourceAddress
6
+ class FetchContentError < ::RuntimeError
7
+ end
8
+ end
9
+ end
10
+ end
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'addressable'
4
4
  require 'curb'
5
+ require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
5
7
 
6
8
  module Aranha
7
9
  module Parsers
@@ -31,13 +33,21 @@ module Aranha
31
33
  source
32
34
  end
33
35
 
36
+ def final_url
37
+ content unless @final_url
38
+ @final_url
39
+ end
40
+
34
41
  def content
35
- c = ::Curl::Easy.new(url)
36
- c.follow_location = true
37
- raise "Curl perform failed (URL: #{url})" unless c.perform
38
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
39
48
 
40
- raise "Get #{url} returned #{c.status.to_i}"
49
+ raise ::Aranha::Parsers::SourceAddress::FetchContentError,
50
+ "Get #{url} returned #{c.status.to_i}"
41
51
  end
42
52
 
43
53
  def serialize
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.8.3'
5
+ VERSION = '0.10.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.3
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.74'
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday_middleware
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: httpclient
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -86,6 +100,26 @@ dependencies:
86
100
  - - ">="
87
101
  - !ruby/object:Gem::Version
88
102
  version: 2.8.3
103
+ - !ruby/object:Gem::Dependency
104
+ name: nokogiri
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.12'
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 1.12.4
113
+ type: :runtime
114
+ prerelease: false
115
+ version_requirements: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '1.12'
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 1.12.4
89
123
  - !ruby/object:Gem::Dependency
90
124
  name: ofx-parser
91
125
  requirement: !ruby/object:Gem::Requirement
@@ -136,13 +170,18 @@ files:
136
170
  - lib/aranha/parsers/html/node.rb
137
171
  - lib/aranha/parsers/html/node/base.rb
138
172
  - lib/aranha/parsers/html/node/default.rb
173
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
174
+ - lib/aranha/parsers/html/node/default/string_support.rb
139
175
  - lib/aranha/parsers/invalid_state_exception.rb
176
+ - lib/aranha/parsers/json.rb
177
+ - lib/aranha/parsers/json/base.rb
140
178
  - lib/aranha/parsers/patches.rb
141
179
  - lib/aranha/parsers/patches/ofx_parser.rb
142
180
  - lib/aranha/parsers/rspec.rb
143
181
  - lib/aranha/parsers/rspec/setup.rb
144
- - lib/aranha/parsers/rspec/source_target_fixtures_example.rb
182
+ - lib/aranha/parsers/rspec/shared_examples/source_target_fixtures.rb
145
183
  - lib/aranha/parsers/source_address.rb
184
+ - lib/aranha/parsers/source_address/fetch_content_error.rb
146
185
  - lib/aranha/parsers/source_address/file.rb
147
186
  - lib/aranha/parsers/source_address/hash_http_get.rb
148
187
  - lib/aranha/parsers/source_address/hash_http_post.rb