aranha-parsers 0.8.5 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fd66650df384bfd84cc234f0ed901e426f0000a7f11bd8cdcec959d5690efdb
4
- data.tar.gz: a0b80baee8f763633753c9fefacd5a0ab528a8047a8f14a876a08f0cd443c6a1
3
+ metadata.gz: ea4e3ac094b66f5e1b02e6af4f102c752a869e0a234354b0badb3b2d666368c7
4
+ data.tar.gz: 95de53aed0f9e7157894515f3b3c881c740c6a9d4e2e787d4c5bbb2bb4432294
5
5
  SHA512:
6
- metadata.gz: f9bd1e8d032eb4f782526e45b549aecdf6951a9c28ba55296293128d4a655a14f8cafda545560d794c5f39a4798c65b96c39e2e5dcea324057a85c5dec492d01
7
- data.tar.gz: 577bd16ed893fa9df121de6be1844aefc1f178790871aa89caa9073633e83682059f326de48d8561a7928857d26c08f57bbc655d0205b2a7ef7126dd922e2398
6
+ metadata.gz: 176470a9a8163f44654485f96285c254ecf8d6661f3b435f7af0bc6ccf076c8081cb0554f7b31d77a751f2ce9caa4592454d7b3b5de4cd522a008c2798ecdc7d
7
+ data.tar.gz: ebc4b32f65dc83d6ba681a0af28babcd8375f4b4c9585985080fc5bcdb5f7bae653979bf21377f10075862c1ab7fa589cca4c1ce0b254b134f1d8d9e7c1f9d3e
@@ -11,7 +11,9 @@ module Aranha
11
11
  class << self
12
12
  def from_content(content)
13
13
  ::EacRubyUtils::Fs::Temp.on_file do |path|
14
- path.write(content)
14
+ ::File.open(path.to_s, 'w:UTF-8') do |f|
15
+ f.write content.force_encoding('UTF-8')
16
+ end
15
17
  r = new(path.to_path)
16
18
  r.content
17
19
  r
@@ -35,21 +37,21 @@ module Aranha
35
37
  delegate :url, to: :source_address
36
38
 
37
39
  def content
38
- @content ||= begin
39
- s = source_address.content
40
- log_content(s)
41
- s
42
- end
40
+ @content ||= log_content(source_address_content)
43
41
  end
44
42
 
43
+ # @return [String]
44
+ delegate :content, to: :source_address, prefix: true
45
+
45
46
  private
46
47
 
48
+ # @return [String]
47
49
  def log_content(content, suffix = '')
48
50
  path = log_file(suffix)
49
51
 
50
- return unless path
52
+ File.open(path, 'wb') { |file| file.write(content) } if path
51
53
 
52
- File.open(path, 'wb') { |file| file.write(content) }
54
+ content
53
55
  end
54
56
 
55
57
  def log_file(suffix)
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module NumericSupport
12
# Reads the text selected by +xpath+ (via +string_value+) and converts the
# first run of decimal digits found in it to an Integer.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @return [Integer, nil] the parsed integer, or nil when the text is blank
# @raise [RuntimeError] when the text is not blank but contains no digit
def integer_value(node, xpath)
  text = string_value(node, xpath)
  return nil if text.blank?

  found = /\d+/.match(text)
  return found[0].to_i if found

  raise "Integer not found in \"#{text}\""
end
21
+
22
# Lenient counterpart of +integer_value+: never raises when no digit is found.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @return [Integer, nil] first run of digits as an Integer, or nil if none
def integer_optional_value(node, xpath)
  found = /\d+/.match(string_value(node, xpath))
  found && found[0].to_i
end
27
+
28
# Required variant of the float reader: delegates to +parse_float+ with the
# +required+ flag set.
#
# @param node [Object] node forwarded to +parse_float+
# @param xpath [String] XPath expression forwarded to +parse_float+
# @return [Object] whatever +parse_float+ returns
def float_value(node, xpath)
  required = true
  parse_float(node, xpath, required)
end
31
+
32
# Optional variant of the float reader: delegates to +parse_float+ with the
# +required+ flag cleared.
#
# @param node [Object] node forwarded to +parse_float+
# @param xpath [String] XPath expression forwarded to +parse_float+
# @return [Object] whatever +parse_float+ returns
def float_optional_value(node, xpath)
  required = false
  parse_float(node, xpath, required)
end
35
+
36
# Required variant of the US-decimal reader: delegates to +parse_us_decimal+
# with the +required+ flag set.
#
# @param node [Object] node forwarded to +parse_us_decimal+
# @param xpath [String] XPath expression forwarded to +parse_us_decimal+
# @return [Object] whatever +parse_us_decimal+ returns
def us_decimal_value(node, xpath)
  required = true
  parse_us_decimal(node, xpath, required)
end
39
+
40
# Optional variant of the US-decimal reader: delegates to +parse_us_decimal+
# with the +required+ flag cleared.
#
# @param node [Object] node forwarded to +parse_us_decimal+
# @param xpath [String] XPath expression forwarded to +parse_us_decimal+
# @return [Object] whatever +parse_us_decimal+ returns
def us_decimal_optional_value(node, xpath)
  required = false
  parse_us_decimal(node, xpath, required)
end
43
+
44
+ private
45
+
46
# Extracts the first number from the text selected by +xpath+, reading it in
# decimal-comma notation: dots are grouping separators and are dropped, the
# comma is the decimal mark (e.g. "1.234,56" => 1234.56).
#
# The pattern accepts any number of dot-separated groups followed by an
# optional comma fraction, so a number carrying both a grouping dot and a
# decimal comma keeps its fractional part.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @param required [Boolean] raise instead of returning nil when no number is found
# @return [Float, nil] parsed value, or nil when nothing matched and not required
# @raise [RuntimeError] when nothing matched and +required+ is truthy
def parse_float(node, xpath, required)
  s = string_value(node, xpath)
  m = /\d+(?:\.\d+)*(?:,\d+)?/.match(s)
  return m[0].delete('.').tr(',', '.').to_f if m

  raise "Float value not found in \"#{s}\"" if required
end
55
+
56
# Extracts the first number from the text selected by +xpath+, reading it in
# US notation: commas are grouping separators and are dropped, the dot is the
# decimal mark (e.g. "1,234.56" => 1234.56).
#
# The pattern accepts any number of comma-separated groups followed by an
# optional dot fraction, so a number carrying both a grouping comma and a
# decimal dot keeps its fractional part.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @param required [Boolean] raise instead of returning nil when no number is found
# @return [Float, nil] parsed value, or nil when nothing matched and not required
# @raise [RuntimeError] when nothing matched and +required+ is truthy
def parse_us_decimal(node, xpath, required)
  s = string_value(node, xpath)
  m = /\d+(?:,\d+)*(?:\.\d+)?/.match(s)
  return m[0].delete(',').to_f if m

  raise "US decimal value not found in \"#{s}\"" if required
end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module StringSupport
12
# Returns the contents of the first double-quoted, non-empty segment in the
# text selected by +xpath+.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @return [String] the text between the quotes, or '' when there is no text
#   or no quoted segment
def quoted_value(node, xpath)
  text = string_value(node, xpath)
  found = text ? /\"([^\"]+)\"/.match(text) : nil
  found ? found[1] : ''
end
21
+
22
# Matches +pattern+ against the text selected by +xpath+ and insists on a hit.
#
# @param node [Object] node forwarded to +string_value+
# @param xpath [String] XPath expression forwarded to +string_value+
# @param pattern [#match] pattern whose +match+ is called with the text
# @return [Object] the truthy result of +pattern.match+
# @raise [RuntimeError] when the pattern does not match
def regxep(node, xpath, pattern)
  text = string_value(node, xpath)
  pattern.match(text) || raise("Pattern \"#{pattern}\" not found in string \"#{text}\"")
end
29
+
30
# Returns the sanitized text of the first node matching +xpath+.
#
# The XPath is evaluated once and the result reused, instead of running the
# same query a second time to read the text.
#
# @param node [#at_xpath] node to search from
# @param xpath [String] XPath expression
# @return [String] result of +sanitize_string+ on the match's text, or ''
#   when nothing matches
def string_value(node, xpath)
  found = node.at_xpath(xpath)
  found ? sanitize_string(found.text) : ''
end
37
+
38
# Collects the text of the node matching +xpath+ through +string_recursive+.
#
# @param node [#at_xpath] node to search from
# @param xpath [String] XPath expression
# @param required [Boolean] when truthy, a missing node or blank text raises;
#   otherwise nil is returned in those cases
# @return [String, nil] the collected text when present
# @raise [RuntimeError] "No node found" / "String blank" when +required+
def string_recursive_value(node, xpath, required = true)
  root = node.at_xpath(xpath)
  failure =
    if root.blank?
      "No node found (Xpath: #{xpath})"
    else
      text = string_recursive(root)
      return text if text.present?

      "String blank (Xpath: #{xpath})"
    end
  raise failure if required

  nil
end
51
+
52
# Non-raising variant: calls +string_recursive_value+ with +required+ cleared.
#
# @param node [Object] node forwarded to +string_recursive_value+
# @param xpath [String] XPath expression forwarded to +string_recursive_value+
# @return [Object] whatever +string_recursive_value+ returns
def string_recursive_optional_value(node, xpath)
  required = false
  string_recursive_value(node, xpath, required)
end
55
+
56
+ private
57
+
58
# Normalizes a value to trimmed text: converts it with +to_s+, replaces
# non-breaking spaces (U+00A0) with regular spaces, then strips both ends.
#
# @param obj [#to_s] value to clean
# @return [String]
def sanitize_string(obj)
  text = obj.to_s
  without_nbsp = text.tr("\u00A0", ' ')
  without_nbsp.strip
end
61
+
62
# Gathers the text below +node+ depth-first. A Nokogiri text node yields its
# own sanitized text; any other node yields the non-blank results of its
# children joined by single spaces.
#
# @param node [Object] text node, or a node responding to +children+
# @return [String] sanitized text
def string_recursive(node)
  return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)

  pieces = node.children.map { |child| string_recursive(child) }.select(&:present?)
  sanitize_string(pieces.join(' '))
end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/base'
4
+ require 'json'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Json
9
+ class Base < ::Aranha::Parsers::Base
10
# Parsed payload of this parser. Returns +default_data+ unchanged.
#
# @return [Object] the value produced by +default_data+
def data
  default_data
end
13
+
14
# Decodes the parser's +content+ as JSON.
#
# @return [Object] the structure returned by +JSON.parse+
# @raise [JSON::ParserError] when +content+ is not valid JSON
def default_data
  raw_json = content
  ::JSON.parse(raw_json)
end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Json
8
+ require_sub __FILE__
9
+ end
10
+ end
11
+ end
@@ -3,6 +3,7 @@
3
3
  require 'addressable'
4
4
  require 'curb'
5
5
  require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
6
7
 
7
8
  module Aranha
8
9
  module Parsers
@@ -32,11 +33,18 @@ module Aranha
32
33
  source
33
34
  end
34
35
 
36
# Returns the memoized +@final_url+, calling +content+ first when it is not
# set yet.
#
# NOTE(review): this relies on +content+ assigning +@final_url+; the +content+
# implementation visible alongside this method does not appear to do so, in
# which case this returns nil and re-fetches on every call — confirm.
#
# @return [Object, nil] the value of +@final_url+ after +content+ has run
def final_url
  return @final_url if @final_url

  content
  @final_url
end
40
+
35
41
  def content
36
- c = ::Curl::Easy.new(url)
37
- c.follow_location = true
38
- curl_perform(c)
39
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
40
48
 
41
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError,
42
50
  "Get #{url} returned #{c.status.to_i}"
@@ -45,17 +53,6 @@ module Aranha
45
53
  def serialize
46
54
  url
47
55
  end
48
-
49
- private
50
-
51
- def curl_perform(curl)
52
- unless curl.perform
53
- raise(::Aranha::Parsers::SourceAddress::FetchContentError,
54
- "Curl perform failed (URL: #{url})")
55
- end
56
- rescue Curl::Err::CurlError => e
57
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
58
- end
59
56
  end
60
57
  end
61
58
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.8.5'
5
+ VERSION = '0.11.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-21 00:00:00.000000000 Z
11
+ date: 2022-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.74'
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday_middleware
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: httpclient
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -156,7 +170,11 @@ files:
156
170
  - lib/aranha/parsers/html/node.rb
157
171
  - lib/aranha/parsers/html/node/base.rb
158
172
  - lib/aranha/parsers/html/node/default.rb
173
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
174
+ - lib/aranha/parsers/html/node/default/string_support.rb
159
175
  - lib/aranha/parsers/invalid_state_exception.rb
176
+ - lib/aranha/parsers/json.rb
177
+ - lib/aranha/parsers/json/base.rb
160
178
  - lib/aranha/parsers/patches.rb
161
179
  - lib/aranha/parsers/patches/ofx_parser.rb
162
180
  - lib/aranha/parsers/rspec.rb