aranha-parsers 0.8.5 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fd66650df384bfd84cc234f0ed901e426f0000a7f11bd8cdcec959d5690efdb
4
- data.tar.gz: a0b80baee8f763633753c9fefacd5a0ab528a8047a8f14a876a08f0cd443c6a1
3
+ metadata.gz: ea4e3ac094b66f5e1b02e6af4f102c752a869e0a234354b0badb3b2d666368c7
4
+ data.tar.gz: 95de53aed0f9e7157894515f3b3c881c740c6a9d4e2e787d4c5bbb2bb4432294
5
5
  SHA512:
6
- metadata.gz: f9bd1e8d032eb4f782526e45b549aecdf6951a9c28ba55296293128d4a655a14f8cafda545560d794c5f39a4798c65b96c39e2e5dcea324057a85c5dec492d01
7
- data.tar.gz: 577bd16ed893fa9df121de6be1844aefc1f178790871aa89caa9073633e83682059f326de48d8561a7928857d26c08f57bbc655d0205b2a7ef7126dd922e2398
6
+ metadata.gz: 176470a9a8163f44654485f96285c254ecf8d6661f3b435f7af0bc6ccf076c8081cb0554f7b31d77a751f2ce9caa4592454d7b3b5de4cd522a008c2798ecdc7d
7
+ data.tar.gz: ebc4b32f65dc83d6ba681a0af28babcd8375f4b4c9585985080fc5bcdb5f7bae653979bf21377f10075862c1ab7fa589cca4c1ce0b254b134f1d8d9e7c1f9d3e
@@ -11,7 +11,9 @@ module Aranha
11
11
  class << self
12
12
  def from_content(content)
13
13
  ::EacRubyUtils::Fs::Temp.on_file do |path|
14
- path.write(content)
14
+ ::File.open(path.to_s, 'w:UTF-8') do |f|
15
+ f.write content.force_encoding('UTF-8')
16
+ end
15
17
  r = new(path.to_path)
16
18
  r.content
17
19
  r
@@ -35,21 +37,21 @@ module Aranha
35
37
  delegate :url, to: :source_address
36
38
 
37
39
  def content
38
- @content ||= begin
39
- s = source_address.content
40
- log_content(s)
41
- s
42
- end
40
+ @content ||= log_content(source_address_content)
43
41
  end
44
42
 
43
+ # @return [String]
44
+ delegate :content, to: :source_address, prefix: true
45
+
45
46
  private
46
47
 
48
+ # @return [String]
47
49
  def log_content(content, suffix = '')
48
50
  path = log_file(suffix)
49
51
 
50
- return unless path
52
+ File.open(path, 'wb') { |file| file.write(content) } if path
51
53
 
52
- File.open(path, 'wb') { |file| file.write(content) }
54
+ content
53
55
  end
54
56
 
55
57
  def log_file(suffix)
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module NumericSupport
12
+ def integer_value(node, xpath)
13
+ r = string_value(node, xpath)
14
+ return nil if r.blank?
15
+
16
+ m = /\d+/.match(r)
17
+ raise "Integer not found in \"#{r}\"" unless m
18
+
19
+ m[0].to_i
20
+ end
21
+
22
+ def integer_optional_value(node, xpath)
23
+ r = string_value(node, xpath)
24
+ m = /\d+/.match(r)
25
+ m ? m[0].to_i : nil
26
+ end
27
+
28
+ def float_value(node, xpath)
29
+ parse_float(node, xpath, true)
30
+ end
31
+
32
+ def float_optional_value(node, xpath)
33
+ parse_float(node, xpath, false)
34
+ end
35
+
36
+ def us_decimal_value(node, xpath)
37
+ parse_us_decimal(node, xpath, true)
38
+ end
39
+
40
+ def us_decimal_optional_value(node, xpath)
41
+ parse_us_decimal(node, xpath, false)
42
+ end
43
+
44
+ private
45
+
46
+ def parse_float(node, xpath, required)
47
+ s = string_value(node, xpath)
48
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
49
+ if m
50
+ m[0].delete('.').tr(',', '.').to_f
51
+ elsif required
52
+ raise "Float value not found in \"#{s}\""
53
+ end
54
+ end
55
+
56
+ def parse_us_decimal(node, xpath, required)
57
+ s = string_value(node, xpath)
58
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
59
+ if m
60
+ m[0].delete(',').to_f
61
+ elsif required
62
+ raise "US decimal value not found in \"#{s}\""
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module StringSupport
12
+ def quoted_value(node, xpath)
13
+ s = string_value(node, xpath)
14
+ return '' unless s
15
+
16
+ m = /\"([^\"]+)\"/.match(s)
17
+ return m[1] if m
18
+
19
+ ''
20
+ end
21
+
22
+ def regxep(node, xpath, pattern)
23
+ s = string_value(node, xpath)
24
+ m = pattern.match(s)
25
+ return m if m
26
+
27
+ raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
28
+ end
29
+
30
+ def string_value(node, xpath)
31
+ if node.at_xpath(xpath)
32
+ sanitize_string(node.at_xpath(xpath).text)
33
+ else
34
+ ''
35
+ end
36
+ end
37
+
38
+ def string_recursive_value(node, xpath, required = true)
39
+ root = node.at_xpath(xpath)
40
+ if root.blank?
41
+ return nil unless required
42
+
43
+ raise "No node found (Xpath: #{xpath})"
44
+ end
45
+ result = string_recursive(root)
46
+ return result if result.present?
47
+ return nil unless required
48
+
49
+ raise "String blank (Xpath: #{xpath})"
50
+ end
51
+
52
+ def string_recursive_optional_value(node, xpath)
53
+ string_recursive_value(node, xpath, false)
54
+ end
55
+
56
+ private
57
+
58
+ def sanitize_string(obj)
59
+ obj.to_s.tr("\u00A0", ' ').strip
60
+ end
61
+
62
+ def string_recursive(node)
63
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
64
+
65
+ s = ''
66
+ node.children.each do |child|
67
+ child_s = string_recursive(child)
68
+ s += ' ' + child_s if child_s.present?
69
+ end
70
+ sanitize_string(s)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/base'
4
+ require 'json'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Json
9
+ class Base < ::Aranha::Parsers::Base
10
+ def data
11
+ default_data
12
+ end
13
+
14
+ def default_data
15
+ ::JSON.parse(content)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Json
8
+ require_sub __FILE__
9
+ end
10
+ end
11
+ end
@@ -3,6 +3,7 @@
3
3
  require 'addressable'
4
4
  require 'curb'
5
5
  require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
6
7
 
7
8
  module Aranha
8
9
  module Parsers
@@ -32,11 +33,18 @@ module Aranha
32
33
  source
33
34
  end
34
35
 
36
+ def final_url
37
+ content unless @final_url
38
+ @final_url
39
+ end
40
+
35
41
  def content
36
- c = ::Curl::Easy.new(url)
37
- c.follow_location = true
38
- curl_perform(c)
39
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
40
48
 
41
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError,
42
50
  "Get #{url} returned #{c.status.to_i}"
@@ -45,17 +53,6 @@ module Aranha
45
53
  def serialize
46
54
  url
47
55
  end
48
-
49
- private
50
-
51
- def curl_perform(curl)
52
- unless curl.perform
53
- raise(::Aranha::Parsers::SourceAddress::FetchContentError,
54
- "Curl perform failed (URL: #{url})")
55
- end
56
- rescue Curl::Err::CurlError => e
57
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
58
- end
59
56
  end
60
57
  end
61
58
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.8.5'
5
+ VERSION = '0.11.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-21 00:00:00.000000000 Z
11
+ date: 2022-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.74'
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday_middleware
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: httpclient
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -156,7 +170,11 @@ files:
156
170
  - lib/aranha/parsers/html/node.rb
157
171
  - lib/aranha/parsers/html/node/base.rb
158
172
  - lib/aranha/parsers/html/node/default.rb
173
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
174
+ - lib/aranha/parsers/html/node/default/string_support.rb
159
175
  - lib/aranha/parsers/invalid_state_exception.rb
176
+ - lib/aranha/parsers/json.rb
177
+ - lib/aranha/parsers/json/base.rb
160
178
  - lib/aranha/parsers/patches.rb
161
179
  - lib/aranha/parsers/patches/ofx_parser.rb
162
180
  - lib/aranha/parsers/rspec.rb