aranha-parsers 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cb3631929ccd8b029ab5b097554f96f5eb0cc27dcf8767d3a8a97f050ee60b80
4
- data.tar.gz: f30f306452ac72997887550051954d290f7e6c321b694236fd9f49bc6fff5d4a
3
+ metadata.gz: f6ec27959167dfbae56e1fb00bfdbedad5bfe022a0708e493c623ca259df5c5d
4
+ data.tar.gz: c87db56022bfdf05f3f5cc99f58fbba4f0e73a5e8ddb7683d2569aabbf7bf28a
5
5
  SHA512:
6
- metadata.gz: 35ebd10ceebadd29ade03ef0c5a75ed024e8d4377b19295861d8ef4c4e3bb4e091df5ff9f74398f523c98e0e3ee65eac1352bedc39eae033000cdc8561c5b749
7
- data.tar.gz: 68a7664bef2ffd163cf110f14779feb92305b90cfb6b23e8d254c4cf2ca6871233d64032704790dd30397383cc67fd5a5b5dc9e67a1d866f1365524707b5ebb5
6
+ metadata.gz: 9116a522005c0a01b650bc889ce1a4c70f7096b478febc2e1b24f6596a4d94c77e750d81e616bd70b6ca557a1c61e56d55fcfe2066ff4f62aea3a9a14237ed01
7
+ data.tar.gz: 5080f643d1da251f05afb094a1c7ac9c5cbbe140a15f0d05b77ec87591e700b148ba4766615ee1853284d24674e2720149f25e6858b485e0123ebd3add74c4db
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module NumericSupport
12
+ def integer_value(node, xpath)
13
+ r = string_value(node, xpath)
14
+ return nil if r.blank?
15
+
16
+ m = /\d+/.match(r)
17
+ raise "Integer not found in \"#{r}\"" unless m
18
+
19
+ m[0].to_i
20
+ end
21
+
22
+ def integer_optional_value(node, xpath)
23
+ r = string_value(node, xpath)
24
+ m = /\d+/.match(r)
25
+ m ? m[0].to_i : nil
26
+ end
27
+
28
+ def float_value(node, xpath)
29
+ parse_float(node, xpath, true)
30
+ end
31
+
32
+ def float_optional_value(node, xpath)
33
+ parse_float(node, xpath, false)
34
+ end
35
+
36
+ def us_decimal_value(node, xpath)
37
+ parse_us_decimal(node, xpath, true)
38
+ end
39
+
40
+ def us_decimal_optional_value(node, xpath)
41
+ parse_us_decimal(node, xpath, false)
42
+ end
43
+
44
+ private
45
+
46
+ def parse_float(node, xpath, required)
47
+ s = string_value(node, xpath)
48
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
49
+ if m
50
+ m[0].delete('.').tr(',', '.').to_f
51
+ elsif required
52
+ raise "Float value not found in \"#{s}\""
53
+ end
54
+ end
55
+
56
+ def parse_us_decimal(node, xpath, required)
57
+ s = string_value(node, xpath)
58
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
59
+ if m
60
+ m[0].delete(',').to_f
61
+ elsif required
62
+ raise "US decimal value not found in \"#{s}\""
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module StringSupport
12
+ def quoted_value(node, xpath)
13
+ s = string_value(node, xpath)
14
+ return '' unless s
15
+
16
+ m = /\"([^\"]+)\"/.match(s)
17
+ return m[1] if m
18
+
19
+ ''
20
+ end
21
+
22
+ def regxep(node, xpath, pattern)
23
+ s = string_value(node, xpath)
24
+ m = pattern.match(s)
25
+ return m if m
26
+
27
+ raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
28
+ end
29
+
30
+ def string_value(node, xpath)
31
+ if node.at_xpath(xpath)
32
+ sanitize_string(node.at_xpath(xpath).text)
33
+ else
34
+ ''
35
+ end
36
+ end
37
+
38
+ def string_recursive_value(node, xpath, required = true)
39
+ root = node.at_xpath(xpath)
40
+ if root.blank?
41
+ return nil unless required
42
+
43
+ raise "No node found (Xpath: #{xpath})"
44
+ end
45
+ result = string_recursive(root)
46
+ return result if result.present?
47
+ return nil unless required
48
+
49
+ raise "String blank (Xpath: #{xpath})"
50
+ end
51
+
52
+ def string_recursive_optional_value(node, xpath)
53
+ string_recursive_value(node, xpath, false)
54
+ end
55
+
56
+ private
57
+
58
+ def sanitize_string(obj)
59
+ obj.to_s.tr("\u00A0", ' ').strip
60
+ end
61
+
62
+ def string_recursive(node)
63
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
64
+
65
+ s = ''
66
+ node.children.each do |child|
67
+ child_s = string_recursive(child)
68
+ s += ' ' + child_s if child_s.present?
69
+ end
70
+ sanitize_string(s)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -3,6 +3,7 @@
3
3
  require 'addressable'
4
4
  require 'curb'
5
5
  require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
6
7
 
7
8
  module Aranha
8
9
  module Parsers
@@ -38,10 +39,12 @@ module Aranha
38
39
  end
39
40
 
40
41
  def content
41
- c = ::Curl::Easy.new(url)
42
- c.follow_location = true
43
- curl_perform(c)
44
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
45
48
 
46
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError,
47
50
  "Get #{url} returned #{c.status.to_i}"
@@ -50,18 +53,6 @@ module Aranha
50
53
  def serialize
51
54
  url
52
55
  end
53
-
54
- private
55
-
56
- def curl_perform(curl)
57
- unless curl.perform
58
- raise(::Aranha::Parsers::SourceAddress::FetchContentError,
59
- "Curl perform failed (URL: #{url})")
60
- end
61
- @final_url = curl.url
62
- rescue Curl::Err::CurlError => e
63
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
64
- end
65
56
  end
66
57
  end
67
58
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.9.0'
5
+ VERSION = '0.10.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-19 00:00:00.000000000 Z
11
+ date: 2021-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.74'
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday_middleware
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: httpclient
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -156,6 +170,8 @@ files:
156
170
  - lib/aranha/parsers/html/node.rb
157
171
  - lib/aranha/parsers/html/node/base.rb
158
172
  - lib/aranha/parsers/html/node/default.rb
173
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
174
+ - lib/aranha/parsers/html/node/default/string_support.rb
159
175
  - lib/aranha/parsers/invalid_state_exception.rb
160
176
  - lib/aranha/parsers/json.rb
161
177
  - lib/aranha/parsers/json/base.rb