aranha-parsers 0.9.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cb3631929ccd8b029ab5b097554f96f5eb0cc27dcf8767d3a8a97f050ee60b80
4
- data.tar.gz: f30f306452ac72997887550051954d290f7e6c321b694236fd9f49bc6fff5d4a
3
+ metadata.gz: 955f0df6b2a59762cbdd597dd17ae6608a880ee74f4547449ad06bbef4cf9f09
4
+ data.tar.gz: 95e891f8db0aa252f3e385956e783c48e08e42e68b3e6666491356eb2e60618a
5
5
  SHA512:
6
- metadata.gz: 35ebd10ceebadd29ade03ef0c5a75ed024e8d4377b19295861d8ef4c4e3bb4e091df5ff9f74398f523c98e0e3ee65eac1352bedc39eae033000cdc8561c5b749
7
- data.tar.gz: 68a7664bef2ffd163cf110f14779feb92305b90cfb6b23e8d254c4cf2ca6871233d64032704790dd30397383cc67fd5a5b5dc9e67a1d866f1365524707b5ebb5
6
+ metadata.gz: e5e7266770218b0698edb3111db90d797a24bb3e0f5ee54d5728a5810a87559a22a3a9f3d368bb28fc83fda8c6cbc04c99cab44c50ee25ea76e7550ea4753249
7
+ data.tar.gz: 66873b24e21f3c3e43a4ae49b82997c6e401a4e500274e2fc76e4ba91b013eae1a4f6b8641880f94896f1dab40bf23c86b24e226c14c2b92afe024d2d18aa230
@@ -37,21 +37,21 @@ module Aranha
37
37
  delegate :url, to: :source_address
38
38
 
39
39
  def content
40
- @content ||= begin
41
- s = source_address.content
42
- log_content(s)
43
- s
44
- end
40
+ @content ||= log_content(source_address_content)
45
41
  end
46
42
 
43
+ # @return [String]
44
+ delegate :content, to: :source_address, prefix: true
45
+
47
46
  private
48
47
 
48
+ # @return [String]
49
49
  def log_content(content, suffix = '')
50
50
  path = log_file(suffix)
51
51
 
52
- return unless path
52
+ File.open(path, 'wb') { |file| file.write(content) } if path
53
53
 
54
- File.open(path, 'wb') { |file| file.write(content) }
54
+ content
55
55
  end
56
56
 
57
57
  def log_file(suffix)
@@ -10,11 +10,15 @@ module Aranha
10
10
  items_data
11
11
  end
12
12
 
13
+ def item_data(item)
14
+ item
15
+ end
16
+
13
17
  def items_data
14
18
  count = 0
15
19
  @data ||= nokogiri.xpath(items_xpath).map do |m|
16
20
  count += 1
17
- node_parser.parse(m)
21
+ item_data(node_parser.parse(m))
18
22
  end
19
23
  rescue StandardError => e
20
24
  raise StandardError, "#{e.message} (Count: #{count})"
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module NumericSupport
12
+ def integer_value(node, xpath)
13
+ r = string_value(node, xpath)
14
+ return nil if r.blank?
15
+
16
+ m = /\d+/.match(r)
17
+ raise "Integer not found in \"#{r}\"" unless m
18
+
19
+ m[0].to_i
20
+ end
21
+
22
+ def integer_optional_value(node, xpath)
23
+ r = string_value(node, xpath)
24
+ m = /\d+/.match(r)
25
+ m ? m[0].to_i : nil
26
+ end
27
+
28
+ def float_value(node, xpath)
29
+ parse_float(node, xpath, true)
30
+ end
31
+
32
+ def float_optional_value(node, xpath)
33
+ parse_float(node, xpath, false)
34
+ end
35
+
36
+ def us_decimal_value(node, xpath)
37
+ parse_us_decimal(node, xpath, true)
38
+ end
39
+
40
+ def us_decimal_optional_value(node, xpath)
41
+ parse_us_decimal(node, xpath, false)
42
+ end
43
+
44
+ private
45
+
46
+ def parse_float(node, xpath, required)
47
+ s = string_value(node, xpath)
48
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
49
+ if m
50
+ m[0].delete('.').tr(',', '.').to_f
51
+ elsif required
52
+ raise "Float value not found in \"#{s}\""
53
+ end
54
+ end
55
+
56
+ def parse_us_decimal(node, xpath, required)
57
+ s = string_value(node, xpath)
58
+ m = /\d+(?:[\.\,](\d+))?/.match(s)
59
+ if m
60
+ m[0].delete(',').to_f
61
+ elsif required
62
+ raise "US decimal value not found in \"#{s}\""
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
+ module Aranha
7
+ module Parsers
8
+ module Html
9
+ module Node
10
+ class Default < ::Aranha::Parsers::Html::Node::Base
11
+ module StringSupport
12
+ def quoted_value(node, xpath)
13
+ s = string_value(node, xpath)
14
+ return '' unless s
15
+
16
+ m = /\"([^\"]+)\"/.match(s)
17
+ return m[1] if m
18
+
19
+ ''
20
+ end
21
+
22
+ def regxep(node, xpath, pattern)
23
+ s = string_value(node, xpath)
24
+ m = pattern.match(s)
25
+ return m if m
26
+
27
+ raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
28
+ end
29
+
30
+ def string_value(node, xpath)
31
+ if node.at_xpath(xpath)
32
+ sanitize_string(node.at_xpath(xpath).text)
33
+ else
34
+ ''
35
+ end
36
+ end
37
+
38
+ def string_recursive_value(node, xpath, required = true)
39
+ root = node.at_xpath(xpath)
40
+ if root.blank?
41
+ return nil unless required
42
+
43
+ raise "No node found (Xpath: #{xpath})"
44
+ end
45
+ result = string_recursive(root)
46
+ return result if result.present?
47
+ return nil unless required
48
+
49
+ raise "String blank (Xpath: #{xpath})"
50
+ end
51
+
52
+ def string_recursive_optional_value(node, xpath)
53
+ string_recursive_value(node, xpath, false)
54
+ end
55
+
56
+ private
57
+
58
+ def sanitize_string(obj)
59
+ obj.to_s.tr("\u00A0", ' ').strip
60
+ end
61
+
62
+ def string_recursive(node)
63
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
64
+
65
+ s = ''
66
+ node.children.each do |child|
67
+ child_s = string_recursive(child)
68
+ s += ' ' + child_s if child_s.present?
69
+ end
70
+ sanitize_string(s)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/source_address/fetch_content_error'
4
+ require 'aranha/parsers/source_address/hash_http_base'
5
+ require 'eac_ruby_utils/core_ext'
6
+ require 'faraday_middleware'
7
+ require 'yaml'
8
+
9
+ module Aranha
10
+ module Parsers
11
+ class SourceAddress
12
+ class HashHttpBase
13
+ class << self
14
+ def http_method
15
+ const_get 'HTTP_METHOD'
16
+ end
17
+
18
+ def valid_source?(source)
19
+ source.is_a?(::Hash) &&
20
+ source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
21
+ end
22
+ end
23
+
24
+ DEFAULT_BODY = ''
25
+ DEFAULT_FOLLOW_REDIRECT = true
26
+ DEFAULT_HEADERS = {}.freeze
27
+ DEFAULT_PARAMS = {}.freeze
28
+
29
+ common_constructor :source do
30
+ self.source = source.with_indifferent_access
31
+ end
32
+ compare_by :source
33
+
34
+ def body
35
+ param(:body, DEFAULT_BODY)
36
+ end
37
+
38
+ def follow_redirect?
39
+ param(:follow_redirect, DEFAULT_FOLLOW_REDIRECT)
40
+ end
41
+
42
+ def headers
43
+ param(:headers, DEFAULT_HEADERS)
44
+ end
45
+
46
+ def url
47
+ source.fetch(:url)
48
+ end
49
+
50
+ def serialize
51
+ source.to_yaml
52
+ end
53
+
54
+ # @return [Faraday]
55
+ def faraday_connection
56
+ ::Faraday.new do |f|
57
+ f.response :follow_redirects if follow_redirect?
58
+ end
59
+ end
60
+
61
+ def faraday_request
62
+ faraday_connection.send(self.class.http_method, url) do |req|
63
+ headers.if_present { |v| req.headers = v }
64
+ body.if_present { |v| req.body = v }
65
+ end
66
+ end
67
+
68
+ def content
69
+ req = faraday_request
70
+ return req.body if req.status == 200
71
+
72
+ raise ::Aranha::Parsers::SourceAddress::FetchContentError,
73
+ "Get #{url} returned #{req.status.to_i}"
74
+ end
75
+
76
+ def param(key, default_value)
77
+ source[key] || params[key] || default_value
78
+ end
79
+
80
+ def params
81
+ source[:params].if_present(DEFAULT_PARAMS)
82
+ end
83
+ end
84
+ end
85
+ end
86
+ end
@@ -1,24 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'aranha/parsers/source_address/hash_http_post'
3
+ require 'aranha/parsers/source_address/hash_http_base'
4
4
 
5
5
  module Aranha
6
6
  module Parsers
7
7
  class SourceAddress
8
- class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpPost
9
- class << self
10
- def valid_source?(source)
11
- source.is_a?(::Hash) &&
12
- source.with_indifferent_access[:method].to_s.downcase.strip == 'get'
13
- end
14
- end
15
-
16
- def content
17
- HTTPClient.new.get_content(
18
- source[:url],
19
- source[:params]
20
- )
21
- end
8
+ class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpBase
9
+ HTTP_METHOD = :get
22
10
  end
23
11
  end
24
12
  end
@@ -1,44 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'active_support/core_ext/hash/indifferent_access'
4
- require 'httpclient'
5
- require 'yaml'
3
+ require 'aranha/parsers/source_address/hash_http_base'
6
4
 
7
5
  module Aranha
8
6
  module Parsers
9
7
  class SourceAddress
10
- class HashHttpPost
11
- class << self
12
- def valid_source?(source)
13
- source.is_a?(::Hash) &&
14
- source.with_indifferent_access[:method].to_s.downcase.strip == 'post'
15
- end
16
- end
17
-
18
- attr_reader :source
19
-
20
- def initialize(source)
21
- @source = source.with_indifferent_access
22
- end
23
-
24
- def ==(other)
25
- self.class == other.class && source == other.source
26
- end
27
-
28
- def url
29
- source.fetch(:url)
30
- end
31
-
32
- def serialize
33
- source.to_yaml
34
- end
35
-
36
- def content
37
- HTTPClient.new.post_content(
38
- source[:url],
39
- source[:params].merge(follow_redirect: true)
40
- )
41
- end
8
+ class HashHttpPost < ::Aranha::Parsers::SourceAddress::HashHttpBase
9
+ HTTP_METHOD = :post
42
10
  end
43
11
  end
44
12
  end
@@ -3,6 +3,7 @@
3
3
  require 'addressable'
4
4
  require 'curb'
5
5
  require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
6
7
 
7
8
  module Aranha
8
9
  module Parsers
@@ -38,10 +39,12 @@ module Aranha
38
39
  end
39
40
 
40
41
  def content
41
- c = ::Curl::Easy.new(url)
42
- c.follow_location = true
43
- curl_perform(c)
44
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
45
48
 
46
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError,
47
50
  "Get #{url} returned #{c.status.to_i}"
@@ -50,18 +53,6 @@ module Aranha
50
53
  def serialize
51
54
  url
52
55
  end
53
-
54
- private
55
-
56
- def curl_perform(curl)
57
- unless curl.perform
58
- raise(::Aranha::Parsers::SourceAddress::FetchContentError,
59
- "Curl perform failed (URL: #{url})")
60
- end
61
- @final_url = curl.url
62
- rescue Curl::Err::CurlError => e
63
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
64
- end
65
56
  end
66
57
  end
67
58
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.9.0'
5
+ VERSION = '0.12.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-19 00:00:00.000000000 Z
11
+ date: 2022-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -58,34 +58,34 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.74'
61
+ version: '0.92'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 0.92.1
62
65
  type: :runtime
63
66
  prerelease: false
64
67
  version_requirements: !ruby/object:Gem::Requirement
65
68
  requirements:
66
69
  - - "~>"
67
70
  - !ruby/object:Gem::Version
68
- version: '0.74'
71
+ version: '0.92'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 0.92.1
69
75
  - !ruby/object:Gem::Dependency
70
- name: httpclient
76
+ name: faraday_middleware
71
77
  requirement: !ruby/object:Gem::Requirement
72
78
  requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '2.8'
76
79
  - - ">="
77
80
  - !ruby/object:Gem::Version
78
- version: 2.8.3
81
+ version: '0'
79
82
  type: :runtime
80
83
  prerelease: false
81
84
  version_requirements: !ruby/object:Gem::Requirement
82
85
  requirements:
83
- - - "~>"
84
- - !ruby/object:Gem::Version
85
- version: '2.8'
86
86
  - - ">="
87
87
  - !ruby/object:Gem::Version
88
- version: 2.8.3
88
+ version: '0'
89
89
  - !ruby/object:Gem::Dependency
90
90
  name: nokogiri
91
91
  requirement: !ruby/object:Gem::Requirement
@@ -156,6 +156,8 @@ files:
156
156
  - lib/aranha/parsers/html/node.rb
157
157
  - lib/aranha/parsers/html/node/base.rb
158
158
  - lib/aranha/parsers/html/node/default.rb
159
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
160
+ - lib/aranha/parsers/html/node/default/string_support.rb
159
161
  - lib/aranha/parsers/invalid_state_exception.rb
160
162
  - lib/aranha/parsers/json.rb
161
163
  - lib/aranha/parsers/json/base.rb
@@ -167,6 +169,7 @@ files:
167
169
  - lib/aranha/parsers/source_address.rb
168
170
  - lib/aranha/parsers/source_address/fetch_content_error.rb
169
171
  - lib/aranha/parsers/source_address/file.rb
172
+ - lib/aranha/parsers/source_address/hash_http_base.rb
170
173
  - lib/aranha/parsers/source_address/hash_http_get.rb
171
174
  - lib/aranha/parsers/source_address/hash_http_post.rb
172
175
  - lib/aranha/parsers/source_address/http_get.rb