aranha-parsers 0.9.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cb3631929ccd8b029ab5b097554f96f5eb0cc27dcf8767d3a8a97f050ee60b80
4
- data.tar.gz: f30f306452ac72997887550051954d290f7e6c321b694236fd9f49bc6fff5d4a
3
+ metadata.gz: 955f0df6b2a59762cbdd597dd17ae6608a880ee74f4547449ad06bbef4cf9f09
4
+ data.tar.gz: 95e891f8db0aa252f3e385956e783c48e08e42e68b3e6666491356eb2e60618a
5
5
  SHA512:
6
- metadata.gz: 35ebd10ceebadd29ade03ef0c5a75ed024e8d4377b19295861d8ef4c4e3bb4e091df5ff9f74398f523c98e0e3ee65eac1352bedc39eae033000cdc8561c5b749
7
- data.tar.gz: 68a7664bef2ffd163cf110f14779feb92305b90cfb6b23e8d254c4cf2ca6871233d64032704790dd30397383cc67fd5a5b5dc9e67a1d866f1365524707b5ebb5
6
+ metadata.gz: e5e7266770218b0698edb3111db90d797a24bb3e0f5ee54d5728a5810a87559a22a3a9f3d368bb28fc83fda8c6cbc04c99cab44c50ee25ea76e7550ea4753249
7
+ data.tar.gz: 66873b24e21f3c3e43a4ae49b82997c6e401a4e500274e2fc76e4ba91b013eae1a4f6b8641880f94896f1dab40bf23c86b24e226c14c2b92afe024d2d18aa230
@@ -37,21 +37,21 @@ module Aranha
37
37
  delegate :url, to: :source_address
38
38
 
39
39
  def content
40
- @content ||= begin
41
- s = source_address.content
42
- log_content(s)
43
- s
44
- end
40
+ @content ||= log_content(source_address_content)
45
41
  end
46
42
 
43
+ # @return [String]
44
+ delegate :content, to: :source_address, prefix: true
45
+
47
46
  private
48
47
 
48
+ # @return [String]
49
49
  def log_content(content, suffix = '')
50
50
  path = log_file(suffix)
51
51
 
52
- return unless path
52
+ File.open(path, 'wb') { |file| file.write(content) } if path
53
53
 
54
- File.open(path, 'wb') { |file| file.write(content) }
54
+ content
55
55
  end
56
56
 
57
57
  def log_file(suffix)
@@ -10,11 +10,15 @@ module Aranha
10
10
  items_data
11
11
  end
12
12
 
13
+ def item_data(item)
14
+ item
15
+ end
16
+
13
17
  def items_data
14
18
  count = 0
15
19
  @data ||= nokogiri.xpath(items_xpath).map do |m|
16
20
  count += 1
17
- node_parser.parse(m)
21
+ item_data(node_parser.parse(m))
18
22
  end
19
23
  rescue StandardError => e
20
24
  raise StandardError, "#{e.message} (Count: #{count})"
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
module Aranha
  module Parsers
    module Html
      module Node
        class Default < ::Aranha::Parsers::Html::Node::Base
          # Helpers that read the text found at an XPath and convert it to a number.
          #
          # Every method obtains its text through +string_value(node, xpath)+, which
          # is not defined in this module; the object mixing it in must provide it.
          module NumericSupport
            # First run of decimal digits in a string.
            INTEGER_PATTERN = /\d+/.freeze

            # Digits with optional "."-separated groups followed by an optional
            # ","-separated fraction, e.g. "1.234,56".
            DOT_GROUPED_DECIMAL_PATTERN = /\d+(?:\.\d+)*(?:,\d+)?/.freeze

            # Digits with optional ","-separated groups followed by an optional
            # "."-separated fraction, e.g. "1,234.56".
            COMMA_GROUPED_DECIMAL_PATTERN = /\d+(?:,\d+)*(?:\.\d+)?/.freeze

            private_constant :INTEGER_PATTERN, :DOT_GROUPED_DECIMAL_PATTERN,
                             :COMMA_GROUPED_DECIMAL_PATTERN

            # Reads the first integer found in the text at +xpath+.
            #
            # @return [Integer, nil] +nil+ when the text is blank.
            # @raise [RuntimeError] when the text is not blank but contains no digits.
            def integer_value(node, xpath)
              text = string_value(node, xpath)
              return nil if text.blank?

              match = INTEGER_PATTERN.match(text)
              raise "Integer not found in \"#{text}\"" unless match

              match[0].to_i
            end

            # Same as {#integer_value}, but never raises.
            #
            # @return [Integer, nil] +nil+ when the text contains no digits.
            def integer_optional_value(node, xpath)
              match = INTEGER_PATTERN.match(string_value(node, xpath))
              match ? match[0].to_i : nil
            end

            # Reads a number written with "." between groups and "," before the
            # fraction ("1.234,56" => 1234.56).
            #
            # @return [Float]
            # @raise [RuntimeError] when no number is found.
            def float_value(node, xpath)
              parse_float(node, xpath, true)
            end

            # Same as {#float_value}, but returns +nil+ when no number is found.
            #
            # @return [Float, nil]
            def float_optional_value(node, xpath)
              parse_float(node, xpath, false)
            end

            # Reads a number written with "," between groups and "." before the
            # fraction ("1,234.56" => 1234.56).
            #
            # @return [Float]
            # @raise [RuntimeError] when no number is found.
            def us_decimal_value(node, xpath)
              parse_us_decimal(node, xpath, true)
            end

            # Same as {#us_decimal_value}, but returns +nil+ when no number is found.
            #
            # @return [Float, nil]
            def us_decimal_optional_value(node, xpath)
              parse_us_decimal(node, xpath, false)
            end

            private

            # Dots are dropped and the comma becomes the decimal point. The pattern
            # accepts any number of dot groups so that "1.234,56" is read in full
            # instead of being cut off after the first separator.
            def parse_float(node, xpath, required)
              text = string_value(node, xpath)
              match = DOT_GROUPED_DECIMAL_PATTERN.match(text)
              return match[0].delete('.').tr(',', '.').to_f if match

              raise "Float value not found in \"#{text}\"" if required
            end

            # Commas are dropped and the dot is kept as the decimal point. The pattern
            # accepts any number of comma groups so that "1,234.56" keeps its fraction
            # instead of being cut off after the first separator.
            def parse_us_decimal(node, xpath, required)
              text = string_value(node, xpath)
              match = COMMA_GROUPED_DECIMAL_PATTERN.match(text)
              return match[0].delete(',').to_f if match

              raise "US decimal value not found in \"#{text}\"" if required
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
5
+
6
module Aranha
  module Parsers
    module Html
      module Node
        class Default < ::Aranha::Parsers::Html::Node::Base
          # Helpers that extract text from the node found at an XPath.
          module StringSupport
            # Returns the text between the first pair of double quotes found in the
            # string at +xpath+.
            #
            # @return [String] the quoted text, or '' when there is none.
            def quoted_value(node, xpath)
              match = /"([^"]+)"/.match(string_value(node, xpath))
              match ? match[1] : ''
            end

            # Matches +pattern+ against the string at +xpath+.
            #
            # NOTE(review): the method name looks like a transposition of "regexp";
            # it is kept unchanged because renaming it would break callers.
            #
            # @param pattern [#match] usually a Regexp.
            # @return [MatchData]
            # @raise [RuntimeError] when the pattern does not match.
            def regxep(node, xpath, pattern)
              text = string_value(node, xpath)
              match = pattern.match(text)
              return match if match

              raise "Pattern \"#{pattern}\" not found in string \"#{text}\""
            end

            # Sanitized text of the first node matching +xpath+.
            #
            # @return [String] '' when nothing matches.
            def string_value(node, xpath)
              # Evaluate the XPath once and reuse the result.
              found = node.at_xpath(xpath)
              found ? sanitize_string(found.text) : ''
            end

            # Text of the first node matching +xpath+, assembled by visiting its
            # descendants and joining their non-blank texts with a single space.
            #
            # @param required [Boolean] when true, a missing node or blank text raises.
            # @return [String, nil] +nil+ only when +required+ is false and there is
            #   no node or no text.
            # @raise [RuntimeError] when +required+ is true and there is no node or
            #   the text is blank.
            def string_recursive_value(node, xpath, required = true)
              root = node.at_xpath(xpath)
              if root.blank?
                return nil unless required

                raise "No node found (Xpath: #{xpath})"
              end
              result = string_recursive(root)
              return result if result.present?
              return nil unless required

              raise "String blank (Xpath: #{xpath})"
            end

            # Same as {#string_recursive_value} with +required+ set to false.
            #
            # @return [String, nil]
            def string_recursive_optional_value(node, xpath)
              string_recursive_value(node, xpath, false)
            end

            private

            # Converts +obj+ to a String, replaces non-breaking spaces (U+00A0) with
            # regular spaces and strips the ends.
            def sanitize_string(obj)
              obj.to_s.tr("\u00A0", ' ').strip
            end

            # Text nodes yield their own sanitized text; any other node yields the
            # non-blank results of its children joined by a single space.
            def string_recursive(node)
              return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)

              parts = node.children.map { |child| string_recursive(child) }.select(&:present?)
              sanitize_string(parts.join(' '))
            end
          end
        end
      end
    end
  end
end
@@ -1,71 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/html/node/base'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
7
8
  module Html
8
9
  module Node
9
10
  class Default < ::Aranha::Parsers::Html::Node::Base
10
- def string_value(node, xpath)
11
- if node.at_xpath(xpath)
12
- sanitize_string(node.at_xpath(xpath).text)
13
- else
14
- ''
15
- end
16
- end
17
-
18
- def string_recursive_value(node, xpath, required = true)
19
- root = node.at_xpath(xpath)
20
- if root.blank?
21
- return nil unless required
22
-
23
- raise "No node found (Xpath: #{xpath})"
24
- end
25
- result = string_recursive(root)
26
- return result if result.present?
27
- return nil unless required
28
-
29
- raise "String blank (Xpath: #{xpath})"
30
- end
31
-
32
- def string_recursive_optional_value(node, xpath)
33
- string_recursive_value(node, xpath, false)
34
- end
35
-
36
- def quoted_value(node, xpath)
37
- s = string_value(node, xpath)
38
- return '' unless s
39
-
40
- m = /\"([^\"]+)\"/.match(s)
41
- return m[1] if m
42
-
43
- ''
44
- end
45
-
46
- def integer_value(node, xpath)
47
- r = string_value(node, xpath)
48
- return nil if r.blank?
49
-
50
- m = /\d+/.match(r)
51
- raise "Integer not found in \"#{r}\"" unless m
52
-
53
- m[0].to_i
54
- end
55
-
56
- def integer_optional_value(node, xpath)
57
- r = string_value(node, xpath)
58
- m = /\d+/.match(r)
59
- m ? m[0].to_i : nil
60
- end
61
-
62
- def float_value(node, xpath)
63
- parse_float(node, xpath, true)
64
- end
65
-
66
- def float_optional_value(node, xpath)
67
- parse_float(node, xpath, false)
68
- end
11
+ require_sub __FILE__, include_modules: true
69
12
 
70
13
  def array_value(node, xpath)
71
14
  r = node.xpath(xpath).map { |n| n.text.strip }
@@ -84,41 +27,6 @@ module Aranha
84
27
  m = /(\d+) m/.match(join_value(node, xpath))
85
28
  m ? m[1].to_i : nil
86
29
  end
87
-
88
- def regxep(node, xpath, pattern)
89
- s = string_value(node, xpath)
90
- m = pattern.match(s)
91
- return m if m
92
-
93
- raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
94
- end
95
-
96
- private
97
-
98
- def parse_float(node, xpath, required)
99
- s = string_value(node, xpath)
100
- m = /\d+(?:[\.\,](\d+))?/.match(s)
101
- if m
102
- m[0].delete('.').tr(',', '.').to_f
103
- elsif required
104
- raise "Float value not found in \"#{s}\""
105
- end
106
- end
107
-
108
- def sanitize_string(obj)
109
- obj.to_s.tr("\u00A0", ' ').strip
110
- end
111
-
112
- def string_recursive(node)
113
- return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
114
-
115
- s = ''
116
- node.children.each do |child|
117
- child_s = string_recursive(child)
118
- s += ' ' + child_s if child_s.present?
119
- end
120
- sanitize_string(s)
121
- end
122
30
  end
123
31
  end
124
32
  end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/parsers/source_address/fetch_content_error'
4
+ require 'aranha/parsers/source_address/hash_http_base'
5
+ require 'eac_ruby_utils/core_ext'
6
+ require 'faraday_middleware'
7
+ require 'yaml'
8
+
9
module Aranha
  module Parsers
    class SourceAddress
      # Base for source addresses described by a Hash (+:method+, +:url+ and
      # optional +:body+, +:headers+, +:follow_redirect+, +:params+) and fetched
      # through Faraday. Subclasses declare the HTTP verb in an +HTTP_METHOD+
      # constant.
      class HashHttpBase
        class << self
          # @return [Symbol] the +HTTP_METHOD+ constant of the receiving class.
          def http_method
            const_get(:HTTP_METHOD)
          end

          # @return [Boolean] true when +source+ is a Hash whose +:method+ entry
          #   (case and surrounding whitespace ignored) names this class's verb.
          def valid_source?(source)
            return false unless source.is_a?(::Hash)

            source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
          end
        end

        DEFAULT_BODY = ''
        DEFAULT_FOLLOW_REDIRECT = true
        DEFAULT_HEADERS = {}.freeze
        DEFAULT_PARAMS = {}.freeze

        # NOTE(review): common_constructor and compare_by are not defined here;
        # presumably they come from eac_ruby_utils and provide the constructor,
        # the +source+ accessor and equality by +source+ - confirm.
        common_constructor :source do
          self.source = source.with_indifferent_access
        end
        compare_by :source

        # @return [Object] request body, DEFAULT_BODY when not configured.
        def body
          param(:body, DEFAULT_BODY)
        end

        # @return [Boolean] whether redirects are followed.
        def follow_redirect?
          param(:follow_redirect, DEFAULT_FOLLOW_REDIRECT)
        end

        # @return [Hash] request headers, DEFAULT_HEADERS when not configured.
        def headers
          param(:headers, DEFAULT_HEADERS)
        end

        # @return [String]
        # @raise [KeyError] when the source has no +:url+ entry.
        def url
          source.fetch(:url)
        end

        # @return [String] the source Hash dumped as YAML.
        def serialize
          source.to_yaml
        end

        # @return [Faraday::Connection]
        def faraday_connection
          ::Faraday.new do |f|
            f.response :follow_redirects if follow_redirect?
          end
        end

        # Performs the request with this class's HTTP verb.
        #
        # @return [Faraday::Response]
        def faraday_request
          faraday_connection.public_send(self.class.http_method, url) do |req|
            headers.if_present { |v| req.headers = v }
            body.if_present { |v| req.body = v }
          end
        end

        # @return [String] the response body.
        # @raise [Aranha::Parsers::SourceAddress::FetchContentError] when the
        #   response status is not 200.
        def content
          response = faraday_request
          return response.body if response.status == 200

          raise ::Aranha::Parsers::SourceAddress::FetchContentError,
                "Get #{url} returned #{response.status.to_i}"
        end

        # Looks +key+ up in the source, then in its +:params+ entry, then falls
        # back to +default_value+. Only +nil+ counts as "not set", so an explicit
        # +false+ (e.g. +follow_redirect: false+) is honoured instead of being
        # overridden by the default.
        def param(key, default_value)
          value = source[key]
          value = params[key] if value.nil?
          value.nil? ? default_value : value
        end

        # @return [Hash] the +:params+ entry of the source, DEFAULT_PARAMS when it
        #   is missing or blank.
        def params
          source[:params].if_present(DEFAULT_PARAMS)
        end
      end
    end
  end
end
@@ -1,24 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'aranha/parsers/source_address/hash_http_post'
3
+ require 'aranha/parsers/source_address/hash_http_base'
4
4
 
5
5
  module Aranha
6
6
  module Parsers
7
7
  class SourceAddress
8
- class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpPost
9
- class << self
10
- def valid_source?(source)
11
- source.is_a?(::Hash) &&
12
- source.with_indifferent_access[:method].to_s.downcase.strip == 'get'
13
- end
14
- end
15
-
16
- def content
17
- HTTPClient.new.get_content(
18
- source[:url],
19
- source[:params]
20
- )
21
- end
8
+ class HashHttpGet < ::Aranha::Parsers::SourceAddress::HashHttpBase
9
+ HTTP_METHOD = :get
22
10
  end
23
11
  end
24
12
  end
@@ -1,44 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'active_support/core_ext/hash/indifferent_access'
4
- require 'httpclient'
5
- require 'yaml'
3
+ require 'aranha/parsers/source_address/hash_http_base'
6
4
 
7
5
  module Aranha
8
6
  module Parsers
9
7
  class SourceAddress
10
- class HashHttpPost
11
- class << self
12
- def valid_source?(source)
13
- source.is_a?(::Hash) &&
14
- source.with_indifferent_access[:method].to_s.downcase.strip == 'post'
15
- end
16
- end
17
-
18
- attr_reader :source
19
-
20
- def initialize(source)
21
- @source = source.with_indifferent_access
22
- end
23
-
24
- def ==(other)
25
- self.class == other.class && source == other.source
26
- end
27
-
28
- def url
29
- source.fetch(:url)
30
- end
31
-
32
- def serialize
33
- source.to_yaml
34
- end
35
-
36
- def content
37
- HTTPClient.new.post_content(
38
- source[:url],
39
- source[:params].merge(follow_redirect: true)
40
- )
41
- end
8
+ class HashHttpPost < ::Aranha::Parsers::SourceAddress::HashHttpBase
9
+ HTTP_METHOD = :post
42
10
  end
43
11
  end
44
12
  end
@@ -3,6 +3,7 @@
3
3
  require 'addressable'
4
4
  require 'curb'
5
5
  require 'aranha/parsers/source_address/fetch_content_error'
6
+ require 'faraday_middleware'
6
7
 
7
8
  module Aranha
8
9
  module Parsers
@@ -38,10 +39,12 @@ module Aranha
38
39
  end
39
40
 
40
41
  def content
41
- c = ::Curl::Easy.new(url)
42
- c.follow_location = true
43
- curl_perform(c)
44
- return c.body_str if c.status.to_i == 200
42
+ conn = ::Faraday.new do |f|
43
+ f.request :retry # retry transient failures
44
+ f.response :follow_redirects # follow redirects
45
+ end
46
+ c = conn.get(url)
47
+ return c.body if c.status == 200
45
48
 
46
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError,
47
50
  "Get #{url} returned #{c.status.to_i}"
@@ -50,18 +53,6 @@ module Aranha
50
53
  def serialize
51
54
  url
52
55
  end
53
-
54
- private
55
-
56
- def curl_perform(curl)
57
- unless curl.perform
58
- raise(::Aranha::Parsers::SourceAddress::FetchContentError,
59
- "Curl perform failed (URL: #{url})")
60
- end
61
- @final_url = curl.url
62
- rescue Curl::Err::CurlError => e
63
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, "CURL error: #{e.class.name}"
64
- end
65
56
  end
66
57
  end
67
58
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.9.0'
5
+ VERSION = '0.12.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-19 00:00:00.000000000 Z
11
+ date: 2022-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -58,34 +58,34 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.74'
61
+ version: '0.92'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 0.92.1
62
65
  type: :runtime
63
66
  prerelease: false
64
67
  version_requirements: !ruby/object:Gem::Requirement
65
68
  requirements:
66
69
  - - "~>"
67
70
  - !ruby/object:Gem::Version
68
- version: '0.74'
71
+ version: '0.92'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 0.92.1
69
75
  - !ruby/object:Gem::Dependency
70
- name: httpclient
76
+ name: faraday_middleware
71
77
  requirement: !ruby/object:Gem::Requirement
72
78
  requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '2.8'
76
79
  - - ">="
77
80
  - !ruby/object:Gem::Version
78
- version: 2.8.3
81
+ version: '0'
79
82
  type: :runtime
80
83
  prerelease: false
81
84
  version_requirements: !ruby/object:Gem::Requirement
82
85
  requirements:
83
- - - "~>"
84
- - !ruby/object:Gem::Version
85
- version: '2.8'
86
86
  - - ">="
87
87
  - !ruby/object:Gem::Version
88
- version: 2.8.3
88
+ version: '0'
89
89
  - !ruby/object:Gem::Dependency
90
90
  name: nokogiri
91
91
  requirement: !ruby/object:Gem::Requirement
@@ -156,6 +156,8 @@ files:
156
156
  - lib/aranha/parsers/html/node.rb
157
157
  - lib/aranha/parsers/html/node/base.rb
158
158
  - lib/aranha/parsers/html/node/default.rb
159
+ - lib/aranha/parsers/html/node/default/numeric_support.rb
160
+ - lib/aranha/parsers/html/node/default/string_support.rb
159
161
  - lib/aranha/parsers/invalid_state_exception.rb
160
162
  - lib/aranha/parsers/json.rb
161
163
  - lib/aranha/parsers/json/base.rb
@@ -167,6 +169,7 @@ files:
167
169
  - lib/aranha/parsers/source_address.rb
168
170
  - lib/aranha/parsers/source_address/fetch_content_error.rb
169
171
  - lib/aranha/parsers/source_address/file.rb
172
+ - lib/aranha/parsers/source_address/hash_http_base.rb
170
173
  - lib/aranha/parsers/source_address/hash_http_get.rb
171
174
  - lib/aranha/parsers/source_address/hash_http_post.rb
172
175
  - lib/aranha/parsers/source_address/http_get.rb