aranha-parsers 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9bd9b607e666c0b3b4ef0a54f1289891d5f958bbc9d72ad8632e4fc75858fef4
4
- data.tar.gz: a462e661c567337abb7184ae5d1bc8555def424742b351e9bc83e9bad7bb88f3
3
+ metadata.gz: 51a24e93c2727f9cdb6e187eb6f2c15719591c70c0afa5e04be790cc0cdaba7d
4
+ data.tar.gz: b56ba940af2dadeacca0ca2dd67fce81d28af267b768d80927d157b5a94d8f88
5
5
  SHA512:
6
- metadata.gz: 85566b737941e252dbdf0f446f926c2cb9421d800c0547f207df3a5350d847bb61e303d9f278b3a8071182291814a64c6213d8f35a575ac93bb793adfeb463d3
7
- data.tar.gz: 6f969ca1107f35c3aa41e52288d9e41da1b5f8f3a2efd9c13adf151a8d09e07810f8818d6a05e88e2800f02ed1fcaffc051a5bc1a886ec25d1cb373c79b6b507
6
+ metadata.gz: 6951921ada5daca5e9a0f6fc70726b447da1308d11334fc8e92639953c6a12c617ded89b969e3561e38d3f4f697a25673c981c945ea79f55fc23d06d580f6726
7
+ data.tar.gz: 9a201a0e54ae982da2b4f78ba24c2acb188637c370a03e1b9e92cceab27ab0f9991b35ba948de27e6a5564b14a5f6b3ee8591fa4e16691e9c5949677030a84b4
@@ -7,6 +7,10 @@ module Aranha
7
7
  module Html
8
8
  class ItemList < Base
9
9
  def data
10
+ items_data
11
+ end
12
+
13
+ def items_data
10
14
  count = 0
11
15
  @data ||= nokogiri.xpath(items_xpath).map do |m|
12
16
  count += 1
@@ -12,7 +12,13 @@ module Aranha
12
12
  end
13
13
 
14
14
  def parse(node)
15
- Hash[fields.map { |f| [f[0], parse_field(node, f[2], f[1])] }]
15
+ fields.map do |f|
16
+ begin
17
+ [f[0], parse_field(node, f[2], f[1])]
18
+ rescue StandardError => e
19
+ raise StandardError, "#{e.message}\nFields: #{f}"
20
+ end
21
+ end.to_h
16
22
  end
17
23
 
18
24
  private
@@ -9,12 +9,28 @@ module Aranha
9
9
  class Default < ::Aranha::Parsers::Html::Node::Base
10
10
  def string_value(node, xpath)
11
11
  if node.at_xpath(xpath)
12
- node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
12
+ sanitize_string(node.at_xpath(xpath).text)
13
13
  else
14
14
  ''
15
15
  end
16
16
  end
17
17
 
18
+ def string_recursive_value(node, xpath, required = true)
19
+ root = node.at_xpath(xpath)
20
+ if root.blank?
21
+ return nil unless required
22
+ raise "No node found (Xpath: #{xpath})"
23
+ end
24
+ result = string_recursive(root)
25
+ return result unless result.blank?
26
+ return nil unless required
27
+ raise "String blank (Xpath: #{xpath})"
28
+ end
29
+
30
+ def string_recursive_optional_value(node, xpath)
31
+ string_recursive_value(node, xpath, false)
32
+ end
33
+
18
34
  def quoted_value(node, xpath)
19
35
  s = string_value(node, xpath)
20
36
  return '' unless s
@@ -81,11 +97,25 @@ module Aranha
81
97
  s = string_value(node, xpath)
82
98
  m = /\d+(?:[\.\,](\d+))?/.match(s)
83
99
  if m
84
- m[0].sub(',', '.').to_f
100
+ m[0].delete('.').tr(',', '.').to_f
85
101
  elsif required
86
102
  raise "Float value not found in \"#{s}\""
87
103
  end
88
104
  end
105
+
106
+ def sanitize_string(obj)
107
+ obj.to_s.tr("\u00A0", ' ').strip
108
+ end
109
+
110
+ def string_recursive(node)
111
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
112
+ s = ''
113
+ node.children.each do |child|
114
+ child_s = string_recursive(child)
115
+ s += ' ' + child_s if child_s.present?
116
+ end
117
+ sanitize_string(s)
118
+ end
89
119
  end
90
120
  end
91
121
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
- require 'net/http'
4
+ require 'curb'
5
5
 
6
6
  module Aranha
7
7
  module Parsers
@@ -32,29 +32,15 @@ module Aranha
32
32
  end
33
33
 
34
34
  def content
35
- content_fetch(url)
35
+ c = ::Curl::Easy.new(url)
36
+ raise "Curl perform failed (URL: #{url})" unless c.perform
37
+ return c.body_str if c.status.to_i == 200
38
+ raise "Get #{url} returned #{c.status.to_i}"
36
39
  end
37
40
 
38
41
  def serialize
39
42
  url
40
43
  end
41
-
42
- private
43
-
44
- def content_fetch(uri, limit = 10)
45
- raise 'too many HTTP redirects' if limit.zero?
46
-
47
- response = Net::HTTP.get_response(URI(uri))
48
-
49
- case response
50
- when Net::HTTPSuccess then
51
- response.body
52
- when Net::HTTPRedirection then
53
- content_fetch(self.class.location_uri(uri, response['location']), limit - 1)
54
- else
55
- response.value
56
- end
57
- end
58
44
  end
59
45
  end
60
46
  end
@@ -1,5 +1,5 @@
1
1
  module Aranha
2
2
  module Parsers
3
- VERSION = '0.1.1'.freeze
3
+ VERSION = '0.2.0'.freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-18 00:00:00.000000000 Z
11
+ date: 2019-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.7'
41
+ - !ruby/object:Gem::Dependency
42
+ name: curb
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.9.10
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.9.10
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: httpclient
43
57
  requirement: !ruby/object:Gem::Requirement