aranha-parsers 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9bd9b607e666c0b3b4ef0a54f1289891d5f958bbc9d72ad8632e4fc75858fef4
4
- data.tar.gz: a462e661c567337abb7184ae5d1bc8555def424742b351e9bc83e9bad7bb88f3
3
+ metadata.gz: 51a24e93c2727f9cdb6e187eb6f2c15719591c70c0afa5e04be790cc0cdaba7d
4
+ data.tar.gz: b56ba940af2dadeacca0ca2dd67fce81d28af267b768d80927d157b5a94d8f88
5
5
  SHA512:
6
- metadata.gz: 85566b737941e252dbdf0f446f926c2cb9421d800c0547f207df3a5350d847bb61e303d9f278b3a8071182291814a64c6213d8f35a575ac93bb793adfeb463d3
7
- data.tar.gz: 6f969ca1107f35c3aa41e52288d9e41da1b5f8f3a2efd9c13adf151a8d09e07810f8818d6a05e88e2800f02ed1fcaffc051a5bc1a886ec25d1cb373c79b6b507
6
+ metadata.gz: 6951921ada5daca5e9a0f6fc70726b447da1308d11334fc8e92639953c6a12c617ded89b969e3561e38d3f4f697a25673c981c945ea79f55fc23d06d580f6726
7
+ data.tar.gz: 9a201a0e54ae982da2b4f78ba24c2acb188637c370a03e1b9e92cceab27ab0f9991b35ba948de27e6a5564b14a5f6b3ee8591fa4e16691e9c5949677030a84b4
@@ -7,6 +7,10 @@ module Aranha
7
7
  module Html
8
8
  class ItemList < Base
9
9
  def data
10
+ items_data
11
+ end
12
+
13
+ def items_data
10
14
  count = 0
11
15
  @data ||= nokogiri.xpath(items_xpath).map do |m|
12
16
  count += 1
@@ -12,7 +12,13 @@ module Aranha
12
12
  end
13
13
 
14
14
  def parse(node)
15
- Hash[fields.map { |f| [f[0], parse_field(node, f[2], f[1])] }]
15
+ fields.map do |f|
16
+ begin
17
+ [f[0], parse_field(node, f[2], f[1])]
18
+ rescue StandardError => e
19
+ raise StandardError, "#{e.message}\nFields: #{f}"
20
+ end
21
+ end.to_h
16
22
  end
17
23
 
18
24
  private
@@ -9,12 +9,28 @@ module Aranha
9
9
  class Default < ::Aranha::Parsers::Html::Node::Base
10
10
  def string_value(node, xpath)
11
11
  if node.at_xpath(xpath)
12
- node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
12
+ sanitize_string(node.at_xpath(xpath).text)
13
13
  else
14
14
  ''
15
15
  end
16
16
  end
17
17
 
18
+ def string_recursive_value(node, xpath, required = true)
19
+ root = node.at_xpath(xpath)
20
+ if root.blank?
21
+ return nil unless required
22
+ raise "No node found (Xpath: #{xpath})"
23
+ end
24
+ result = string_recursive(root)
25
+ return result unless result.blank?
26
+ return nil unless required
27
+ raise "String blank (Xpath: #{xpath})"
28
+ end
29
+
30
+ def string_recursive_optional_value(node, xpath)
31
+ string_recursive_value(node, xpath, false)
32
+ end
33
+
18
34
  def quoted_value(node, xpath)
19
35
  s = string_value(node, xpath)
20
36
  return '' unless s
@@ -81,11 +97,25 @@ module Aranha
81
97
  s = string_value(node, xpath)
82
98
  m = /\d+(?:[\.\,](\d+))?/.match(s)
83
99
  if m
84
- m[0].sub(',', '.').to_f
100
+ m[0].delete('.').tr(',', '.').to_f
85
101
  elsif required
86
102
  raise "Float value not found in \"#{s}\""
87
103
  end
88
104
  end
105
+
106
+ def sanitize_string(obj)
107
+ obj.to_s.tr("\u00A0", ' ').strip
108
+ end
109
+
110
+ def string_recursive(node)
111
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
112
+ s = ''
113
+ node.children.each do |child|
114
+ child_s = string_recursive(child)
115
+ s += ' ' + child_s if child_s.present?
116
+ end
117
+ sanitize_string(s)
118
+ end
89
119
  end
90
120
  end
91
121
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
- require 'net/http'
4
+ require 'curb'
5
5
 
6
6
  module Aranha
7
7
  module Parsers
@@ -32,29 +32,15 @@ module Aranha
32
32
  end
33
33
 
34
34
  def content
35
- content_fetch(url)
35
+ c = ::Curl::Easy.new(url)
36
+ raise "Curl perform failed (URL: #{url})" unless c.perform
37
+ return c.body_str if c.status.to_i == 200
38
+ raise "Get #{url} returned #{c.status.to_i}"
36
39
  end
37
40
 
38
41
  def serialize
39
42
  url
40
43
  end
41
-
42
- private
43
-
44
- def content_fetch(uri, limit = 10)
45
- raise 'too many HTTP redirects' if limit.zero?
46
-
47
- response = Net::HTTP.get_response(URI(uri))
48
-
49
- case response
50
- when Net::HTTPSuccess then
51
- response.body
52
- when Net::HTTPRedirection then
53
- content_fetch(self.class.location_uri(uri, response['location']), limit - 1)
54
- else
55
- response.value
56
- end
57
- end
58
44
  end
59
45
  end
60
46
  end
@@ -1,5 +1,5 @@
1
1
  module Aranha
2
2
  module Parsers
3
- VERSION = '0.1.1'.freeze
3
+ VERSION = '0.2.0'.freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-18 00:00:00.000000000 Z
11
+ date: 2019-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.7'
41
+ - !ruby/object:Gem::Dependency
42
+ name: curb
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.9.10
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.9.10
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: httpclient
43
57
  requirement: !ruby/object:Gem::Requirement