aranha-parsers 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51a24e93c2727f9cdb6e187eb6f2c15719591c70c0afa5e04be790cc0cdaba7d
|
4
|
+
data.tar.gz: b56ba940af2dadeacca0ca2dd67fce81d28af267b768d80927d157b5a94d8f88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6951921ada5daca5e9a0f6fc70726b447da1308d11334fc8e92639953c6a12c617ded89b969e3561e38d3f4f697a25673c981c945ea79f55fc23d06d580f6726
|
7
|
+
data.tar.gz: 9a201a0e54ae982da2b4f78ba24c2acb188637c370a03e1b9e92cceab27ab0f9991b35ba948de27e6a5564b14a5f6b3ee8591fa4e16691e9c5949677030a84b4
|
@@ -12,7 +12,13 @@ module Aranha
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def parse(node)
|
15
|
-
|
15
|
+
fields.map do |f|
|
16
|
+
begin
|
17
|
+
[f[0], parse_field(node, f[2], f[1])]
|
18
|
+
rescue StandardError => e
|
19
|
+
raise StandardError, "#{e.message}\nFields: #{f}"
|
20
|
+
end
|
21
|
+
end.to_h
|
16
22
|
end
|
17
23
|
|
18
24
|
private
|
@@ -9,12 +9,28 @@ module Aranha
|
|
9
9
|
class Default < ::Aranha::Parsers::Html::Node::Base
|
10
10
|
def string_value(node, xpath)
|
11
11
|
if node.at_xpath(xpath)
|
12
|
-
node.at_xpath(xpath).text
|
12
|
+
sanitize_string(node.at_xpath(xpath).text)
|
13
13
|
else
|
14
14
|
''
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
+
def string_recursive_value(node, xpath, required = true)
|
19
|
+
root = node.at_xpath(xpath)
|
20
|
+
if root.blank?
|
21
|
+
return nil unless required
|
22
|
+
raise "No node found (Xpath: #{xpath})"
|
23
|
+
end
|
24
|
+
result = string_recursive(root)
|
25
|
+
return result unless result.blank?
|
26
|
+
return nil unless required
|
27
|
+
raise "String blank (Xpath: #{xpath})"
|
28
|
+
end
|
29
|
+
|
30
|
+
def string_recursive_optional_value(node, xpath)
|
31
|
+
string_recursive_value(node, xpath, false)
|
32
|
+
end
|
33
|
+
|
18
34
|
def quoted_value(node, xpath)
|
19
35
|
s = string_value(node, xpath)
|
20
36
|
return '' unless s
|
@@ -81,11 +97,25 @@ module Aranha
|
|
81
97
|
s = string_value(node, xpath)
|
82
98
|
m = /\d+(?:[\.\,](\d+))?/.match(s)
|
83
99
|
if m
|
84
|
-
m[0].
|
100
|
+
m[0].delete('.').tr(',', '.').to_f
|
85
101
|
elsif required
|
86
102
|
raise "Float value not found in \"#{s}\""
|
87
103
|
end
|
88
104
|
end
|
105
|
+
|
106
|
+
def sanitize_string(obj)
|
107
|
+
obj.to_s.tr("\u00A0", ' ').strip
|
108
|
+
end
|
109
|
+
|
110
|
+
def string_recursive(node)
|
111
|
+
return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
|
112
|
+
s = ''
|
113
|
+
node.children.each do |child|
|
114
|
+
child_s = string_recursive(child)
|
115
|
+
s += ' ' + child_s if child_s.present?
|
116
|
+
end
|
117
|
+
sanitize_string(s)
|
118
|
+
end
|
89
119
|
end
|
90
120
|
end
|
91
121
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
-
require '
|
4
|
+
require 'curb'
|
5
5
|
|
6
6
|
module Aranha
|
7
7
|
module Parsers
|
@@ -32,29 +32,15 @@ module Aranha
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def content
|
35
|
-
|
35
|
+
c = ::Curl::Easy.new(url)
|
36
|
+
raise "Curl perform failed (URL: #{url})" unless c.perform
|
37
|
+
return c.body_str if c.status.to_i == 200
|
38
|
+
raise "Get #{url} returned #{c.status.to_i}"
|
36
39
|
end
|
37
40
|
|
38
41
|
def serialize
|
39
42
|
url
|
40
43
|
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def content_fetch(uri, limit = 10)
|
45
|
-
raise 'too many HTTP redirects' if limit.zero?
|
46
|
-
|
47
|
-
response = Net::HTTP.get_response(URI(uri))
|
48
|
-
|
49
|
-
case response
|
50
|
-
when Net::HTTPSuccess then
|
51
|
-
response.body
|
52
|
-
when Net::HTTPRedirection then
|
53
|
-
content_fetch(self.class.location_uri(uri, response['location']), limit - 1)
|
54
|
-
else
|
55
|
-
response.value
|
56
|
-
end
|
57
|
-
end
|
58
44
|
end
|
59
45
|
end
|
60
46
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '2.7'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: curb
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.9.10
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.9.10
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: httpclient
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|