xml_data_extractor 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +36 -1
- data/lib/src/extract/hash_builder.rb +1 -1
- data/lib/src/extract/unescape.rb +12 -0
- data/lib/src/extract/value_builder.rb +4 -0
- data/lib/src/extractor.rb +16 -6
- data/xml_data_extractor.gemspec +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51d1d8f39ffa4227b1de6b3a51bd19f0bd8dd9d0d3d5819e6f43e15db85b048e
|
4
|
+
data.tar.gz: bb1785cea058ed551ba6cee8e9d64e041e0dbaf541dc5ce95895f55a74a57553
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e4aeb8c71ea104e1a6dc3fc3e5e2befadaade3e404e055ef7c569ad6b17e948be2a11eea373d5a84a0ceba6b549a3775516af82ff88a583c1ba8ac1b908debf
|
7
|
+
data.tar.gz: c1316f2a89e782c0916e186d55ee81393fec850d7543d7301f44f15daa461bb752caff8136bb38dce5a30cbd5970bcd58ef78672de0cc7b3de2139d4e7e3b4ee
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -171,7 +171,6 @@ schemas:
|
|
171
171
|
within: info/movie_data
|
172
172
|
title: original_title
|
173
173
|
actor: main_actor
|
174
|
-
|
175
174
|
```
|
176
175
|
```xml
|
177
176
|
<xml>
|
@@ -187,6 +186,42 @@ schemas:
|
|
187
186
|
{ movie: { title: "The Irishman", actor: "Robert De Niro" } }
|
188
187
|
```
|
189
188
|
|
189
|
+
#### unescape
|
190
|
+
|
191
|
+
This option is useful when you have XML or HTML embedded inside a tag (for example, in CDATA elements) and you need to unescape it first in order to parse its content:
|
192
|
+
|
193
|
+
```yml
|
194
|
+
schemas:
|
195
|
+
movie:
|
196
|
+
unescape: response
|
197
|
+
title: response/original_title
|
198
|
+
actor: response/main_actor
|
199
|
+
|
200
|
+
```
|
201
|
+
|
202
|
+
```xml
|
203
|
+
<xml>
|
204
|
+
<response>
|
205
|
+
&lt;original_title&gt;The Irishman&lt;/original_title&gt;&lt;main_actor&gt;Robert De Niro&lt;/main_actor&gt;
|
206
|
+
</response>
|
207
|
+
</xml>
|
208
|
+
```
|
209
|
+
|
210
|
+
This XML will be turned into this one during the parsing:
|
211
|
+
|
212
|
+
```xml
|
213
|
+
<xml>
|
214
|
+
<response>
|
215
|
+
<original_title>The Irishman</original_title>
|
216
|
+
<main_actor>Robert De Niro</main_actor>
|
217
|
+
</response>
|
218
|
+
</xml>
|
219
|
+
```
|
220
|
+
|
221
|
+
```ruby
|
222
|
+
{ movie: { title: "The Irishman", actor: "Robert De Niro" } }
|
223
|
+
```
|
224
|
+
|
190
225
|
#### array_of
|
191
226
|
|
192
227
|
Defines the path to an XML collection, which will be looped over, generating an array of hashes:
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Extract
|
2
|
+
class Unescape < Base
|
3
|
+
def unescape!
|
4
|
+
unescape_tag = node.props[:unescape]
|
5
|
+
|
6
|
+
paths_to_unescape = extractor.paths_of(node.path, unescape_tag)
|
7
|
+
return if paths_to_unescape.empty?
|
8
|
+
|
9
|
+
paths_to_unescape.each { |path| extractor.unescape!(path) }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
@@ -6,6 +6,7 @@ require_relative "string_value"
|
|
6
6
|
require_relative "value_builder"
|
7
7
|
require_relative "within"
|
8
8
|
require_relative "expression"
|
9
|
+
require_relative "unescape"
|
9
10
|
|
10
11
|
module Extract
|
11
12
|
class ValueBuilder < Base
|
@@ -24,6 +25,9 @@ module Extract
|
|
24
25
|
|
25
26
|
def value_for_hash
|
26
27
|
props = node.props
|
28
|
+
|
29
|
+
Unescape.new(node, extractor).unescape! if props[:unescape]
|
30
|
+
|
27
31
|
fixed_value = props[:fixed]
|
28
32
|
return fixed_value if fixed_value
|
29
33
|
return ArrayOf.new(node, extractor).value if props[:array_of]
|
data/lib/src/extractor.rb
CHANGED
@@ -41,7 +41,7 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
-
class NodeParamsExtractor < Struct.new(:node)
|
44
|
+
class NodeParamsExtractor < Struct.new(:node)
|
45
45
|
def extract
|
46
46
|
[node.path, *node.props.values_at(:in_parent, :path, :link, :attr)]
|
47
47
|
end
|
@@ -49,7 +49,7 @@ end
|
|
49
49
|
|
50
50
|
class NodeExtractor
|
51
51
|
def initialize(xml)
|
52
|
-
@xml = Nokogiri::XML(
|
52
|
+
@xml = Nokogiri::XML(xml)
|
53
53
|
@xml.remove_namespaces!
|
54
54
|
end
|
55
55
|
|
@@ -59,12 +59,18 @@ class NodeExtractor
|
|
59
59
|
nil
|
60
60
|
end
|
61
61
|
|
62
|
-
|
62
|
+
def unescape!(path)
|
63
|
+
node = extract(path)
|
64
|
+
return if node.blank?
|
65
|
+
|
66
|
+
first_node = node.first
|
67
|
+
return if first_node.elements.present?
|
63
68
|
|
64
|
-
|
65
|
-
CGI.unescapeHTML(xml).gsub(/<br>|<\/br>| /, { " " => " ", "<br>" => "", "</br>" => "" })
|
69
|
+
first_node.children = Nokogiri::XML.fragment(first_node.content).children
|
66
70
|
end
|
67
71
|
|
72
|
+
private
|
73
|
+
|
68
74
|
attr_reader :xml
|
69
75
|
end
|
70
76
|
|
@@ -193,7 +199,11 @@ class Extractor
|
|
193
199
|
end
|
194
200
|
|
195
201
|
value = path_value(path, tag, attribute)
|
196
|
-
format_value(value, node.props)
|
202
|
+
format_value(value, node.props)
|
203
|
+
end
|
204
|
+
|
205
|
+
def unescape!(path)
|
206
|
+
node_extractor.unescape!(path)
|
197
207
|
end
|
198
208
|
|
199
209
|
def format_value(value, props)
|
data/xml_data_extractor.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xml_data_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Fernando Almeida
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-11-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -75,6 +75,7 @@ files:
|
|
75
75
|
- lib/src/extract/expression.rb
|
76
76
|
- lib/src/extract/hash_builder.rb
|
77
77
|
- lib/src/extract/string_value.rb
|
78
|
+
- lib/src/extract/unescape.rb
|
78
79
|
- lib/src/extract/value_builder.rb
|
79
80
|
- lib/src/extract/within.rb
|
80
81
|
- lib/src/extractor.rb
|