xml_data_extractor 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa8684820cc0394f1d83bc06c5eaa3378e6a7cc12e33c31b166018edea86d714
4
- data.tar.gz: 57ef32a54ff735afd93e25836b44773a8d79ad922fc3837437cb10beead43e1f
3
+ metadata.gz: 51d1d8f39ffa4227b1de6b3a51bd19f0bd8dd9d0d3d5819e6f43e15db85b048e
4
+ data.tar.gz: bb1785cea058ed551ba6cee8e9d64e041e0dbaf541dc5ce95895f55a74a57553
5
5
  SHA512:
6
- metadata.gz: 043ab0237d908959e98ebf371d36fc643c85d3df4fc46c213b2f8ff6e8ed1118b373c27b80685e4f4c486667d27b73476d44368aacec9e2a186ae24e0e9d8dcc
7
- data.tar.gz: 52baa3e42b3d65999024f89949152f2f98cf6049635aa37afc204ace50e8dbe75b7e40bcbf467c87309c487d50f1b1bf6e9e4849caa611fe290c218adee66685
6
+ metadata.gz: 8e4aeb8c71ea104e1a6dc3fc3e5e2befadaade3e404e055ef7c569ad6b17e948be2a11eea373d5a84a0ceba6b549a3775516af82ff88a583c1ba8ac1b908debf
7
+ data.tar.gz: c1316f2a89e782c0916e186d55ee81393fec850d7543d7301f44f15daa461bb752caff8136bb38dce5a30cbd5970bcd58ef78672de0cc7b3de2139d4e7e3b4ee
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xml_data_extractor (0.3.0)
4
+ xml_data_extractor (0.4.0)
5
5
  activesupport (~> 6.0)
6
6
  nokogiri (~> 1.0)
7
7
 
data/README.md CHANGED
@@ -171,7 +171,6 @@ schemas:
171
171
  within: info/movie_data
172
172
  title: original_title
173
173
  actor: main_actor
174
-
175
174
  ```
176
175
  ```xml
177
176
  <xml>
@@ -187,6 +186,42 @@ schemas:
187
186
  { movie: { title: "The Irishman", actor: "Robert De Niro" } }
188
187
  ```
189
188
 
189
+ #### unescape
190
+
191
+ This option is useful when XML or HTML is embedded as escaped text inside a tag (for example, content that arrived in a CDATA section) and must be unescaped before its content can be parsed:
192
+
193
+ ```yml
194
+ schemas:
195
+ movie:
196
+ unescape: response
197
+ title: response/original_title
198
+ actor: response/main_actor
199
+
200
+ ```
201
+
202
+ ```xml
203
+ <xml>
204
+ <response>
205
+ &lt;original_title&gt;The Irishman&lt;/original_title&gt;&lt;main_actor&gt;Robert De Niro&lt;/main_actor&gt;
206
+ </response>
207
+ </xml>
208
+ ```
209
+
210
+ During parsing, the XML above is transformed into the following:
211
+
212
+ ```xml
213
+ <xml>
214
+ <response>
215
+ <original_title>The Irishman</original_title>
216
+ <main_actor>Robert De Niro</main_actor>
217
+ </response>
218
+ </xml>
219
+ ```
220
+
221
+ ```ruby
222
+ { movie: { title: "The Irishman", actor: "Robert De Niro" } }
223
+ ```
224
+
190
225
  #### array_of
191
226
 
192
227
  Defines the path to an XML collection, which is iterated over to generate an array of hashes:
@@ -1,6 +1,6 @@
1
1
  module Extract
2
2
  class HashBuilder < Base
3
- INTERNAL_FIELDS = %i[array_of keep_if within].freeze
3
+ INTERNAL_FIELDS = %i[array_of keep_if within unescape].freeze
4
4
 
5
5
  def value(index = 0)
6
6
  path, props = node.to_h.values_at(:path, :props)
@@ -0,0 +1,12 @@
1
+ module Extract
2
+ class Unescape < Base
3
+ def unescape!
4
+ unescape_tag = node.props[:unescape]
5
+
6
+ paths_to_unescape = extractor.paths_of(node.path, unescape_tag)
7
+ return if paths_to_unescape.empty?
8
+
9
+ paths_to_unescape.each { |path| extractor.unescape!(path) }
10
+ end
11
+ end
12
+ end
@@ -6,6 +6,7 @@ require_relative "string_value"
6
6
  require_relative "value_builder"
7
7
  require_relative "within"
8
8
  require_relative "expression"
9
+ require_relative "unescape"
9
10
 
10
11
  module Extract
11
12
  class ValueBuilder < Base
@@ -24,6 +25,9 @@ module Extract
24
25
 
25
26
  def value_for_hash
26
27
  props = node.props
28
+
29
+ Unescape.new(node, extractor).unescape! if props[:unescape]
30
+
27
31
  fixed_value = props[:fixed]
28
32
  return fixed_value if fixed_value
29
33
  return ArrayOf.new(node, extractor).value if props[:array_of]
@@ -41,7 +41,7 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
41
41
  end
42
42
  end
43
43
 
44
- class NodeParamsExtractor < Struct.new(:node)
44
+ class NodeParamsExtractor < Struct.new(:node)
45
45
  def extract
46
46
  [node.path, *node.props.values_at(:in_parent, :path, :link, :attr)]
47
47
  end
@@ -49,7 +49,7 @@ end
49
49
 
50
50
  class NodeExtractor
51
51
  def initialize(xml)
52
- @xml = Nokogiri::XML(remove_special_elements(xml), nil, Encoding::UTF_8.to_s)
52
+ @xml = Nokogiri::XML(xml)
53
53
  @xml.remove_namespaces!
54
54
  end
55
55
 
@@ -59,12 +59,18 @@ class NodeExtractor
59
59
  nil
60
60
  end
61
61
 
62
- private
62
+ def unescape!(path)
63
+ node = extract(path)
64
+ return if node.blank?
65
+
66
+ first_node = node.first
67
+ return if first_node.elements.present?
63
68
 
64
- def remove_special_elements(xml)
65
- CGI.unescapeHTML(xml).gsub(/<br>|<\/br>|&nbsp;/, { "&nbsp;" => " ", "<br>" => "", "</br>" => "" })
69
+ first_node.children = Nokogiri::XML.fragment(first_node.content).children
66
70
  end
67
71
 
72
+ private
73
+
68
74
  attr_reader :xml
69
75
  end
70
76
 
@@ -193,7 +199,11 @@ class Extractor
193
199
  end
194
200
 
195
201
  value = path_value(path, tag, attribute)
196
- format_value(value, node.props)
202
+ format_value(value, node.props)
203
+ end
204
+
205
+ def unescape!(path)
206
+ node_extractor.unescape!(path)
197
207
  end
198
208
 
199
209
  def format_value(value, props)
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "xml_data_extractor"
3
- spec.version = "0.3.0"
3
+ spec.version = "0.4.0"
4
4
  spec.authors = ["Fernando Almeida"]
5
5
  spec.email = ["fernandoprsbr@gmail.com"]
6
6
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xml_data_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Fernando Almeida
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-30 00:00:00.000000000 Z
11
+ date: 2020-11-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -75,6 +75,7 @@ files:
75
75
  - lib/src/extract/expression.rb
76
76
  - lib/src/extract/hash_builder.rb
77
77
  - lib/src/extract/string_value.rb
78
+ - lib/src/extract/unescape.rb
78
79
  - lib/src/extract/value_builder.rb
79
80
  - lib/src/extract/within.rb
80
81
  - lib/src/extractor.rb