RubyGems - xml_data_extractor - Versions diffs - 0.3.0 → 0.4.0 - Mend

xml_data_extractor 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +36 -1
data/lib/src/extract/hash_builder.rb +1 -1
data/lib/src/extract/unescape.rb +12 -0
data/lib/src/extract/value_builder.rb +4 -0
data/lib/src/extractor.rb +16 -6
data/xml_data_extractor.gemspec +1 -1
metadata +3 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: aa8684820cc0394f1d83bc06c5eaa3378e6a7cc12e33c31b166018edea86d714
-  data.tar.gz: 57ef32a54ff735afd93e25836b44773a8d79ad922fc3837437cb10beead43e1f
+  metadata.gz: 51d1d8f39ffa4227b1de6b3a51bd19f0bd8dd9d0d3d5819e6f43e15db85b048e
+  data.tar.gz: bb1785cea058ed551ba6cee8e9d64e041e0dbaf541dc5ce95895f55a74a57553
 SHA512:
-  metadata.gz: 043ab0237d908959e98ebf371d36fc643c85d3df4fc46c213b2f8ff6e8ed1118b373c27b80685e4f4c486667d27b73476d44368aacec9e2a186ae24e0e9d8dcc
-  data.tar.gz: 52baa3e42b3d65999024f89949152f2f98cf6049635aa37afc204ace50e8dbe75b7e40bcbf467c87309c487d50f1b1bf6e9e4849caa611fe290c218adee66685
+  metadata.gz: 8e4aeb8c71ea104e1a6dc3fc3e5e2befadaade3e404e055ef7c569ad6b17e948be2a11eea373d5a84a0ceba6b549a3775516af82ff88a583c1ba8ac1b908debf
+  data.tar.gz: c1316f2a89e782c0916e186d55ee81393fec850d7543d7301f44f15daa461bb752caff8136bb38dce5a30cbd5970bcd58ef78672de0cc7b3de2139d4e7e3b4ee

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    xml_data_extractor (0.3.0)
+    xml_data_extractor (0.4.0)
       activesupport (~> 6.0)
       nokogiri (~> 1.0)

data/README.md CHANGED

@@ -171,7 +171,6 @@ schemas:
     within: info/movie_data
     title: original_title
     actor: main_actor
 ```
 ```xml
 <xml>
@@ -187,6 +186,42 @@ schemas:
 { movie: { title: "The Irishman", actor: "Robert De Niro" } }
 ```
+#### unescape
+This option is useful when XML or HTML is embedded as escaped text inside a tag (for example, content that came from a CDATA section) and must be unescaped before its content can be parsed:
+```yml
+schemas:
+  movie:
+    unescape: response
+    title: response/original_title
+    actor: response/main_actor
+```
+```xml
+<xml>
+  <response>
+    &lt;original_title&gt;The Irishman&lt;/original_title&gt;&lt;main_actor&gt;Robert De Niro&lt;/main_actor&gt;
+  </response>
+</xml>
+```
+During parsing, the escaped content of the tag is unescaped, so the XML above is treated as:
+```xml
+<xml>
+  <response>
+    <original_title>The Irishman</original_title>
+    <main_actor>Robert De Niro</main_actor>
+  </response>
+</xml>
+```
+```ruby
+{ movie: { title: "The Irishman", actor: "Robert De Niro" } }
+```
 #### array_of
 Defines the path to an XML collection, which is iterated over to generate an array of hashes:

data/lib/src/extract/hash_builder.rb CHANGED

@@ -1,6 +1,6 @@
 module Extract
   class HashBuilder < Base
-    INTERNAL_FIELDS = %i[array_of keep_if within].freeze
+    INTERNAL_FIELDS = %i[array_of keep_if within unescape].freeze
     def value(index = 0)
       path, props = node.to_h.values_at(:path, :props)

data/lib/src/extract/unescape.rb ADDED

@@ -0,0 +1,12 @@
+module Extract
+  class Unescape < Base
+    def unescape!
+      unescape_tag = node.props[:unescape]
+      paths_to_unescape = extractor.paths_of(node.path, unescape_tag)
+      return if paths_to_unescape.empty?
+      paths_to_unescape.each { |path| extractor.unescape!(path) }
+    end
+  end
+end

data/lib/src/extract/value_builder.rb CHANGED

@@ -6,6 +6,7 @@ require_relative "string_value"
 require_relative "value_builder"
 require_relative "within"
 require_relative "expression"
+require_relative "unescape"
 module Extract
   class ValueBuilder < Base
@@ -24,6 +25,9 @@ module Extract
     def value_for_hash
       props = node.props
+      Unescape.new(node, extractor).unescape! if props[:unescape]
       fixed_value = props[:fixed]
       return fixed_value if fixed_value
       return ArrayOf.new(node, extractor).value if props[:array_of]

data/lib/src/extractor.rb CHANGED

@@ -41,7 +41,7 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
   end
 end
-class NodeParamsExtractor < Struct.new(:node)
+class NodeParamsExtractor < Struct.new(:node)
   def extract
     [node.path, *node.props.values_at(:in_parent, :path, :link, :attr)]
   end
@@ -49,7 +49,7 @@ end
 class NodeExtractor
   def initialize(xml)
-    @xml = Nokogiri::XML(remove_special_elements(xml), nil, Encoding::UTF_8.to_s)
+    @xml = Nokogiri::XML(xml)
     @xml.remove_namespaces!
   end
@@ -59,12 +59,18 @@ class NodeExtractor
     nil
   end
-  private
+  def unescape!(path)
+    node = extract(path)
+    return if node.blank?
+    first_node = node.first
+    return if first_node.elements.present?
-  def remove_special_elements(xml)
-    CGI.unescapeHTML(xml).gsub(/<br>|<\/br>|&nbsp;/, { "&nbsp;" => " ", "<br>" => "", "</br>" => "" })
+    first_node.children = Nokogiri::XML.fragment(first_node.content).children
   end
+  private
   attr_reader :xml
 end
@@ -193,7 +199,11 @@ class Extractor
     end
     value = path_value(path, tag, attribute)
-    format_value(value, node.props)
+    format_value(value, node.props)
+  end
+  def unescape!(path)
+    node_extractor.unescape!(path)
   end
   def format_value(value, props)

data/xml_data_extractor.gemspec CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name          = "xml_data_extractor"
-  spec.version       = "0.3.0"
+  spec.version       = "0.4.0"
   spec.authors       = ["Fernando Almeida"]
   spec.email         = ["fernandoprsbr@gmail.com"]

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: xml_data_extractor
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Fernando Almeida
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-10-30 00:00:00.000000000 Z
+date: 2020-11-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -75,6 +75,7 @@ files:
 - lib/src/extract/expression.rb
 - lib/src/extract/hash_builder.rb
 - lib/src/extract/string_value.rb
+- lib/src/extract/unescape.rb
 - lib/src/extract/value_builder.rb
 - lib/src/extract/within.rb
 - lib/src/extractor.rb