multi_xml 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,71 +1,68 @@
1
- require "oga" unless defined?(Oga)
2
- require "multi_xml/parsers/libxml2_parser"
1
+ require "oga"
2
+ require_relative "dom_parser"
3
3
 
4
4
  module MultiXml
5
5
  module Parsers
6
- module Oga # :nodoc:
7
- include Libxml2Parser
6
+ # XML parser using the Oga library
7
+ #
8
+ # @api private
9
+ module Oga
10
+ include DomParser
8
11
  extend self
9
12
 
10
- def parse_error
11
- LL::ParserError
12
- end
13
+ # Get the parse error class for this parser
14
+ #
15
+ # @api private
16
+ # @return [Class] LL::ParserError
17
+ def parse_error = LL::ParserError
13
18
 
19
+ # Parse XML from an IO object
20
+ #
21
+ # @api private
22
+ # @param io [IO] IO-like object containing XML
23
+ # @return [Hash] Parsed XML as a hash
24
+ # @raise [LL::ParserError] if XML is malformed
14
25
  def parse(io)
15
- document = ::Oga.parse_xml(io)
16
- node_to_hash(document.children[0])
26
+ doc = ::Oga.parse_xml(io)
27
+ node_to_hash(doc.children.first)
17
28
  end
18
29
 
19
- def node_to_hash(node, hash = {}) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
20
- node_hash = {MultiXml::CONTENT_ROOT => ""}
21
-
22
- name = node_name(node)
23
-
24
- # Insert node hash into parent hash correctly.
25
- case hash[name]
26
- when Array
27
- hash[name] << node_hash
28
- when Hash
29
- hash[name] = [hash[name], node_hash]
30
- when NilClass
31
- hash[name] = node_hash
32
- end
33
-
34
- # Handle child elements
35
- each_child(node) do |c|
36
- if c.is_a?(::Oga::XML::Element)
37
- node_to_hash(c, node_hash)
38
- elsif c.is_a?(::Oga::XML::Text) || c.is_a?(::Oga::XML::Cdata)
39
- node_hash[MultiXml::CONTENT_ROOT] += c.text
30
+ # Collect child nodes into a hash (Oga-specific implementation)
31
+ #
32
+ # Oga uses different node types than Nokogiri/LibXML.
33
+ #
34
+ # @api private
35
+ # @param node [Oga::XML::Element] Parent node
36
+ # @param node_hash [Hash] Hash to populate
37
+ # @return [void]
38
+ def collect_children(node, node_hash)
39
+ each_child(node) do |child|
40
+ case child
41
+ when ::Oga::XML::Element then node_to_hash(child, node_hash)
42
+ when ::Oga::XML::Text, ::Oga::XML::Cdata then node_hash[TEXT_CONTENT_KEY] << child.text
40
43
  end
41
44
  end
42
-
43
- # Remove content node if it is empty
44
- node_hash.delete(MultiXml::CONTENT_ROOT) if node_hash[MultiXml::CONTENT_ROOT].strip.empty?
45
-
46
- # Handle attributes
47
- each_attr(node) do |a|
48
- key = node_name(a)
49
- v = node_hash[key]
50
- node_hash[key] = ((v) ? [a.value, v] : a.value)
51
- end
52
-
53
- hash
54
45
  end
55
46
 
56
47
  private
57
48
 
58
- def each_child(node, &)
59
- node.children.each(&)
60
- end
49
+ # Iterate over child nodes
50
+ #
51
+ # @param node [Oga::XML::Element] Parent node
52
+ # @return [void]
53
+ def each_child(node, &) = node.children.each(&)
61
54
 
62
- def each_attr(node, &)
63
- node.attributes.each(&)
64
- end
55
+ # Iterate over attribute nodes
56
+ #
57
+ # @param node [Oga::XML::Element] Element node
58
+ # @return [void]
59
+ def each_attr(node, &) = node.attributes.each(&)
65
60
 
66
- def node_name(node)
67
- node.name
68
- end
61
+ # Get the name of a node or attribute
62
+ #
63
+ # @param node [Oga::XML::Node] Node to get name from
64
+ # @return [String] Node name
65
+ def node_name(node) = node.name
69
66
  end
70
67
  end
71
68
  end
@@ -1,89 +1,131 @@
1
- require "ox" unless defined?(Ox)
2
-
3
- # Each MultiXml parser is expected to parse an XML document into a Hash. The
4
- # conversion rules are:
5
- #
6
- # - Each document starts out as an empty Hash.
7
- #
8
- # - Reading an element creates an entry in the parent Hash that has a key of
9
- # the element name and a value of a Hash with attributes as key value
10
- # pairs. Children are added as described by this rule.
11
- #
12
- # - Text and CDATA are stored in the parent element Hash with a key of
13
- # MultiXml::CONTENT_ROOT and a value of the text itself.
14
- #
15
- # - If a key already exists in the Hash then the value associated with the key
16
- # is converted to an Array with the old and new value in it.
17
- #
18
- # - Other elements such as the xml prolog, doctype, and comments are ignored.
19
- #
1
+ require "ox"
20
2
 
21
3
  module MultiXml
22
4
  module Parsers
23
- module Ox # :nodoc:
5
+ # XML parser using the Ox library (a fast parser implemented as a native C extension)
6
+ #
7
+ # @api private
8
+ module Ox
24
9
  module_function
25
10
 
26
- def parse_error
27
- Exception
28
- end
11
+ # Get the parse error class for this parser
12
+ #
13
+ # @api private
14
+ # @return [Class] Ox::ParseError
15
+ def parse_error = ::Ox::ParseError
29
16
 
17
+ # Parse XML from an IO object
18
+ #
19
+ # @api private
20
+ # @param io [IO] IO-like object containing XML
21
+ # @return [Hash] Parsed XML as a hash
30
22
  def parse(io)
31
23
  handler = Handler.new
32
24
  ::Ox.sax_parse(handler, io, convert_special: true, skip: :skip_return)
33
- handler.doc
25
+ handler.result
34
26
  end
35
27
 
28
+ # SAX event handler that builds a hash tree while parsing
29
+ #
30
+ # @api private
36
31
  class Handler
37
- attr_accessor :stack
38
-
32
+ # Create a new SAX handler
33
+ #
34
+ # @return [Handler] new handler instance
39
35
  def initialize
40
36
  @stack = []
41
37
  end
42
38
 
43
- def doc
44
- @stack[0]
45
- end
39
+ # Get the parsed result
40
+ #
41
+ # @return [Hash, nil] the root hash or nil if empty
42
+ def result = @stack.first
46
43
 
47
- def attr(name, value)
48
- append(name, value) unless @stack.empty?
44
+ # Handle start of an element
45
+ #
46
+ # @param name [Symbol] Element name
47
+ # @return [void]
48
+ def start_element(name)
49
+ @stack << {} if @stack.empty?
50
+ child = {}
51
+ add_value(name.to_s, child)
52
+ @stack << child
49
53
  end
50
54
 
51
- def text(value)
52
- append(MultiXml::CONTENT_ROOT, value)
55
+ # Handle end of an element
56
+ #
57
+ # @param _name [Symbol] Element name (unused)
58
+ # @return [void]
59
+ def end_element(_name)
60
+ strip_whitespace_content if current.key?(TEXT_CONTENT_KEY)
61
+ @stack.pop
53
62
  end
54
63
 
55
- def cdata(value)
56
- append(MultiXml::CONTENT_ROOT, value)
64
+ # Handle an attribute
65
+ #
66
+ # @param name [Symbol] Attribute name
67
+ # @param value [String] Attribute value
68
+ # @return [void]
69
+ def attr(name, value)
70
+ add_value(name.to_s, value) unless @stack.empty?
57
71
  end
58
72
 
59
- def start_element(name)
60
- @stack.push({}) if @stack.empty?
61
- h = {}
62
- append(name, h)
63
- @stack.push(h)
73
+ # Handle text content
74
+ #
75
+ # @param value [String] Text content
76
+ # @return [void]
77
+ def text(value) = add_value(TEXT_CONTENT_KEY, value)
78
+
79
+ # Handle CDATA content
80
+ #
81
+ # @param value [String] CDATA content
82
+ # @return [void]
83
+ def cdata(value) = add_value(TEXT_CONTENT_KEY, value)
84
+
85
+ # Handle parse errors
86
+ #
87
+ # @param message [String] Error message
88
+ # @param line [Integer] Line number
89
+ # @param column [Integer] Column number
90
+ # @return [void]
91
+ # @raise [Ox::ParseError] always
92
+ def error(message, line, column)
93
+ raise ::Ox::ParseError, "#{message} at #{line}:#{column}"
64
94
  end
65
95
 
66
- def end_element(_)
67
- @stack.pop
96
+ private
97
+
98
+ # Get the current element hash
99
+ #
100
+ # @return [Hash] current hash being built
101
+ def current = @stack.last
102
+
103
+ # Add a value to the current hash, merging with existing if needed
104
+ #
105
+ # @param key [String] Key to add
106
+ # @param value [Object] Value to add
107
+ # @return [void]
108
+ def add_value(key, value)
109
+ existing = current[key]
110
+ current[key] = existing ? merge_values(existing, value) : value
68
111
  end
69
112
 
70
- def error(message, line, column)
71
- raise(StandardError, "#{message} at #{line}:#{column}")
113
+ # Merge a value with an existing value, creating array if needed
114
+ #
115
+ # @param existing [Object] Existing value
116
+ # @param value [Object] Value to append
117
+ # @return [Array] array with both values
118
+ def merge_values(existing, value)
119
+ existing.is_a?(Array) ? existing << value : [existing, value]
72
120
  end
73
121
 
74
- def append(key, value)
75
- key = key.to_s
76
- h = @stack.last
77
- if h.key?(key)
78
- v = h[key]
79
- if v.is_a?(Array)
80
- v << value
81
- else
82
- h[key] = [v, value]
83
- end
84
- else
85
- h[key] = value
86
- end
122
+ # Remove empty text content, or whitespace-only text content when the element has other keys
123
+ #
124
+ # @return [void]
125
+ def strip_whitespace_content
126
+ content = current[TEXT_CONTENT_KEY]
127
+ should_remove = content.empty? || (current.size > 1 && content.strip.empty?)
128
+ current.delete(TEXT_CONTENT_KEY) if should_remove
87
129
  end
88
130
  end
89
131
  end
@@ -1,111 +1,117 @@
1
- require "rexml/document" unless defined?(REXML::Document)
1
+ require "rexml/document"
2
2
 
3
3
  module MultiXml
4
4
  module Parsers
5
- module Rexml # :nodoc:
5
+ # XML parser using Ruby's built-in REXML library
6
+ #
7
+ # @api private
8
+ module Rexml
6
9
  extend self
7
10
 
8
- def parse_error
9
- ::REXML::ParseException
10
- end
11
-
12
- # Parse an XML Document IO into a simple hash using REXML
11
+ # Get the parse error class for this parser
13
12
  #
14
- # xml::
15
- # XML Document IO to parse
16
- def parse(xml)
17
- doc = REXML::Document.new(xml)
18
- raise(REXML::ParseException, "The document #{doc.to_s.inspect} does not have a valid root") unless doc.root
13
+ # @api private
14
+ # @return [Class] REXML::ParseException
15
+ def parse_error = ::REXML::ParseException
19
16
 
20
- merge_element!({}, doc.root)
17
+ # Parse XML from an IO object
18
+ #
19
+ # @api private
20
+ # @param io [IO] IO-like object containing XML
21
+ # @return [Hash] Parsed XML as a hash
22
+ # @raise [REXML::ParseException] if XML is malformed
23
+ def parse(io)
24
+ doc = REXML::Document.new(io)
25
+ element_to_hash({}, doc.root)
21
26
  end
22
27
 
23
28
  private
24
29
 
25
- # Convert an XML element and merge into the hash
30
+ # Convert an element to hash format
26
31
  #
27
- # hash::
28
- # Hash to merge the converted element into.
29
- # element::
30
- # XML element to merge into hash
31
- def merge_element!(hash, element)
32
- merge!(hash, element.name, collapse(element))
32
+ # @api private
33
+ # @param hash [Hash] Accumulator hash
34
+ # @param element [REXML::Element] Element to convert
35
+ # @return [Hash] Updated hash
36
+ def element_to_hash(hash, element)
37
+ add_to_hash(hash, element.name, collapse_element(element))
33
38
  end
34
39
 
35
- # Actually converts an XML document element into a data structure.
40
+ # Collapse an element into a hash with attributes and content
36
41
  #
37
- # element::
38
- # The document element to be collapsed.
39
- def collapse(element)
40
- hash = get_attributes(element)
42
+ # @api private
43
+ # @param element [REXML::Element] Element to collapse
44
+ # @return [Hash] Hash representation
45
+ def collapse_element(element)
46
+ node_hash = collect_attributes(element)
41
47
 
42
48
  if element.has_elements?
43
- element.each_element { |child| merge_element!(hash, child) }
44
- merge_texts!(hash, element) unless empty_content?(element)
45
- hash
46
- else
47
- merge_texts!(hash, element)
49
+ collect_child_elements(element, node_hash)
50
+ add_text_content(node_hash, element) unless whitespace_only?(element)
51
+ elsif node_hash.empty? || !whitespace_only?(element)
52
+ add_text_content(node_hash, element)
48
53
  end
54
+
55
+ node_hash
49
56
  end
50
57
 
51
- # Merge all the texts of an element into the hash
58
+ # Collect all attributes from an element into a hash
52
59
  #
53
- # hash::
54
- # Hash to add the converted element to.
55
- # element::
56
- # XML element whose texts are to be merged into the hash
57
- def merge_texts!(hash, element)
58
- if element.has_text?
59
- # must use value to prevent double-escaping
60
- texts = element.texts.map(&:value).join
61
- merge!(hash, MultiXml::CONTENT_ROOT, texts)
62
- else
63
- hash
64
- end
60
+ # @api private
61
+ # @param element [REXML::Element] Element with attributes
62
+ # @return [Hash] Hash of attribute name-value pairs
63
+ def collect_attributes(element)
64
+ element.attributes.each_with_object({}) { |(name, value), hash| hash[name] = value }
65
65
  end
66
66
 
67
- # Adds a new key/value pair to an existing Hash. If the key to be added
68
- # already exists and the existing value associated with key is not
69
- # an Array, it will be wrapped in an Array. Then the new value is
70
- # appended to that Array.
67
+ # Collect all child elements into a hash
71
68
  #
72
- # hash::
73
- # Hash to add key/value pair to.
74
- # key::
75
- # Key to be added.
76
- # value::
77
- # Value to be associated with key.
78
- def merge!(hash, key, value)
79
- if hash.key?(key)
80
- if hash[key].instance_of?(Array)
81
- hash[key] << value
82
- else
83
- hash[key] = [hash[key], value]
84
- end
85
- elsif value.instance_of?(Array)
86
- hash[key] = [value]
87
- else
88
- hash[key] = value
89
- end
90
- hash
69
+ # @api private
70
+ # @param element [REXML::Element] Parent element
71
+ # @param node_hash [Hash] Hash to populate
72
+ # @return [void]
73
+ def collect_child_elements(element, node_hash)
74
+ element.each_element { |child| element_to_hash(node_hash, child) }
75
+ end
76
+
77
+ # Add text content from an element to a hash
78
+ #
79
+ # @api private
80
+ # @param hash [Hash] Target hash
81
+ # @param element [REXML::Element] Element with text
82
+ # @return [Hash] Updated hash
83
+ def add_text_content(hash, element)
84
+ return hash unless element.has_text?
85
+
86
+ text = element.texts.map(&:value).join
87
+ add_to_hash(hash, TEXT_CONTENT_KEY, text)
91
88
  end
92
89
 
93
- # Converts the attributes array of an XML element into a hash.
94
- # Returns an empty Hash if node has no attributes.
90
+ # Add a value to a hash, handling duplicates as arrays
95
91
  #
96
- # element::
97
- # XML element to extract attributes from.
98
- def get_attributes(element)
99
- attributes = {}
100
- element.attributes.each { |n, v| attributes[n] = v }
101
- attributes
92
+ # @api private
93
+ # @param hash [Hash] Target hash
94
+ # @param key [String] Key to add
95
+ # @param value [Object] Value to add
96
+ # @return [Hash] Updated hash
97
+ def add_to_hash(hash, key, value)
98
+ existing = hash[key]
99
+ hash[key] = if existing
100
+ existing.is_a?(Array) ? existing << value : [existing, value]
101
+ elsif value.is_a?(Array)
102
+ [value]
103
+ else
104
+ value
105
+ end
106
+ hash
102
107
  end
103
108
 
104
- # Determines if a document element has text content
109
+ # Check if element contains only whitespace text
105
110
  #
106
- # element::
107
- # XML element to be checked.
108
- def empty_content?(element)
111
+ # @api private
112
+ # @param element [REXML::Element] Element to check
113
+ # @return [Boolean] true if whitespace only
114
+ def whitespace_only?(element)
109
115
  element.texts.join.strip.empty?
110
116
  end
111
117
  end
@@ -0,0 +1,117 @@
1
+ require "cgi/escape"
2
+
3
+ module MultiXml
4
+ module Parsers
5
+ # Shared SAX handler logic for building hash trees from XML events
6
+ #
7
+ # This module provides the core stack-based parsing logic used by both
8
+ # NokogiriSax and LibxmlSax parsers. Including classes must implement
9
+ # the callback methods that their respective SAX libraries expect.
10
+ #
11
+ # @api private
12
+ module SaxHandler
13
+ # Initialize the handler state
14
+ #
15
+ # @api private
16
+ # @return [void]
17
+ def initialize_handler
18
+ @result = {}
19
+ @stack = [@result]
20
+ @pending_attrs = []
21
+ end
22
+
23
+ # Get the parsed result
24
+ #
25
+ # @api private
26
+ # @return [Hash] the parsed hash
27
+ attr_reader :result
28
+
29
+ private
30
+
31
+ # Get the current element hash
32
+ #
33
+ # @api private
34
+ # @return [Hash] current hash being built
35
+ def current = @stack.last
36
+
37
+ # Handle start of an element by pushing onto the stack
38
+ #
39
+ # @api private
40
+ # @param name [String] Element name
41
+ # @param attrs [Hash, Array] Element attributes
42
+ # @return [void]
43
+ def handle_start_element(name, attrs)
44
+ child = {TEXT_CONTENT_KEY => +""}
45
+ add_child_to_current(name, child)
46
+ @stack << child
47
+ @pending_attrs << normalize_attrs(attrs)
48
+ end
49
+
50
+ # Handle end of an element by applying attributes and popping the stack
51
+ #
52
+ # @api private
53
+ # @return [void]
54
+ def handle_end_element
55
+ apply_attributes(@pending_attrs.pop)
56
+ strip_whitespace_content
57
+ @stack.pop
58
+ end
59
+
60
+ # Append text to the current element's content
61
+ #
62
+ # @api private
63
+ # @param text [String] Text to append
64
+ # @return [void]
65
+ def append_text(text)
66
+ current[TEXT_CONTENT_KEY] << text
67
+ end
68
+
69
+ # Add a child hash to the current element
70
+ #
71
+ # @api private
72
+ # @param name [String] Child element name
73
+ # @param child [Hash] Child hash to add
74
+ # @return [void]
75
+ def add_child_to_current(name, child)
76
+ existing = current[name]
77
+ current[name] = case existing
78
+ when Array then existing << child
79
+ when Hash then [existing, child]
80
+ else child
81
+ end
82
+ end
83
+
84
+ # Normalize attributes to a hash
85
+ #
86
+ # @api private
87
+ # @param attrs [Hash, Array] Attributes as hash or array of pairs
88
+ # @return [Hash] Normalized attributes hash
89
+ def normalize_attrs(attrs)
90
+ attrs.is_a?(Hash) ? attrs : attrs.to_h
91
+ end
92
+
93
+ # Apply pending attributes to the current element
94
+ #
95
+ # @api private
96
+ # @param attrs [Hash] Attributes to apply
97
+ # @return [void]
98
+ def apply_attributes(attrs)
99
+ attrs.each do |name, value|
100
+ unescaped = CGI.unescapeHTML(value)
101
+ existing = current[name]
102
+ current[name] = existing ? [unescaped, existing] : unescaped
103
+ end
104
+ end
105
+
106
+ # Remove empty text content, or whitespace-only text content when the element has other keys
107
+ #
108
+ # @api private
109
+ # @return [void]
110
+ def strip_whitespace_content
111
+ content = current[TEXT_CONTENT_KEY]
112
+ should_remove = content.empty? || (current.size > 1 && content.strip.empty?)
113
+ current.delete(TEXT_CONTENT_KEY) if should_remove
114
+ end
115
+ end
116
+ end
117
+ end
@@ -1,3 +1,7 @@
1
1
  module MultiXml
2
- VERSION = Gem::Version.create("0.7.2")
2
+ # The current version of MultiXml
3
+ #
4
+ # @api public
5
+ # @return [Gem::Version] the gem version
6
+ VERSION = Gem::Version.create("0.8.0")
3
7
  end