multi_xml 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,28 +1,28 @@
1
1
  require "rexml/document"
2
2
 
3
- module MultiXml
3
+ module MultiXML
4
4
  module Parsers
5
5
  # XML parser using Ruby's built-in REXML library
6
6
  #
7
7
  # @api private
8
8
  module Rexml
9
+ extend MultiXML::Parser
9
10
  extend self
10
11
 
11
- # Get the parse error class for this parser
12
- #
12
+ # Exception class raised on REXML parse failure
13
13
  # @api private
14
- # @return [Class] REXML::ParseException
15
- def parse_error = ::REXML::ParseException
14
+ ParseError = ::REXML::ParseException
16
15
 
17
16
  # Parse XML from an IO object
18
17
  #
19
18
  # @api private
20
19
  # @param io [IO] IO-like object containing XML
20
+ # @param namespaces [Symbol] Namespace handling mode
21
21
  # @return [Hash] Parsed XML as a hash
22
22
  # @raise [REXML::ParseException] if XML is malformed
23
def parse(io, namespaces: :strip)
  # Build the DOM first, then fold it into a hash starting at the root element.
  root = REXML::Document.new(io).root
  element_to_hash({}, root, namespaces)
end
27
27
 
28
28
  private
@@ -32,21 +32,54 @@ module MultiXml
32
32
  # @api private
33
33
  # @param hash [Hash] Accumulator hash
34
34
  # @param element [REXML::Element] Element to convert
35
+ # @param mode [Symbol] Namespace handling mode
35
36
  # @return [Hash] Updated hash
36
def element_to_hash(hash, element, mode)
  # The key is the (possibly prefixed) element name; the value is the
  # collapsed attributes/children/text of that element.
  key = format_element_name(element, mode)
  collapsed = collapse_element(element, mode)
  add_to_hash(hash, key, collapsed)
end
40
+
41
+ # Format element name using prefix/local and namespace mode
42
+ #
43
+ # @api private
44
+ # @param element [REXML::Element] Element node
45
+ # @param mode [Symbol] Namespace handling mode
46
+ # @return [String] formatted element name
47
def format_element_name(element, mode)
  # Delegate to the shared formatter with the element's prefix/local parts.
  prefix = element.prefix
  local = element.name
  format_name(prefix, local, mode)
end
50
+
51
+ # Format attribute name using prefix/local and namespace mode
52
+ #
53
+ # @api private
54
+ # @param attr [REXML::Attribute] Attribute node
55
+ # @param mode [Symbol] Namespace handling mode
56
+ # @return [String] formatted attribute name
57
def format_attr_name(attr, mode)
  # Delegate to the shared formatter with the attribute's prefix/local parts.
  prefix = attr.prefix
  local = attr.name
  format_name(prefix, local, mode)
end
60
+
61
+ # Produce a name string for a given [prefix, local] tuple
62
+ #
63
+ # @api private
64
+ # @param prefix [String, nil] Namespace prefix
65
+ # @param local [String] Local part of the name
66
+ # @param mode [Symbol] Namespace handling mode
67
+ # @return [String] formatted name
68
def format_name(prefix, local, mode)
  # Outside :preserve mode the prefix is always dropped.
  return local unless mode == :preserve
  # A missing or empty prefix leaves the local name unqualified.
  return local if !prefix || prefix.empty?

  "#{prefix}:#{local}"
end
39
71
 
40
72
  # Collapse an element into a hash with attributes and content
41
73
  #
42
74
  # @api private
43
75
  # @param element [REXML::Element] Element to collapse
76
+ # @param mode [Symbol] Namespace handling mode
44
77
  # @return [Hash] Hash representation
45
- def collapse_element(element)
46
- node_hash = collect_attributes(element)
78
+ def collapse_element(element, mode)
79
+ node_hash = collect_attributes(element, mode)
47
80
 
48
81
  if element.has_elements?
49
- collect_child_elements(element, node_hash)
82
+ collect_child_elements(element, node_hash, mode)
50
83
  add_text_content(node_hash, element) unless whitespace_only?(element)
51
84
  elsif node_hash.empty? || !whitespace_only?(element)
52
85
  add_text_content(node_hash, element)
@@ -59,9 +92,34 @@ module MultiXml
59
92
  #
60
93
  # @api private
61
94
  # @param element [REXML::Element] Element with attributes
95
+ # @param mode [Symbol] Namespace handling mode
62
96
  # @return [Hash] Hash of attribute name-value pairs
63
def collect_attributes(element, mode)
  collected = {}
  element.attributes.each_attribute do |attr|
    if !xmlns_decl?(attr)
      # Ordinary attribute: key is formatted according to the namespace mode.
      add_attribute_value(collected, format_attr_name(attr, mode), attr.value)
    elsif mode == :preserve
      # xmlns declarations are only surfaced when namespaces are preserved.
      add_attribute_value(collected, xmlns_decl_key(attr), attr.value)
    end
  end
  collected
end
106
+
107
+ # Check whether an attribute represents an xmlns declaration
108
+ #
109
+ # @api private
110
+ # @param attr [REXML::Attribute] Attribute to inspect
111
+ # @return [Boolean] true if xmlns declaration
112
def xmlns_decl?(attr)
  prefix = attr.prefix
  # Prefixed declaration, e.g. xmlns:atom="..."
  return true if prefix == "xmlns"

  # Default-namespace declaration: no prefix and the name itself is "xmlns".
  unprefixed = prefix.nil? || prefix.empty?
  unprefixed && attr.name == "xmlns"
end
115
+
116
+ # Build the key for an xmlns declaration under :preserve
117
+ #
118
+ # @api private
119
+ # @param attr [REXML::Attribute] Declaration attribute
120
+ # @return [String] key such as "xmlns" or "xmlns:atom"
121
def xmlns_decl_key(attr)
  # Anything other than an "xmlns"-prefixed attribute maps to the bare key.
  return "xmlns" unless attr.prefix == "xmlns"

  "xmlns:#{attr.name}"
end
66
124
 
67
125
  # Collect all child elements into a hash
@@ -69,9 +127,10 @@ module MultiXml
69
127
  # @api private
70
128
  # @param element [REXML::Element] Parent element
71
129
  # @param node_hash [Hash] Hash to populate
130
+ # @param mode [Symbol] Namespace handling mode
72
131
  # @return [void]
73
def collect_child_elements(element, node_hash, mode)
  # Fold every direct child element into the parent's hash.
  element.each_element do |child|
    element_to_hash(node_hash, child, mode)
  end
end
76
135
 
77
136
  # Add text content from an element to a hash
@@ -106,6 +165,35 @@ module MultiXml
106
165
  hash
107
166
  end
108
167
 
168
+ # Add an attribute value while keeping document order on collisions
169
+ #
170
+ # @api private
171
+ # @param hash [Hash] Target hash
172
+ # @param key [String] Attribute key
173
+ # @param value [String] Attribute value
174
+ # @return [Hash] Updated hash
175
def add_attribute_value(hash, key, value)
  previous = hash[key]
  hash[key] =
    if previous.nil?
      # First value seen for this key.
      value
    elsif previous.is_a?(Array)
      # Already a collision list: slot the attribute in ahead of any Hash entry.
      insert_attribute_before_children(previous, value)
    elsif previous.is_a?(Hash)
      # A Hash entry is already stored under this key: the attribute goes first.
      [value, previous]
    else
      # Two plain values under one key: keep the earlier value first.
      [previous, value]
    end
  hash
end
185
+
186
+ # Insert a later attribute before any child-element entries
187
+ #
188
+ # @api private
189
+ # @param values [Array] Existing colliding values
190
+ # @param value [String] Attribute value to insert
191
+ # @return [Array] Updated value list
192
def insert_attribute_before_children(values, value)
  # Split point is the first Hash entry, or the end when there is none.
  split_at = values.find_index { |entry| Hash === entry } || values.size
  # Build a new array so the caller's list is left untouched.
  [*values.first(split_at), value, *values.drop(split_at)]
end
196
+
109
197
  # Check if element contains only whitespace text
110
198
  #
111
199
  # @api private
@@ -1,23 +1,29 @@
1
1
  require "cgi/escape"
2
2
 
3
- module MultiXml
3
+ module MultiXML
4
4
  module Parsers
5
- # Shared SAX handler logic for building hash trees from XML events
5
+ # Shared SAX handler logic for building hash trees from XML events.
6
6
  #
7
- # This module provides the core stack-based parsing logic used by both
8
- # NokogiriSax and LibxmlSax parsers. Including classes must implement
9
- # the callback methods that their respective SAX libraries expect.
7
+ # Provides a stack machine used by both NokogiriSax and LibxmlSax
8
+ # handlers. Parser-specific subclasses translate their native callbacks
9
+ # into calls on this entrypoint:
10
+ #
11
+ # - handle_start_element_ns(local, prefix, attr_tuples, ns_decls)
12
+ # where attr_tuples = [[attr_prefix_or_nil, local, value], ...]
13
+ # ns_decls = [[prefix_or_nil, uri], ...]
10
14
  #
11
15
  # @api private
12
16
  module SaxHandler
13
17
  # Initialize the handler state
14
18
  #
15
19
  # @api private
20
+ # @param mode [Symbol] Namespace handling mode
16
21
  # @return [void]
17
def initialize_handler(mode)
  # The result hash doubles as the bottom frame of the element stack.
  @result = {}
  @stack = [@result]
  @mode = mode
  # One entry of pending attribute pairs is pushed per open element.
  @pending = []
end
22
28
 
23
29
  # Get the parsed result
@@ -28,31 +34,34 @@ module MultiXml
28
34
 
29
35
  private
30
36
 
31
- # Get the current element hash
37
+ # Get the current element hash on top of the stack
32
38
  #
33
39
  # @api private
34
40
  # @return [Hash] current hash being built
35
41
  def current = @stack.last
36
42
 
37
- # Handle start of an element by pushing onto the stack
43
+ # Entry point for namespace-aware start events
38
44
  #
39
45
  # @api private
40
- # @param name [String] Element name
41
- # @param attrs [Hash, Array] Element attributes
46
+ # @param local [String] Local element name
47
+ # @param prefix [String, nil] Element namespace prefix
48
+ # @param attr_tuples [Array] Attributes as [prefix, local, value]
49
+ # @param ns_decls [Array] xmlns declarations as [prefix, uri] pairs
42
50
  # @return [void]
43
def handle_start_element_ns(local, prefix, attr_tuples, ns_decls)
  # Each element starts as a hash holding an empty, mutable text buffer.
  node = {TEXT_CONTENT_KEY => "".dup}
  add_child_to_current(format_name(prefix, local), node)
  @stack.push(node)

  # Attribute pairs are only built here; they are applied when the element ends.
  @pending.push(build_pending_attrs(ns_decls, attr_tuples))
end
49
58
 
50
- # Handle end of an element by applying attributes and popping the stack
59
+ # Apply attributes and pop the current element from the stack
51
60
  #
52
61
  # @api private
53
62
  # @return [void]
54
63
  def handle_end_element
55
- apply_attributes(@pending_attrs.pop)
64
+ @pending.pop.each { |key, value| add_attr_to_current(key, value) }
56
65
  strip_whitespace_content
57
66
  @stack.pop
58
67
  end
@@ -66,7 +75,40 @@ module MultiXml
66
75
  current[TEXT_CONTENT_KEY] << text
67
76
  end
68
77
 
69
- # Add a child hash to the current element
78
+ # Build the list of attributes to apply at element-end
79
+ #
80
+ # @api private
81
+ # @param ns_decls [Array] xmlns declarations
82
+ # @param attr_tuples [Array] Attribute [prefix, local, value] tuples
83
+ # @return [Array<Array>] list of [key, value] pairs
84
def build_pending_attrs(ns_decls, attr_tuples)
  # xmlns declarations (if preserved) come first, then the regular attributes.
  declaration_pairs = preserved_ns_decls(ns_decls)
  attribute_pairs = attr_tuples.map do |prefix, local, value|
    [format_name(prefix, local), CGI.unescapeHTML(value)]
  end
  declaration_pairs + attribute_pairs
end
89
+
90
+ # Transform xmlns declarations into attribute pairs for :preserve mode
91
+ #
92
+ # @api private
93
+ # @param ns_decls [Array] Declarations as [prefix, uri]
94
+ # @return [Array<Array>] [xmlns key, uri] pairs (empty outside :preserve)
95
def preserved_ns_decls(ns_decls)
  # Declarations are only surfaced as attributes in :preserve mode.
  return [] unless @mode == :preserve

  ns_decls.map do |prefix, uri|
    # A nil or empty prefix denotes the default namespace, which is declared
    # with a bare "xmlns" attribute; "xmlns:" would not be a valid key.
    key = (prefix.nil? || prefix.empty?) ? "xmlns" : "xmlns:#{prefix}"
    [key, uri]
  end
end
100
+
101
+ # Produce a name string for a [prefix, local] tuple
102
+ #
103
+ # @api private
104
+ # @param prefix [String, nil] Namespace prefix
105
+ # @param local [String] Local part of the name
106
+ # @return [String] formatted name
107
def format_name(prefix, local)
  # Qualify only in :preserve mode and only with a non-empty prefix; an empty
  # prefix would otherwise produce a malformed ":local" key. This matches the
  # guard used by the REXML parser's format_name.
  return local unless @mode == :preserve && prefix && !prefix.empty?

  "#{prefix}:#{local}"
end
110
+
111
+ # Add a child element to the current hash, folding on collision
70
112
  #
71
113
  # @api private
72
114
  # @param name [String] Child element name
@@ -81,29 +123,39 @@ module MultiXml
81
123
  end
82
124
  end
83
125
 
84
- # Normalize attributes to a hash
126
+ # Add an attribute value to the current hash (attr wins on collision)
127
+ #
128
+ # Attributes are applied at end_element, after children have already
129
+ # populated the hash. When an attribute collides with a child of the
130
+ # same local name, the attribute is placed first in the resulting
131
+ # array (matching DomParser / REXML behavior and existing tests).
85
132
  #
86
133
  # @api private
87
- # @param attrs [Hash, Array] Attributes as hash or array of pairs
88
- # @return [Hash] Normalized attributes hash
89
- def normalize_attrs(attrs)
90
- attrs.is_a?(Hash) ? attrs : attrs.to_h
134
+ # @param key [String] Attribute key
135
+ # @param value [String] Attribute value
136
+ # @return [void]
137
def add_attr_to_current(key, value)
  target = current
  previous = target[key]
  target[key] =
    if previous.nil?
      # No child or earlier attribute under this key.
      value
    elsif previous.is_a?(Array)
      # Existing collision list: place the attribute ahead of any Hash entry.
      insert_attribute_before_children(previous, value)
    elsif previous.is_a?(Hash)
      # A Hash entry already holds this key: the attribute goes first.
      [value, previous]
    else
      # Two plain values under one key: keep the earlier value first.
      [previous, value]
    end
end
92
146
 
93
- # Apply pending attributes to the current element
147
+ # Insert a later attribute before any child-element entries
94
148
  #
95
149
  # @api private
96
- # @param attrs [Hash] Attributes to apply
97
- # @return [void]
98
- def apply_attributes(attrs)
99
- attrs.each do |name, value|
100
- unescaped = CGI.unescapeHTML(value)
101
- existing = current[name]
102
- current[name] = existing ? [unescaped, existing] : unescaped
103
- end
150
+ # @param values [Array] Existing colliding values
151
+ # @param value [String] Attribute value to insert
152
+ # @return [Array] Updated value list
153
def insert_attribute_before_children(values, value)
  # Position of the first Hash entry, if any.
  position = values.index { |entry| entry.is_a?(Hash) }
  # Work on a copy so the caller's array is not mutated.
  merged = values.dup
  position ? merged.insert(position, value) : merged.push(value)
end
105
157
 
106
- # Remove empty or whitespace-only text content
158
+ # Remove empty or whitespace-only text content from the current hash
107
159
  #
108
160
  # @api private
109
161
  # @return [void]
@@ -1,7 +1,7 @@
1
module MultiXML
  # Version of the MultiXML gem, exposed as a comparable Gem::Version
  #
  # @api public
  # @return [Gem::Version] the gem version
  VERSION = Gem::Version.new("0.9.0")
end