multi_xml 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,36 +2,41 @@ require "nokogiri"
2
2
  require "stringio"
3
3
  require_relative "sax_handler"
4
4
 
5
- module MultiXml
5
+ module MultiXML
6
6
  module Parsers
7
7
  # SAX-based parser using Nokogiri (faster for large documents)
8
8
  #
9
9
  # @api private
10
10
  module NokogiriSax
11
+ extend MultiXML::Parser
12
+
11
13
  module_function
12
14
 
13
- # Get the parse error class for this parser
14
- #
15
+ # Exception class raised on Nokogiri parse failure
15
16
  # @api private
16
- # @return [Class] Nokogiri::XML::SyntaxError
17
- def parse_error = ::Nokogiri::XML::SyntaxError
17
+ ParseError = ::Nokogiri::XML::SyntaxError
18
18
 
19
19
  # Parse XML from a string or IO object
20
20
  #
21
21
  # @api private
22
22
  # @param xml [String, IO] XML content
23
+ # @param namespaces [Symbol] Namespace handling mode
23
24
  # @return [Hash] Parsed XML as a hash
24
25
  # @raise [Nokogiri::XML::SyntaxError] if XML is malformed
25
- def parse(xml)
26
+ def parse(xml, namespaces: :strip)
26
27
  io = xml.respond_to?(:read) ? xml : StringIO.new(xml)
27
28
  return {} if io.eof?
28
29
 
29
- handler = Handler.new
30
+ handler = Handler.new(namespaces)
30
31
  ::Nokogiri::XML::SAX::Parser.new(handler).parse(io)
31
32
  handler.result
32
33
  end
33
34
 
34
- # Nokogiri SAX handler that builds a hash tree while parsing
35
+ # Nokogiri SAX handler.
36
+ #
37
+ # Nokogiri always invokes `start_element_namespace` (even for documents
38
+ # without namespaces — prefix/uri come through as nil). We don't define
39
+ # `start_element` because it would never fire.
35
40
  #
36
41
  # @api private
37
42
  class Handler < ::Nokogiri::XML::SAX::Document
@@ -40,10 +45,11 @@ module MultiXml
40
45
  # Create a new SAX handler
41
46
  #
42
47
  # @api private
48
+ # @param mode [Symbol] Namespace handling mode
43
49
  # @return [Handler] new handler instance
44
- def initialize
45
- super
46
- initialize_handler
50
+ def initialize(mode)
51
+ super()
52
+ initialize_handler(mode)
47
53
  end
48
54
 
49
55
  # Handle start of document (no-op)
@@ -70,22 +76,33 @@ module MultiXml
70
76
  raise ::Nokogiri::XML::SyntaxError, message
71
77
  end
72
78
 
73
- # Handle start of an element
79
+ # Handle start of a namespaced element
80
+ #
81
+ # Signature is fixed by the Nokogiri SAX protocol.
74
82
  #
75
83
  # @api private
76
- # @param name [String] Element name
77
- # @param attrs [Array] Element attributes as pairs
84
+ # @param local [String] Local element name
85
+ # @param attrs [Array<Nokogiri::XML::SAX::Parser::Attribute>] Attributes
86
+ # @param prefix [String, nil] Element namespace prefix
87
+ # @param _uri [String, nil] Element namespace URI (unused)
88
+ # @param ns [Array] Namespace declarations as [prefix, uri] pairs
78
89
  # @return [void]
79
- def start_element(name, attrs = [])
80
- handle_start_element(name, attrs)
90
+ # rubocop:disable Metrics/ParameterLists, Naming/MethodParameterName
91
+ def start_element_namespace(local, attrs = [], prefix = nil, _uri = nil, ns = [])
92
+ ns_decls = ns.map { |p, u| [normalize(p), u] }
93
+ attr_tuples = attrs.map { |a| [normalize(a.prefix), a.localname, a.value] }
94
+ handle_start_element_ns(local, normalize(prefix), attr_tuples, ns_decls)
81
95
  end
96
+ # rubocop:enable Metrics/ParameterLists, Naming/MethodParameterName
82
97
 
83
- # Handle end of an element
98
+ # Handle end of a namespaced element
84
99
  #
85
100
  # @api private
86
- # @param _name [String] Element name (unused)
101
+ # @param _local [String] Local element name (unused)
102
+ # @param _prefix [String, nil] Namespace prefix (unused)
103
+ # @param _uri [String, nil] Namespace URI (unused)
87
104
  # @return [void]
88
- def end_element(_name)
105
+ def end_element_namespace(_local, _prefix = nil, _uri = nil)
89
106
  handle_end_element
90
107
  end
91
108
 
@@ -96,6 +113,17 @@ module MultiXml
96
113
  # @return [void]
97
114
  def characters(text) = append_text(text)
98
115
  alias_method :cdata_block, :characters
116
+
117
+ private
118
+
119
+ # Normalize a value, returning nil for empty or nil input
120
+ #
121
+ # @api private
122
+ # @param value [String, nil] Value to normalize
123
+ # @return [String, nil] value or nil if empty
124
+ def normalize(value)
125
+ (value.nil? || value.to_s.empty?) ? nil : value
126
+ end
99
127
  end
100
128
  end
101
129
  end
@@ -1,30 +1,30 @@
1
1
  require "oga"
2
2
  require_relative "dom_parser"
3
3
 
4
- module MultiXml
4
+ module MultiXML
5
5
  module Parsers
6
6
  # XML parser using the Oga library
7
7
  #
8
8
  # @api private
9
9
  module Oga
10
+ extend MultiXML::Parser
10
11
  include DomParser
11
12
  extend self
12
13
 
13
- # Get the parse error class for this parser
14
- #
14
+ # Exception class raised on Oga parse failure
15
15
  # @api private
16
- # @return [Class] LL::ParserError
17
- def parse_error = LL::ParserError
16
+ ParseError = LL::ParserError
18
17
 
19
18
  # Parse XML from an IO object
20
19
  #
21
20
  # @api private
22
21
  # @param io [IO] IO-like object containing XML
22
+ # @param namespaces [Symbol] Namespace handling mode
23
23
  # @return [Hash] Parsed XML as a hash
24
24
  # @raise [LL::ParserError] if XML is malformed
25
- def parse(io)
25
+ def parse(io, namespaces: :strip)
26
26
  doc = ::Oga.parse_xml(io)
27
- node_to_hash(doc.children.first)
27
+ node_to_hash(doc.children.first, mode: namespaces)
28
28
  end
29
29
 
30
30
  # Collect child nodes into a hash (Oga-specific implementation)
@@ -34,11 +34,12 @@ module MultiXml
34
34
  # @api private
35
35
  # @param node [Oga::XML::Element] Parent node
36
36
  # @param node_hash [Hash] Hash to populate
37
+ # @param mode [Symbol] Namespace handling mode
37
38
  # @return [void]
38
- def collect_children(node, node_hash)
39
+ def collect_children(node, node_hash, mode)
39
40
  each_child(node) do |child|
40
41
  case child
41
- when ::Oga::XML::Element then node_to_hash(child, node_hash)
42
+ when ::Oga::XML::Element then node_to_hash(child, node_hash, mode: mode)
42
43
  when ::Oga::XML::Text, ::Oga::XML::Cdata then node_hash[TEXT_CONTENT_KEY] << child.text
43
44
  end
44
45
  end
@@ -48,21 +49,92 @@ module MultiXml
48
49
 
49
50
  # Iterate over child nodes
50
51
  #
52
+ # @api private
51
53
  # @param node [Oga::XML::Element] Parent node
52
54
  # @return [void]
53
55
  def each_child(node, &) = node.children.each(&)
54
56
 
55
- # Iterate over attribute nodes
57
+ # Iterate over attribute nodes (excludes xmlns declarations)
56
58
  #
59
+ # @api private
57
60
  # @param node [Oga::XML::Element] Element node
58
61
  # @return [void]
59
- def each_attr(node, &) = node.attributes.each(&)
62
+ def each_element_attr(node)
63
+ node.attributes.each do |attr|
64
+ next if oga_xmlns_attr?(attr)
65
+
66
+ yield attr
67
+ end
68
+ end
69
+
70
+ # Yield each xmlns declaration on this element
71
+ #
72
+ # Oga stores only locally declared namespaces on each element
73
+ # (inherited ones are resolved via lookup, not merged into
74
+ # #namespaces), so we can yield them directly.
75
+ #
76
+ # @api private
77
+ # @param node [Oga::XML::Element] Element node
78
+ # @return [void]
79
+ def each_namespace_decl(node)
80
+ namespace_scope(node).each do |key, ns|
81
+ prefix = (key == "xmlns") ? nil : key
82
+ yield prefix, ns.uri
83
+ end
84
+ end
85
+
86
+ # Return [prefix, local] for an element
87
+ #
88
+ # @api private
89
+ # @param node [Oga::XML::Element] Element node
90
+ # @return [Array<String, nil>] prefix and local name
91
+ def element_parts(node)
92
+ [oga_prefix(node.namespace), node.name]
93
+ end
60
94
 
61
- # Get the name of a node or attribute
95
+ # Return [prefix, local] for an attribute
62
96
  #
63
- # @param node [Oga::XML::Node] Node to get name from
64
- # @return [String] Node name
65
- def node_name(node) = node.name
97
+ # @api private
98
+ # @param attr [Oga::XML::Attribute] Attribute node
99
+ # @return [Array<String, nil>] prefix and local name
100
+ def attr_parts(attr)
101
+ [oga_prefix(attr.namespace), attr.name]
102
+ end
103
+
104
+ # Translate Oga's default-namespace sentinel to nil
105
+ #
106
+ # Oga represents the default namespace with the sentinel name "xmlns";
107
+ # we translate that to nil so it isn't emitted as a prefix.
108
+ #
109
+ # @api private
110
+ # @param namespace [Oga::XML::Namespace, nil] Namespace object
111
+ # @return [String, nil] prefix or nil
112
+ def oga_prefix(namespace)
113
+ return nil unless namespace
114
+
115
+ (namespace.name == "xmlns") ? nil : namespace.name
116
+ end
117
+
118
+ # Check whether an Oga attribute is actually an xmlns declaration
119
+ #
120
+ # Oga exposes xmlns declarations via Element#namespaces but may also
121
+ # surface them as raw attributes in some cases — filter either shape.
122
+ #
123
+ # @api private
124
+ # @param attr [Oga::XML::Attribute] Attribute node
125
+ # @return [Boolean] true if it's an xmlns declaration
126
+ def oga_xmlns_attr?(attr)
127
+ attr.name == "xmlns" || attr.namespace_name == "xmlns"
128
+ end
129
+
130
+ # Local namespace scope for a node
131
+ #
132
+ # @api private
133
+ # @param node [Oga::XML::Element] Element node
134
+ # @return [Hash{String => Oga::XML::Namespace}] scope
135
+ def namespace_scope(node)
136
+ node.namespaces || {}
137
+ end
66
138
  end
67
139
  end
68
140
  end
@@ -1,89 +1,106 @@
1
1
  require "ox"
2
2
 
3
- module MultiXml
3
+ module MultiXML
4
4
  module Parsers
5
5
  # XML parser using the Ox library (fastest pure-Ruby parser)
6
6
  #
7
7
  # @api private
8
8
  module Ox
9
+ extend MultiXML::Parser
10
+
9
11
  module_function
10
12
 
11
- # Get the parse error class for this parser
12
- #
13
+ # Exception class raised on Ox parse failure
13
14
  # @api private
14
- # @return [Class] Ox::ParseError
15
- def parse_error = ::Ox::ParseError
15
+ ParseError = ::Ox::ParseError
16
16
 
17
17
  # Parse XML from an IO object
18
18
  #
19
19
  # @api private
20
20
  # @param io [IO] IO-like object containing XML
21
+ # @param namespaces [Symbol] Namespace handling mode
21
22
  # @return [Hash] Parsed XML as a hash
22
- def parse(io)
23
- handler = Handler.new
23
+ def parse(io, namespaces: :strip)
24
+ handler = Handler.new(namespaces)
24
25
  ::Ox.sax_parse(handler, io, convert_special: true, skip: :skip_return)
25
26
  handler.result
26
27
  end
27
28
 
28
- # SAX event handler that builds a hash tree while parsing
29
+ # SAX event handler that builds a hash tree while parsing.
30
+ #
31
+ # Ox's SAX callbacks expose element and attribute names in prefixed
32
+ # form (e.g. "atom:feed"). Under :preserve we keep the source form
33
+ # verbatim; under :strip we drop the prefix and filter xmlns
34
+ # declarations out of the attribute stream.
29
35
  #
30
36
  # @api private
31
37
  class Handler
32
38
  # Create a new SAX handler
33
39
  #
40
+ # @api private
41
+ # @param mode [Symbol] Namespace handling mode
34
42
  # @return [Handler] new handler instance
35
- def initialize
36
- @stack = []
43
+ def initialize(mode)
44
+ @mode = mode
45
+ @stack = [{}]
37
46
  end
38
47
 
39
48
  # Get the parsed result
40
49
  #
41
- # @return [Hash, nil] the root hash or nil if empty
50
+ # @api private
51
+ # @return [Hash] the parsed hash
42
52
  def result = @stack.first
43
53
 
44
54
  # Handle start of an element
45
55
  #
46
- # @param name [Symbol] Element name
56
+ # @api private
57
+ # @param name [Symbol, String] Element name
47
58
  # @return [void]
48
59
  def start_element(name)
49
- @stack << {} if @stack.empty?
50
60
  child = {}
51
- add_value(name.to_s, child)
61
+ add_value(current, format_name(name.to_s), child)
52
62
  @stack << child
53
63
  end
54
64
 
55
- # Handle end of an element
56
- #
57
- # @param _name [Symbol] Element name (unused)
58
- # @return [void]
59
- def end_element(_name)
60
- strip_whitespace_content if current.key?(TEXT_CONTENT_KEY)
61
- @stack.pop
62
- end
63
-
64
65
  # Handle an attribute
65
66
  #
66
- # @param name [Symbol] Attribute name
67
+ # Ignored outside an element (e.g. attributes on the XML declaration
68
+ # such as `<?xml version="1.0"?>`, which fire before any `start_element`).
69
+ #
70
+ # @api private
71
+ # @param name [Symbol, String] Attribute name
67
72
  # @param value [String] Attribute value
68
73
  # @return [void]
69
74
  def attr(name, value)
70
- add_value(name.to_s, value) unless @stack.empty?
75
+ return if @stack.size < 2
76
+
77
+ name = name.to_s
78
+ return if xmlns_decl?(name) && @mode != :preserve
79
+
80
+ add_attribute_value(current, format_name(name), value)
71
81
  end
72
82
 
73
- # Handle text content
83
+ # Handle text content (also aliased as `cdata`)
74
84
  #
85
+ # @api private
75
86
  # @param value [String] Text content
76
87
  # @return [void]
77
- def text(value) = add_value(TEXT_CONTENT_KEY, value)
88
+ def text(value) = append_text(current, value)
89
+ alias_method :cdata, :text
78
90
 
79
- # Handle CDATA content
91
+ # Handle end of an element
80
92
  #
81
- # @param value [String] CDATA content
93
+ # @api private
94
+ # @param _name [Symbol, String] Element name (unused)
82
95
  # @return [void]
83
- def cdata(value) = add_value(TEXT_CONTENT_KEY, value)
96
+ def end_element(_name)
97
+ strip_whitespace_content if current.key?(TEXT_CONTENT_KEY)
98
+ @stack.pop
99
+ end
84
100
 
85
101
  # Handle parse errors
86
102
  #
103
+ # @api private
87
104
  # @param message [String] Error message
88
105
  # @param line [Integer] Line number
89
106
  # @param column [Integer] Column number
@@ -95,23 +112,60 @@ module MultiXml
95
112
 
96
113
  private
97
114
 
98
- # Get the current element hash
115
+ # Current element hash on top of the stack
99
116
  #
117
+ # @api private
100
118
  # @return [Hash] current hash being built
101
119
  def current = @stack.last
102
120
 
103
- # Add a value to the current hash, merging with existing if needed
121
+ # Format a prefixed-or-local name according to the namespace mode
104
122
  #
123
+ # @api private
124
+ # @param name [String] Prefixed or local name
125
+ # @return [String] formatted name
126
+ def format_name(name)
127
+ (@mode == :preserve) ? name : name.split(":", 2).last
128
+ end
129
+
130
+ # Check whether an attribute name is an xmlns declaration
131
+ #
132
+ # @api private
133
+ # @param name [String] Attribute name
134
+ # @return [Boolean] true if xmlns or xmlns:*
135
+ def xmlns_decl?(name)
136
+ name == "xmlns" || name.start_with?("xmlns:")
137
+ end
138
+
139
+ # Add a value to a hash, folding into an array on collision
140
+ #
141
+ # @api private
142
+ # @param hash [Hash] Target hash
105
143
  # @param key [String] Key to add
106
144
  # @param value [Object] Value to add
107
145
  # @return [void]
108
- def add_value(key, value)
109
- existing = current[key]
110
- current[key] = existing ? merge_values(existing, value) : value
146
+ def add_value(hash, key, value)
147
+ existing = hash[key]
148
+ hash[key] = existing ? merge_values(existing, value) : value
111
149
  end
112
150
 
113
- # Merge a value with an existing value, creating array if needed
151
+ # Append a text fragment to the current node's content
152
+ #
153
+ # SAX parsers may deliver element text in multiple callbacks when
154
+ # inline elements split the text stream. MultiXML represents that
155
+ # as one concatenated ``__content__`` string, not an array.
114
156
  #
157
+ # @api private
158
+ # @param hash [Hash] Target hash
159
+ # @param value [String] Text fragment
160
+ # @return [void]
161
+ def append_text(hash, value)
162
+ existing = hash[TEXT_CONTENT_KEY]
163
+ hash[TEXT_CONTENT_KEY] = existing ? "#{existing}#{value}" : value
164
+ end
165
+
166
+ # Merge a value with an existing value, creating an array if needed
167
+ #
168
+ # @api private
115
169
  # @param existing [Object] Existing value
116
170
  # @param value [Object] Value to append
117
171
  # @return [Array] array with both values
@@ -119,8 +173,37 @@ module MultiXml
119
173
  existing.is_a?(Array) ? existing << value : [existing, value]
120
174
  end
121
175
 
122
- # Remove empty or whitespace-only text content
176
+ # Add an attribute value while keeping document order on collisions
177
+ #
178
+ # @api private
179
+ # @param hash [Hash] Target hash
180
+ # @param key [String] Attribute key
181
+ # @param value [String] Attribute value
182
+ # @return [void]
183
+ def add_attribute_value(hash, key, value)
184
+ existing = hash[key]
185
+ hash[key] = case existing
186
+ when nil then value
187
+ when Array then insert_attribute_before_children(existing, value)
188
+ when Hash then [value, existing]
189
+ else [existing, value]
190
+ end
191
+ end
192
+
193
+ # Insert a later attribute before any child-element entries
194
+ #
195
+ # @api private
196
+ # @param values [Array] Existing colliding values
197
+ # @param value [String] Attribute value to insert
198
+ # @return [Array] Updated value list
199
+ def insert_attribute_before_children(values, value)
200
+ child_index = values.index { |entry| entry.is_a?(Hash) } || values.length
201
+ values.dup.insert(child_index, value)
202
+ end
203
+
204
+ # Remove empty or whitespace-only text content from the current hash
123
205
  #
206
+ # @api private
124
207
  # @return [void]
125
208
  def strip_whitespace_content
126
209
  content = current[TEXT_CONTENT_KEY]