multi_xml 0.8.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +6 -1
- data/CHANGELOG.md +30 -0
- data/Gemfile +2 -1
- data/README.md +183 -39
- data/Rakefile +7 -0
- data/Steepfile +8 -1
- data/benchmark/overall_parser_benchmark.rb +5 -0
- data/benchmark.rb +1002 -0
- data/lib/multi_xml/concurrency.rb +31 -0
- data/lib/multi_xml/constants.rb +65 -20
- data/lib/multi_xml/deprecated.rb +35 -0
- data/lib/multi_xml/errors.rb +62 -8
- data/lib/multi_xml/file_like.rb +2 -2
- data/lib/multi_xml/helpers.rb +2 -2
- data/lib/multi_xml/options.rb +63 -0
- data/lib/multi_xml/options_normalization.rb +40 -0
- data/lib/multi_xml/parse_support.rb +113 -0
- data/lib/multi_xml/parser.rb +47 -0
- data/lib/multi_xml/parser_resolution.rb +150 -0
- data/lib/multi_xml/parsers/dom_parser.rb +107 -14
- data/lib/multi_xml/parsers/libxml.rb +36 -13
- data/lib/multi_xml/parsers/libxml_sax.rb +104 -19
- data/lib/multi_xml/parsers/nokogiri.rb +36 -13
- data/lib/multi_xml/parsers/nokogiri_sax.rb +47 -19
- data/lib/multi_xml/parsers/oga.rb +87 -15
- data/lib/multi_xml/parsers/ox.rb +120 -37
- data/lib/multi_xml/parsers/rexml.rb +104 -16
- data/lib/multi_xml/parsers/sax_handler.rb +84 -32
- data/lib/multi_xml/version.rb +3 -3
- data/lib/multi_xml.rb +132 -139
- data/sig/multi_xml.rbs +93 -16
- metadata +11 -2
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
require "rexml/document"
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module MultiXML
|
|
4
4
|
module Parsers
|
|
5
5
|
# XML parser using Ruby's built-in REXML library
|
|
6
6
|
#
|
|
7
7
|
# @api private
|
|
8
8
|
module Rexml
|
|
9
|
+
extend MultiXML::Parser
|
|
9
10
|
extend self
|
|
10
11
|
|
|
11
|
-
#
|
|
12
|
-
#
|
|
12
|
+
# Exception class raised on REXML parse failure
|
|
13
13
|
# @api private
|
|
14
|
-
|
|
15
|
-
def parse_error = ::REXML::ParseException
|
|
14
|
+
ParseError = ::REXML::ParseException
|
|
16
15
|
|
|
17
16
|
# Parse XML from an IO object
|
|
18
17
|
#
|
|
19
18
|
# @api private
|
|
20
19
|
# @param io [IO] IO-like object containing XML
|
|
20
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
21
21
|
# @return [Hash] Parsed XML as a hash
|
|
22
22
|
# @raise [REXML::ParseException] if XML is malformed
|
|
23
|
-
def parse(io)
|
|
23
|
+
def parse(io, namespaces: :strip)
|
|
24
24
|
doc = REXML::Document.new(io)
|
|
25
|
-
element_to_hash({}, doc.root)
|
|
25
|
+
element_to_hash({}, doc.root, namespaces)
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
private
|
|
@@ -32,21 +32,54 @@ module MultiXml
|
|
|
32
32
|
# @api private
|
|
33
33
|
# @param hash [Hash] Accumulator hash
|
|
34
34
|
# @param element [REXML::Element] Element to convert
|
|
35
|
+
# @param mode [Symbol] Namespace handling mode
|
|
35
36
|
# @return [Hash] Updated hash
|
|
36
|
-
def element_to_hash(hash, element)
|
|
37
|
-
add_to_hash(hash, element
|
|
37
|
+
def element_to_hash(hash, element, mode)
|
|
38
|
+
add_to_hash(hash, format_element_name(element, mode), collapse_element(element, mode))
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Format element name using prefix/local and namespace mode
|
|
42
|
+
#
|
|
43
|
+
# @api private
|
|
44
|
+
# @param element [REXML::Element] Element node
|
|
45
|
+
# @param mode [Symbol] Namespace handling mode
|
|
46
|
+
# @return [String] formatted element name
|
|
47
|
+
def format_element_name(element, mode)
|
|
48
|
+
format_name(element.prefix, element.name, mode)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# Format attribute name using prefix/local and namespace mode
|
|
52
|
+
#
|
|
53
|
+
# @api private
|
|
54
|
+
# @param attr [REXML::Attribute] Attribute node
|
|
55
|
+
# @param mode [Symbol] Namespace handling mode
|
|
56
|
+
# @return [String] formatted attribute name
|
|
57
|
+
def format_attr_name(attr, mode)
|
|
58
|
+
format_name(attr.prefix, attr.name, mode)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Produce a name string for a given [prefix, local] tuple
|
|
62
|
+
#
|
|
63
|
+
# @api private
|
|
64
|
+
# @param prefix [String, nil] Namespace prefix
|
|
65
|
+
# @param local [String] Local part of the name
|
|
66
|
+
# @param mode [Symbol] Namespace handling mode
|
|
67
|
+
# @return [String] formatted name
|
|
68
|
+
def format_name(prefix, local, mode)
|
|
69
|
+
(mode == :preserve && prefix && !prefix.empty?) ? "#{prefix}:#{local}" : local
|
|
38
70
|
end
|
|
39
71
|
|
|
40
72
|
# Collapse an element into a hash with attributes and content
|
|
41
73
|
#
|
|
42
74
|
# @api private
|
|
43
75
|
# @param element [REXML::Element] Element to collapse
|
|
76
|
+
# @param mode [Symbol] Namespace handling mode
|
|
44
77
|
# @return [Hash] Hash representation
|
|
45
|
-
def collapse_element(element)
|
|
46
|
-
node_hash = collect_attributes(element)
|
|
78
|
+
def collapse_element(element, mode)
|
|
79
|
+
node_hash = collect_attributes(element, mode)
|
|
47
80
|
|
|
48
81
|
if element.has_elements?
|
|
49
|
-
collect_child_elements(element, node_hash)
|
|
82
|
+
collect_child_elements(element, node_hash, mode)
|
|
50
83
|
add_text_content(node_hash, element) unless whitespace_only?(element)
|
|
51
84
|
elsif node_hash.empty? || !whitespace_only?(element)
|
|
52
85
|
add_text_content(node_hash, element)
|
|
@@ -59,9 +92,34 @@ module MultiXml
|
|
|
59
92
|
#
|
|
60
93
|
# @api private
|
|
61
94
|
# @param element [REXML::Element] Element with attributes
|
|
95
|
+
# @param mode [Symbol] Namespace handling mode
|
|
62
96
|
# @return [Hash] Hash of attribute name-value pairs
|
|
63
|
-
def collect_attributes(element)
|
|
64
|
-
element.attributes.
|
|
97
|
+
def collect_attributes(element, mode)
|
|
98
|
+
element.attributes.each_attribute.with_object({}) do |attr, hash|
|
|
99
|
+
if xmlns_decl?(attr)
|
|
100
|
+
add_attribute_value(hash, xmlns_decl_key(attr), attr.value) if mode == :preserve
|
|
101
|
+
else
|
|
102
|
+
add_attribute_value(hash, format_attr_name(attr, mode), attr.value)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Check whether an attribute represents an xmlns declaration
|
|
108
|
+
#
|
|
109
|
+
# @api private
|
|
110
|
+
# @param attr [REXML::Attribute] Attribute to inspect
|
|
111
|
+
# @return [Boolean] true if xmlns declaration
|
|
112
|
+
def xmlns_decl?(attr)
|
|
113
|
+
attr.prefix == "xmlns" || ((attr.prefix.nil? || attr.prefix.empty?) && attr.name == "xmlns")
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# Build the key for an xmlns declaration under :preserve
|
|
117
|
+
#
|
|
118
|
+
# @api private
|
|
119
|
+
# @param attr [REXML::Attribute] Declaration attribute
|
|
120
|
+
# @return [String] key such as "xmlns" or "xmlns:atom"
|
|
121
|
+
def xmlns_decl_key(attr)
|
|
122
|
+
(attr.prefix == "xmlns") ? "xmlns:#{attr.name}" : "xmlns"
|
|
65
123
|
end
|
|
66
124
|
|
|
67
125
|
# Collect all child elements into a hash
|
|
@@ -69,9 +127,10 @@ module MultiXml
|
|
|
69
127
|
# @api private
|
|
70
128
|
# @param element [REXML::Element] Parent element
|
|
71
129
|
# @param node_hash [Hash] Hash to populate
|
|
130
|
+
# @param mode [Symbol] Namespace handling mode
|
|
72
131
|
# @return [void]
|
|
73
|
-
def collect_child_elements(element, node_hash)
|
|
74
|
-
element.each_element { |child| element_to_hash(node_hash, child) }
|
|
132
|
+
def collect_child_elements(element, node_hash, mode)
|
|
133
|
+
element.each_element { |child| element_to_hash(node_hash, child, mode) }
|
|
75
134
|
end
|
|
76
135
|
|
|
77
136
|
# Add text content from an element to a hash
|
|
@@ -106,6 +165,35 @@ module MultiXml
|
|
|
106
165
|
hash
|
|
107
166
|
end
|
|
108
167
|
|
|
168
|
+
# Add an attribute value while keeping document order on collisions
|
|
169
|
+
#
|
|
170
|
+
# @api private
|
|
171
|
+
# @param hash [Hash] Target hash
|
|
172
|
+
# @param key [String] Attribute key
|
|
173
|
+
# @param value [String] Attribute value
|
|
174
|
+
# @return [Hash] Updated hash
|
|
175
|
+
def add_attribute_value(hash, key, value)
|
|
176
|
+
existing = hash[key]
|
|
177
|
+
hash[key] = case existing
|
|
178
|
+
when nil then value
|
|
179
|
+
when Array then insert_attribute_before_children(existing, value)
|
|
180
|
+
when Hash then [value, existing]
|
|
181
|
+
else [existing, value]
|
|
182
|
+
end
|
|
183
|
+
hash
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
# Insert a later attribute before any child-element entries
|
|
187
|
+
#
|
|
188
|
+
# @api private
|
|
189
|
+
# @param values [Array] Existing colliding values
|
|
190
|
+
# @param value [String] Attribute value to insert
|
|
191
|
+
# @return [Array] Updated value list
|
|
192
|
+
def insert_attribute_before_children(values, value)
|
|
193
|
+
child_index = values.index { |entry| entry.is_a?(Hash) } || values.length
|
|
194
|
+
values.dup.insert(child_index, value)
|
|
195
|
+
end
|
|
196
|
+
|
|
109
197
|
# Check if element contains only whitespace text
|
|
110
198
|
#
|
|
111
199
|
# @api private
|
|
@@ -1,23 +1,29 @@
|
|
|
1
1
|
require "cgi/escape"
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module MultiXML
|
|
4
4
|
module Parsers
|
|
5
|
-
# Shared SAX handler logic for building hash trees from XML events
|
|
5
|
+
# Shared SAX handler logic for building hash trees from XML events.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
7
|
+
# Provides a stack machine used by both NokogiriSax and LibxmlSax
|
|
8
|
+
# handlers. Parser-specific subclasses translate their native callbacks
|
|
9
|
+
# into calls on this entrypoint:
|
|
10
|
+
#
|
|
11
|
+
# - handle_start_element_ns(local, prefix, attr_tuples, ns_decls)
|
|
12
|
+
# where attr_tuples = [[attr_prefix_or_nil, local, value], ...]
|
|
13
|
+
# ns_decls = [[prefix_or_nil, uri], ...]
|
|
10
14
|
#
|
|
11
15
|
# @api private
|
|
12
16
|
module SaxHandler
|
|
13
17
|
# Initialize the handler state
|
|
14
18
|
#
|
|
15
19
|
# @api private
|
|
20
|
+
# @param mode [Symbol] Namespace handling mode
|
|
16
21
|
# @return [void]
|
|
17
|
-
def initialize_handler
|
|
22
|
+
def initialize_handler(mode)
|
|
23
|
+
@mode = mode
|
|
18
24
|
@result = {}
|
|
19
25
|
@stack = [@result]
|
|
20
|
-
@
|
|
26
|
+
@pending = []
|
|
21
27
|
end
|
|
22
28
|
|
|
23
29
|
# Get the parsed result
|
|
@@ -28,31 +34,34 @@ module MultiXml
|
|
|
28
34
|
|
|
29
35
|
private
|
|
30
36
|
|
|
31
|
-
# Get the current element hash
|
|
37
|
+
# Get the current element hash on top of the stack
|
|
32
38
|
#
|
|
33
39
|
# @api private
|
|
34
40
|
# @return [Hash] current hash being built
|
|
35
41
|
def current = @stack.last
|
|
36
42
|
|
|
37
|
-
#
|
|
43
|
+
# Entry point for namespace-aware start events
|
|
38
44
|
#
|
|
39
45
|
# @api private
|
|
40
|
-
# @param
|
|
41
|
-
# @param
|
|
46
|
+
# @param local [String] Local element name
|
|
47
|
+
# @param prefix [String, nil] Element namespace prefix
|
|
48
|
+
# @param attr_tuples [Array] Attributes as [prefix, local, value]
|
|
49
|
+
# @param ns_decls [Array] xmlns declarations as [prefix, uri] pairs
|
|
42
50
|
# @return [void]
|
|
43
|
-
def
|
|
51
|
+
def handle_start_element_ns(local, prefix, attr_tuples, ns_decls)
|
|
44
52
|
child = {TEXT_CONTENT_KEY => +""}
|
|
45
|
-
add_child_to_current(
|
|
53
|
+
add_child_to_current(format_name(prefix, local), child)
|
|
46
54
|
@stack << child
|
|
47
|
-
|
|
55
|
+
|
|
56
|
+
@pending << build_pending_attrs(ns_decls, attr_tuples)
|
|
48
57
|
end
|
|
49
58
|
|
|
50
|
-
#
|
|
59
|
+
# Apply attributes and pop the current element from the stack
|
|
51
60
|
#
|
|
52
61
|
# @api private
|
|
53
62
|
# @return [void]
|
|
54
63
|
def handle_end_element
|
|
55
|
-
|
|
64
|
+
@pending.pop.each { |key, value| add_attr_to_current(key, value) }
|
|
56
65
|
strip_whitespace_content
|
|
57
66
|
@stack.pop
|
|
58
67
|
end
|
|
@@ -66,7 +75,40 @@ module MultiXml
|
|
|
66
75
|
current[TEXT_CONTENT_KEY] << text
|
|
67
76
|
end
|
|
68
77
|
|
|
69
|
-
#
|
|
78
|
+
# Build the list of attributes to apply at element-end
|
|
79
|
+
#
|
|
80
|
+
# @api private
|
|
81
|
+
# @param ns_decls [Array] xmlns declarations
|
|
82
|
+
# @param attr_tuples [Array] Attribute [prefix, local, value] tuples
|
|
83
|
+
# @return [Array<Array>] list of [key, value] pairs
|
|
84
|
+
def build_pending_attrs(ns_decls, attr_tuples)
|
|
85
|
+
preserved_ns_decls(ns_decls) + attr_tuples.map do |prefix, local, value|
|
|
86
|
+
[format_name(prefix, local), CGI.unescapeHTML(value)]
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Transform xmlns declarations into attribute pairs for :preserve mode
|
|
91
|
+
#
|
|
92
|
+
# @api private
|
|
93
|
+
# @param ns_decls [Array] Declarations as [prefix, uri]
|
|
94
|
+
# @return [Array<Array>] [xmlns key, uri] pairs (empty outside :preserve)
|
|
95
|
+
def preserved_ns_decls(ns_decls)
|
|
96
|
+
return [] unless @mode == :preserve
|
|
97
|
+
|
|
98
|
+
ns_decls.map { |prefix, uri| [prefix ? "xmlns:#{prefix}" : "xmlns", uri] }
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Produce a name string for a [prefix, local] tuple
|
|
102
|
+
#
|
|
103
|
+
# @api private
|
|
104
|
+
# @param prefix [String, nil] Namespace prefix
|
|
105
|
+
# @param local [String] Local part of the name
|
|
106
|
+
# @return [String] formatted name
|
|
107
|
+
def format_name(prefix, local)
|
|
108
|
+
(@mode == :preserve && prefix) ? "#{prefix}:#{local}" : local
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Add a child element to the current hash, folding on collision
|
|
70
112
|
#
|
|
71
113
|
# @api private
|
|
72
114
|
# @param name [String] Child element name
|
|
@@ -81,29 +123,39 @@ module MultiXml
|
|
|
81
123
|
end
|
|
82
124
|
end
|
|
83
125
|
|
|
84
|
-
#
|
|
126
|
+
# Add an attribute value to the current hash (attr wins on collision)
|
|
127
|
+
#
|
|
128
|
+
# Attributes are applied at end_element, after children have already
|
|
129
|
+
# populated the hash. When an attribute collides with a child of the
|
|
130
|
+
# same local name, the attribute is placed first in the resulting
|
|
131
|
+
# array (matching DomParser / REXML behavior and existing tests).
|
|
85
132
|
#
|
|
86
133
|
# @api private
|
|
87
|
-
# @param
|
|
88
|
-
# @
|
|
89
|
-
|
|
90
|
-
|
|
134
|
+
# @param key [String] Attribute key
|
|
135
|
+
# @param value [String] Attribute value
|
|
136
|
+
# @return [void]
|
|
137
|
+
def add_attr_to_current(key, value)
|
|
138
|
+
existing = current[key]
|
|
139
|
+
current[key] = case existing
|
|
140
|
+
when nil then value
|
|
141
|
+
when Array then insert_attribute_before_children(existing, value)
|
|
142
|
+
when Hash then [value, existing]
|
|
143
|
+
else [existing, value]
|
|
144
|
+
end
|
|
91
145
|
end
|
|
92
146
|
|
|
93
|
-
#
|
|
147
|
+
# Insert a later attribute before any child-element entries
|
|
94
148
|
#
|
|
95
149
|
# @api private
|
|
96
|
-
# @param
|
|
97
|
-
# @
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
current[name] = existing ? [unescaped, existing] : unescaped
|
|
103
|
-
end
|
|
150
|
+
# @param values [Array] Existing colliding values
|
|
151
|
+
# @param value [String] Attribute value to insert
|
|
152
|
+
# @return [Array] Updated value list
|
|
153
|
+
def insert_attribute_before_children(values, value)
|
|
154
|
+
child_index = values.index { |entry| entry.is_a?(Hash) } || values.length
|
|
155
|
+
values.dup.insert(child_index, value)
|
|
104
156
|
end
|
|
105
157
|
|
|
106
|
-
# Remove empty or whitespace-only text content
|
|
158
|
+
# Remove empty or whitespace-only text content from the current hash
|
|
107
159
|
#
|
|
108
160
|
# @api private
|
|
109
161
|
# @return [void]
|
data/lib/multi_xml/version.rb
CHANGED