multi_xml 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +6 -1
- data/CHANGELOG.md +30 -0
- data/Gemfile +5 -3
- data/README.md +183 -39
- data/Rakefile +8 -1
- data/Steepfile +8 -1
- data/benchmark/overall_parser_benchmark.rb +5 -0
- data/benchmark.rb +1002 -0
- data/lib/multi_xml/concurrency.rb +31 -0
- data/lib/multi_xml/constants.rb +65 -20
- data/lib/multi_xml/deprecated.rb +35 -0
- data/lib/multi_xml/errors.rb +62 -8
- data/lib/multi_xml/file_like.rb +2 -2
- data/lib/multi_xml/helpers.rb +3 -3
- data/lib/multi_xml/options.rb +63 -0
- data/lib/multi_xml/options_normalization.rb +40 -0
- data/lib/multi_xml/parse_support.rb +113 -0
- data/lib/multi_xml/parser.rb +47 -0
- data/lib/multi_xml/parser_resolution.rb +150 -0
- data/lib/multi_xml/parsers/dom_parser.rb +107 -14
- data/lib/multi_xml/parsers/libxml.rb +36 -13
- data/lib/multi_xml/parsers/libxml_sax.rb +104 -19
- data/lib/multi_xml/parsers/nokogiri.rb +36 -13
- data/lib/multi_xml/parsers/nokogiri_sax.rb +47 -19
- data/lib/multi_xml/parsers/oga.rb +87 -15
- data/lib/multi_xml/parsers/ox.rb +120 -37
- data/lib/multi_xml/parsers/rexml.rb +104 -16
- data/lib/multi_xml/parsers/sax_handler.rb +84 -32
- data/lib/multi_xml/version.rb +3 -3
- data/lib/multi_xml.rb +137 -134
- data/sig/multi_xml.rbs +93 -16
- metadata +11 -2
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
module MultiXML
  # Internal helpers for resolving and loading parser backends
  #
  # All methods defined here are private; they are meant to be mixed into
  # the object that owns parser selection.
  #
  # @api private
  module ParserResolution
    private

    # Resolve a parser specification to a module
    #
    # Strings and Symbols are treated as parser names and loaded from
    # ``multi_xml/parsers/``; Modules (including Classes) are used as-is.
    # The result is always passed through {#validate_parser!}.
    #
    # @api private
    # @param spec [Symbol, String, Class, Module] Parser specification
    # @return [Module] Resolved parser module
    # @raise [ParserLoadError] if spec is invalid, the parser file
    #   can't be required, or the resolved parser doesn't satisfy
    #   the parser contract
    def resolve_parser(spec)
      parser = case spec
      when String, Symbol then load_parser(spec)
      when Module then spec
      else raise ParserLoadError, "expected parser to be a Symbol, String, or Module, got #{spec.inspect}"
      end
      validate_parser!(parser)
    rescue ::LoadError => e
      # Translate a failed ``require`` into the library's own error type.
      raise ParserLoadError.build(e)
    end

    # Load a parser by name
    #
    # The name is downcased, required from ``multi_xml/parsers/<name>`` and
    # then looked up as a CamelCased constant under ``Parsers``.
    #
    # @api private
    # @param name [Symbol, String] Parser name
    # @return [Module] Loaded parser module
    # @raise [ParserLoadError] if the name contains anything other than
    #   letters, digits, and underscores
    # @raise [LoadError] if the parser file cannot be required
    def load_parser(name)
      name = name.to_s.downcase
      # The name is interpolated into a require path, so reject anything that
      # could leave the parsers directory ("/", ".", "\\", whitespace, ...)
      # before touching the filesystem. Names containing such characters can
      # never camelize to a valid constant, so nothing loadable is lost.
      unless name.valid_encoding? && name.match?(/\A[[:alnum:]_]+\z/)
        raise ParserLoadError,
          "invalid parser name #{name.inspect}: expected only letters, digits, and underscores"
      end

      require "multi_xml/parsers/#{name}"
      Parsers.const_get(camelize(name))
    end

    # Validate that a parser satisfies the documented contract
    #
    # Custom parsers are accepted as modules/classes, so fail fast
    # during parser resolution rather than later on the first parse
    # call. A parser must respond to ``.parse`` and must either
    # define a ``ParseError`` constant or respond to ``.parse_error``.
    #
    # @api private
    # @param parser [Module] parser class or module
    # @return [Module] the validated parser
    # @raise [ParserLoadError] when the parser is missing a required method
    def validate_parser!(parser)
      raise ParserLoadError, "Parser #{parser} must respond to .parse" unless parser.respond_to?(:parse)

      # ``false`` restricts the lookup to the parser itself, so an inherited
      # or top-level ``ParseError`` does not count.
      unless parser.const_defined?(:ParseError, false) || parser.respond_to?(:parse_error)
        raise ParserLoadError, "Parser #{parser} must define a ParseError constant or a .parse_error method"
      end

      parser
    end

    # Convert underscored string to CamelCase
    #
    # @api private
    # @param name [String] Underscored string
    # @return [String] CamelCased string (e.g. "libxml_sax" becomes "LibxmlSax")
    def camelize(name)
      name.split("_").map(&:capitalize).join
    end

    # Detect the best available parser
    #
    # Prefers a parser library that is already loaded, then tries to require
    # one from the preference list.
    #
    # @api private
    # @return [Symbol] Parser name
    # @raise [NoParserError] if no parser is available
    def detect_parser
      find_loaded_parser || find_available_parser || raise_no_parser_error
    end

    # Parser constant names mapped to their symbols, in preference order
    #
    # @api private
    LOADED_PARSER_CHECKS = {
      Ox: :ox,
      LibXML: :libxml,
      Nokogiri: :nokogiri,
      Oga: :oga
    }.freeze
    private_constant :LOADED_PARSER_CHECKS

    # Find an already-loaded parser library
    #
    # Checks for each library's top-level constant, in preference order.
    #
    # @api private
    # @return [Symbol, nil] Parser name or nil if none loaded
    def find_loaded_parser
      LOADED_PARSER_CHECKS.each do |const_name, parser_name|
        next if skip_on_platform?(parser_name)
        return parser_name if Object.const_defined?(const_name)
      end
      nil
    end

    # Try to load and find an available parser
    #
    # Walks ``PARSER_PREFERENCE`` and returns the first parser whose library
    # can be required.
    #
    # @api private
    # @return [Symbol, nil] Parser name or nil if none available
    def find_available_parser
      PARSER_PREFERENCE.each do |library, parser_name|
        next if skip_on_platform?(parser_name)
        return parser_name if try_require(library)
      end
      nil
    end

    # Whether a parser should be skipped during auto-detection
    #
    # Ox loads on TruffleRuby but its SAX callbacks misbehave under the
    # native interpreter, so type-attributed XML parses to an empty hash
    # and the disallowed-type check is silently bypassed. Skip it during
    # auto-detection so MultiXML falls through to a working backend.
    # Callers that pass ``parser: :ox`` explicitly still get Ox.
    #
    # @api private
    # @param parser_name [Symbol] parser symbol from preference list
    # @return [Boolean] true when this parser must be skipped
    def skip_on_platform?(parser_name)
      parser_name == :ox && RUBY_ENGINE == "truffleruby"
    end

    # Attempt to require a library
    #
    # @api private
    # @param library [String] Library to require
    # @return [Boolean] true if successful, false if LoadError
    def try_require(library)
      require library
      true
    rescue LoadError
      false
    end

    # Raise an error indicating no parser is available
    #
    # @api private
    # @return [void]
    # @raise [NoParserError] always
    def raise_no_parser_error
      raise NoParserError, <<~MSG.chomp
        No XML parser detected. Install one of: ox, nokogiri, libxml-ruby, or oga.
        See https://github.com/sferik/multi_xml for more information.
      MSG
    end
  end
end
|
|
@@ -1,12 +1,21 @@
|
|
|
1
|
-
module MultiXml
|
|
1
|
+
module MultiXML
|
|
2
|
+
# Namespace for all supported XML parser backends
|
|
3
|
+
#
|
|
4
|
+
# Each parser (Nokogiri, LibXML, Ox, Oga, REXML, plus SAX variants) is
|
|
5
|
+
# defined as a module under this namespace and exposes a common `parse`
|
|
6
|
+
# and `parse_error` interface.
|
|
7
|
+
#
|
|
8
|
+
# @api private
|
|
2
9
|
module Parsers
|
|
3
10
|
# Shared DOM traversal logic for converting XML nodes to hashes
|
|
4
11
|
#
|
|
5
12
|
# Used by Nokogiri, LibXML, and Oga parsers.
|
|
6
13
|
# Including modules must implement:
|
|
7
14
|
# - each_child(node) { |child| ... }
|
|
8
|
-
# -
|
|
9
|
-
# -
|
|
15
|
+
# - each_element_attr(node) { |attr| ... } (non-namespace-decl attrs only)
|
|
16
|
+
# - each_namespace_decl(node) { |prefix_or_nil, uri| ... }
|
|
17
|
+
# - element_parts(node) -> [prefix_or_nil, local_name]
|
|
18
|
+
# - attr_parts(attr) -> [prefix_or_nil, local_name]
|
|
10
19
|
#
|
|
11
20
|
# @api private
|
|
12
21
|
module DomParser
|
|
@@ -15,12 +24,14 @@ module MultiXml
|
|
|
15
24
|
# @api private
|
|
16
25
|
# @param node [Object] XML node to convert
|
|
17
26
|
# @param hash [Hash] Accumulator hash for results
|
|
27
|
+
# @param mode [Symbol] Namespace handling mode (:strip, :preserve)
|
|
18
28
|
# @return [Hash] Hash representation of the node
|
|
19
|
-
def node_to_hash(node, hash = {})
|
|
29
|
+
def node_to_hash(node, hash = {}, mode: :strip)
|
|
20
30
|
node_hash = {TEXT_CONTENT_KEY => +""}
|
|
21
|
-
add_value(hash,
|
|
22
|
-
collect_children(node, node_hash)
|
|
23
|
-
|
|
31
|
+
add_value(hash, format_element_name(node, mode), node_hash)
|
|
32
|
+
collect_children(node, node_hash, mode)
|
|
33
|
+
collect_namespace_decls(node, node_hash, mode)
|
|
34
|
+
collect_attributes(node, node_hash, mode)
|
|
24
35
|
strip_whitespace_content(node_hash)
|
|
25
36
|
hash
|
|
26
37
|
end
|
|
@@ -48,11 +59,12 @@ module MultiXml
|
|
|
48
59
|
# @api private
|
|
49
60
|
# @param node [Object] Parent node
|
|
50
61
|
# @param node_hash [Hash] Hash to populate
|
|
62
|
+
# @param mode [Symbol] Namespace handling mode
|
|
51
63
|
# @return [void]
|
|
52
|
-
def collect_children(node, node_hash)
|
|
64
|
+
def collect_children(node, node_hash, mode)
|
|
53
65
|
each_child(node) do |child|
|
|
54
66
|
if child.element?
|
|
55
|
-
node_to_hash(child, node_hash)
|
|
67
|
+
node_to_hash(child, node_hash, mode: mode)
|
|
56
68
|
elsif text_or_cdata?(child)
|
|
57
69
|
node_hash[TEXT_CONTENT_KEY] << child.content
|
|
58
70
|
end
|
|
@@ -68,20 +80,101 @@ module MultiXml
|
|
|
68
80
|
node.text? || node.cdata?
|
|
69
81
|
end
|
|
70
82
|
|
|
83
|
+
# Collect xmlns declarations into the hash under :preserve mode
|
|
84
|
+
#
|
|
85
|
+
# Declarations are unique per prefix on a given element, so no
|
|
86
|
+
# collision handling is needed here.
|
|
87
|
+
#
|
|
88
|
+
# @api private
|
|
89
|
+
# @param node [Object] Node with potential xmlns declarations
|
|
90
|
+
# @param node_hash [Hash] Hash to populate
|
|
91
|
+
# @param mode [Symbol] Namespace handling mode
|
|
92
|
+
# @return [void]
|
|
93
|
+
def collect_namespace_decls(node, node_hash, mode)
|
|
94
|
+
return unless mode == :preserve
|
|
95
|
+
|
|
96
|
+
each_namespace_decl(node) do |prefix, uri|
|
|
97
|
+
node_hash[prefix ? "xmlns:#{prefix}" : "xmlns"] = uri
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
71
101
|
# Collect all attributes from a node
|
|
72
102
|
#
|
|
103
|
+
# Attributes arrive after child elements. When an attribute collides
|
|
104
|
+
# with a child of the same name, the attribute is placed first in the
|
|
105
|
+
# resulting array (e.g. `<user name="A"><name>B</name></user>` →
|
|
106
|
+
# `["A", "B"]`). See `test/attribute_tests.rb`.
|
|
107
|
+
#
|
|
73
108
|
# @api private
|
|
74
109
|
# @param node [Object] Node with attributes
|
|
75
110
|
# @param node_hash [Hash] Hash to populate
|
|
111
|
+
# @param mode [Symbol] Namespace handling mode
|
|
76
112
|
# @return [void]
|
|
77
|
-
def collect_attributes(node, node_hash)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
existing = node_hash[name]
|
|
81
|
-
node_hash[name] = existing ? [attr.value, existing] : attr.value
|
|
113
|
+
def collect_attributes(node, node_hash, mode)
|
|
114
|
+
each_element_attr(node) do |attr|
|
|
115
|
+
add_attribute_value(node_hash, format_attr_name(attr, mode), attr.value)
|
|
82
116
|
end
|
|
83
117
|
end
|
|
84
118
|
|
|
119
|
+
# Format an element's name according to the namespace mode
|
|
120
|
+
#
|
|
121
|
+
# @api private
|
|
122
|
+
# @param node [Object] Element node
|
|
123
|
+
# @param mode [Symbol] Namespace handling mode
|
|
124
|
+
# @return [String] formatted element name
|
|
125
|
+
def format_element_name(node, mode)
|
|
126
|
+
format_name(*element_parts(node), mode)
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# Format an attribute's name according to the namespace mode
|
|
130
|
+
#
|
|
131
|
+
# @api private
|
|
132
|
+
# @param attr [Object] Attribute node
|
|
133
|
+
# @param mode [Symbol] Namespace handling mode
|
|
134
|
+
# @return [String] formatted attribute name
|
|
135
|
+
def format_attr_name(attr, mode)
|
|
136
|
+
format_name(*attr_parts(attr), mode)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
# Produce a name string for a given [prefix, local] tuple
|
|
140
|
+
#
|
|
141
|
+
# @api private
|
|
142
|
+
# @param prefix [String, nil] Namespace prefix (nil for default / unprefixed)
|
|
143
|
+
# @param local [String] Local part of the name
|
|
144
|
+
# @param mode [Symbol] Namespace handling mode
|
|
145
|
+
# @return [String] formatted name
|
|
146
|
+
def format_name(prefix, local, mode)
|
|
147
|
+
(mode == :preserve && prefix) ? "#{prefix}:#{local}" : local
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Add an attribute value, preserving attr-before-child collision order
|
|
151
|
+
#
|
|
152
|
+
# @api private
|
|
153
|
+
# @param hash [Hash] Target hash
|
|
154
|
+
# @param key [String] Attribute key
|
|
155
|
+
# @param value [String] Attribute value
|
|
156
|
+
# @return [void]
|
|
157
|
+
def add_attribute_value(hash, key, value)
|
|
158
|
+
existing = hash[key]
|
|
159
|
+
hash[key] = case existing
|
|
160
|
+
when nil then value
|
|
161
|
+
when Array then insert_attribute_before_children(existing, value)
|
|
162
|
+
when Hash then [value, existing]
|
|
163
|
+
else [existing, value]
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# Insert a later attribute before any child-element entries
|
|
168
|
+
#
|
|
169
|
+
# @api private
|
|
170
|
+
# @param values [Array] Existing colliding values
|
|
171
|
+
# @param value [String] Attribute value to insert
|
|
172
|
+
# @return [Array] Updated value list
|
|
173
|
+
def insert_attribute_before_children(values, value)
|
|
174
|
+
child_index = values.index { |entry| entry.is_a?(Hash) } || values.length
|
|
175
|
+
values.dup.insert(child_index, value)
|
|
176
|
+
end
|
|
177
|
+
|
|
85
178
|
# Remove empty or whitespace-only text content
|
|
86
179
|
#
|
|
87
180
|
# @api private
|
|
@@ -1,50 +1,73 @@
|
|
|
1
|
-
require "libxml"
|
|
1
|
+
require "libxml-ruby"
|
|
2
2
|
require_relative "dom_parser"
|
|
3
3
|
|
|
4
|
-
module MultiXml
|
|
4
|
+
module MultiXML
|
|
5
5
|
module Parsers
|
|
6
6
|
# XML parser using the LibXML library
|
|
7
7
|
#
|
|
8
8
|
# @api private
|
|
9
9
|
module Libxml
|
|
10
|
+
extend MultiXML::Parser
|
|
10
11
|
include DomParser
|
|
11
12
|
extend self
|
|
12
13
|
|
|
13
|
-
#
|
|
14
|
-
#
|
|
14
|
+
# Exception class raised on LibXML parse failure
|
|
15
15
|
# @api private
|
|
16
|
-
|
|
17
|
-
def parse_error = ::LibXML::XML::Error
|
|
16
|
+
ParseError = ::LibXML::XML::Error
|
|
18
17
|
|
|
19
18
|
# Parse XML from an IO object
|
|
20
19
|
#
|
|
21
20
|
# @api private
|
|
22
21
|
# @param io [IO] IO-like object containing XML
|
|
22
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
23
23
|
# @return [Hash] Parsed XML as a hash
|
|
24
24
|
# @raise [LibXML::XML::Error] if XML is malformed
|
|
25
|
-
def parse(io)
|
|
26
|
-
node_to_hash(LibXML::XML::Parser.io(io).parse.root)
|
|
25
|
+
def parse(io, namespaces: :strip)
|
|
26
|
+
node_to_hash(::LibXML::XML::Parser.io(io).parse.root, mode: namespaces)
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
private
|
|
30
30
|
|
|
31
31
|
# Iterate over child nodes
|
|
32
32
|
#
|
|
33
|
+
# @api private
|
|
33
34
|
# @param node [LibXML::XML::Node] Parent node
|
|
34
35
|
# @return [void]
|
|
35
36
|
def each_child(node, &) = node.each_child(&)
|
|
36
37
|
|
|
37
38
|
# Iterate over attribute nodes
|
|
38
39
|
#
|
|
40
|
+
# @api private
|
|
41
|
+
# @param node [LibXML::XML::Node] Element node
|
|
42
|
+
# @return [void]
|
|
43
|
+
def each_element_attr(node, &) = node.each_attr(&)
|
|
44
|
+
|
|
45
|
+
# Yield each xmlns declaration on this element
|
|
46
|
+
#
|
|
47
|
+
# @api private
|
|
39
48
|
# @param node [LibXML::XML::Node] Element node
|
|
40
49
|
# @return [void]
|
|
41
|
-
def
|
|
50
|
+
def each_namespace_decl(node)
|
|
51
|
+
node.namespaces.definitions.each { |ns| yield ns.prefix, ns.href }
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Return [prefix, local] for an element
|
|
55
|
+
#
|
|
56
|
+
# @api private
|
|
57
|
+
# @param node [LibXML::XML::Node] Element node
|
|
58
|
+
# @return [Array<String, nil>] prefix and local name
|
|
59
|
+
def element_parts(node)
|
|
60
|
+
[node.namespaces.namespace&.prefix, node.name]
|
|
61
|
+
end
|
|
42
62
|
|
|
43
|
-
#
|
|
63
|
+
# Return [prefix, local] for an attribute
|
|
44
64
|
#
|
|
45
|
-
# @
|
|
46
|
-
# @
|
|
47
|
-
|
|
65
|
+
# @api private
|
|
66
|
+
# @param attr [LibXML::XML::Attr] Attribute node
|
|
67
|
+
# @return [Array<String, nil>] prefix and local name
|
|
68
|
+
def attr_parts(attr)
|
|
69
|
+
[attr.ns? ? attr.ns.prefix : nil, attr.name]
|
|
70
|
+
end
|
|
48
71
|
end
|
|
49
72
|
end
|
|
50
73
|
end
|
|
@@ -1,40 +1,101 @@
|
|
|
1
|
-
require "libxml"
|
|
1
|
+
require "libxml-ruby"
|
|
2
2
|
require "stringio"
|
|
3
3
|
require_relative "sax_handler"
|
|
4
|
+
require_relative "libxml"
|
|
4
5
|
|
|
5
|
-
module MultiXml
|
|
6
|
+
module MultiXML
|
|
6
7
|
module Parsers
|
|
7
8
|
# SAX-based parser using LibXML (faster for large documents)
|
|
8
9
|
#
|
|
9
10
|
# @api private
|
|
10
11
|
module LibxmlSax
|
|
12
|
+
extend MultiXML::Parser
|
|
13
|
+
|
|
11
14
|
module_function
|
|
12
15
|
|
|
13
|
-
#
|
|
14
|
-
#
|
|
16
|
+
# Exception class raised on LibXML parse failure
|
|
15
17
|
# @api private
|
|
16
|
-
|
|
17
|
-
def parse_error = ::LibXML::XML::Error
|
|
18
|
+
ParseError = ::LibXML::XML::Error
|
|
18
19
|
|
|
19
20
|
# Parse XML from a string or IO object
|
|
20
21
|
#
|
|
21
22
|
# @api private
|
|
22
23
|
# @param xml [String, IO] XML content
|
|
24
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
23
25
|
# @return [Hash] Parsed XML as a hash
|
|
24
26
|
# @raise [LibXML::XML::Error] if XML is malformed
|
|
25
|
-
def parse(xml)
|
|
26
|
-
|
|
27
|
-
return {} if
|
|
27
|
+
def parse(xml, namespaces: :strip)
|
|
28
|
+
source = xml.respond_to?(:read) ? xml.read : xml.to_s
|
|
29
|
+
return {} if source.empty?
|
|
30
|
+
|
|
31
|
+
return parse_with_dom(source, namespaces) if dom_fallback?(source, namespaces)
|
|
32
|
+
|
|
33
|
+
parse_with_sax(source, namespaces)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Detect whether a start tag has attributes that collide after stripping
#
# The source is tokenized so that comments, CDATA sections, and processing
# instructions are consumed as units (markup-looking text inside them is
# not treated as a tag), and start tags are matched quote-aware so a ">"
# inside an attribute value does not end the tag early and hide the
# attributes that follow it.
#
# @api private
# @param source [String] XML source
# @return [Boolean] true when stripped attribute locals collide
def stripped_attribute_collision?(source)
  tokens = source.scan(%r{<!--.*?-->|<!\[CDATA\[.*?\]\]>|<\?.*?\?>|<(?![!?/])(?:[^>"']|"[^"]*"|'[^']*')*>}m)
  tokens.any? do |tag|
    # Comments, CDATA, and PIs were matched only to skip over them.
    next false if tag.start_with?("<!", "<?")

    local_names = attribute_names(tag).map { |name| name.split(":", 2).last }
    local_names.uniq.length < local_names.length
  end
end

# Extract non-xmlns attribute names from a start tag
#
# Each match also consumes the attribute's quoted value (when present) so
# that ``name=`` sequences appearing inside a value are not reported as
# attributes.
#
# @api private
# @param tag [String] Start tag source
# @return [Array<String>] attribute names (possibly prefixed), excluding
#   ``xmlns`` and ``xmlns:*`` declarations
def attribute_names(tag)
  tag.scan(/\s([a-zA-Z_][\w.-]*(?::[a-zA-Z_][\w.-]*)?)\s*=\s*(?:"[^"]*"|'[^']*')?/).flatten.reject do |name|
    name == "xmlns" || name.start_with?("xmlns:")
  end
end
|
|
58
|
+
|
|
59
|
+
# Determine whether libxml_sax must fall back to the DOM parser
|
|
60
|
+
#
|
|
61
|
+
# @api private
|
|
62
|
+
# @param source [String] XML source
|
|
63
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
64
|
+
# @return [Boolean] true when DOM parsing is required
|
|
65
|
+
def dom_fallback?(source, namespaces)
|
|
66
|
+
namespaces != :strip || stripped_attribute_collision?(source)
|
|
67
|
+
end
|
|
28
68
|
|
|
69
|
+
# Parse via the DOM libxml backend
|
|
70
|
+
#
|
|
71
|
+
# @api private
|
|
72
|
+
# @param source [String] XML source
|
|
73
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
74
|
+
# @return [Hash] Parsed XML as a hash
|
|
75
|
+
def parse_with_dom(source, namespaces)
|
|
76
|
+
Libxml.parse(StringIO.new(source), namespaces: namespaces)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Parse via libxml-ruby's SAX parser
|
|
80
|
+
#
|
|
81
|
+
# @api private
|
|
82
|
+
# @param source [String] XML source
|
|
83
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
84
|
+
# @return [Hash] Parsed XML as a hash
|
|
85
|
+
def parse_with_sax(source, namespaces)
|
|
29
86
|
LibXML::XML::Error.set_handler(&LibXML::XML::Error::QUIET_HANDLER)
|
|
30
|
-
handler = Handler.new
|
|
31
|
-
parser = ::LibXML::XML::SaxParser.io(
|
|
87
|
+
handler = Handler.new(namespaces)
|
|
88
|
+
parser = ::LibXML::XML::SaxParser.io(StringIO.new(source))
|
|
32
89
|
parser.callbacks = handler
|
|
33
90
|
parser.parse
|
|
34
91
|
handler.result
|
|
35
92
|
end
|
|
36
93
|
|
|
37
|
-
# LibXML SAX handler
|
|
94
|
+
# LibXML SAX handler.
|
|
95
|
+
#
|
|
96
|
+
# libxml-ruby's namespace-aware callback strips prefixes from the attrs
|
|
97
|
+
# hash, so we rely on the qname-preserving `on_start_element` callback
|
|
98
|
+
# and resolve namespaces via SaxHandler's scope stack.
|
|
38
99
|
#
|
|
39
100
|
# @api private
|
|
40
101
|
class Handler
|
|
@@ -44,9 +105,10 @@ module MultiXml
|
|
|
44
105
|
# Create a new SAX handler
|
|
45
106
|
#
|
|
46
107
|
# @api private
|
|
108
|
+
# @param mode [Symbol] Namespace handling mode
|
|
47
109
|
# @return [Handler] new handler instance
|
|
48
|
-
def initialize
|
|
49
|
-
initialize_handler
|
|
110
|
+
def initialize(mode)
|
|
111
|
+
initialize_handler(mode)
|
|
50
112
|
end
|
|
51
113
|
|
|
52
114
|
# Handle start of document (no-op)
|
|
@@ -63,7 +125,7 @@ module MultiXml
|
|
|
63
125
|
def on_end_document
|
|
64
126
|
end
|
|
65
127
|
|
|
66
|
-
# Handle parse errors (no-op
|
|
128
|
+
# Handle parse errors (no-op; libxml-ruby raises directly)
|
|
67
129
|
#
|
|
68
130
|
# @api private
|
|
69
131
|
# @param _error [String] Error message (unused)
|
|
@@ -73,12 +135,23 @@ module MultiXml
|
|
|
73
135
|
|
|
74
136
|
# Handle start of an element
|
|
75
137
|
#
|
|
138
|
+
# libxml-ruby strips xmlns declarations from attrs and passes through
|
|
139
|
+
# prefixed names for regular attributes. Since libxml_sax only uses
|
|
140
|
+
# this handler in :strip mode, we route through the namespace-aware
|
|
141
|
+
# entrypoint with empty ns_decls and treat attribute qnames as-if
|
|
142
|
+
# they had no namespace — matching the desired :strip output.
|
|
143
|
+
#
|
|
76
144
|
# @api private
|
|
77
|
-
# @param name [String] Element name
|
|
78
|
-
# @param attrs [Hash]
|
|
145
|
+
# @param name [String] Element name (possibly prefixed)
|
|
146
|
+
# @param attrs [Hash] Attributes as name => value
|
|
79
147
|
# @return [void]
|
|
80
148
|
def on_start_element(name, attrs = {})
|
|
81
|
-
|
|
149
|
+
prefix, local = sax_split_qname(name.to_s)
|
|
150
|
+
tuples = attrs.map do |k, v|
|
|
151
|
+
ap, al = sax_split_qname(k.to_s)
|
|
152
|
+
[ap, al, v]
|
|
153
|
+
end
|
|
154
|
+
handle_start_element_ns(local, prefix, tuples, [])
|
|
82
155
|
end
|
|
83
156
|
|
|
84
157
|
# Handle end of an element
|
|
@@ -90,7 +163,19 @@ module MultiXml
|
|
|
90
163
|
handle_end_element
|
|
91
164
|
end
|
|
92
165
|
|
|
93
|
-
|
|
166
|
+
private
|
|
167
|
+
|
|
168
|
+
# Split a prefixed name into [prefix, local]
|
|
169
|
+
#
|
|
170
|
+
# @api private
|
|
171
|
+
# @param name [String] Prefixed or local name
|
|
172
|
+
# @return [Array<String, nil>] prefix and local name
|
|
173
|
+
def sax_split_qname(name)
|
|
174
|
+
p, l = name.split(":", 2)
|
|
175
|
+
l ? [p, l] : [nil, p]
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
# Handle character data (also aliased as `on_cdata_block`)
|
|
94
179
|
#
|
|
95
180
|
# @api private
|
|
96
181
|
# @param text [String] Text content
|
|
@@ -1,53 +1,76 @@
|
|
|
1
1
|
require "nokogiri"
|
|
2
2
|
require_relative "dom_parser"
|
|
3
3
|
|
|
4
|
-
module MultiXml
|
|
4
|
+
module MultiXML
|
|
5
5
|
module Parsers
|
|
6
6
|
# XML parser using the Nokogiri library
|
|
7
7
|
#
|
|
8
8
|
# @api private
|
|
9
9
|
module Nokogiri
|
|
10
|
+
extend MultiXML::Parser
|
|
10
11
|
include DomParser
|
|
11
12
|
extend self
|
|
12
13
|
|
|
13
|
-
#
|
|
14
|
-
#
|
|
14
|
+
# Exception class raised on Nokogiri parse failure
|
|
15
15
|
# @api private
|
|
16
|
-
|
|
17
|
-
def parse_error = ::Nokogiri::XML::SyntaxError
|
|
16
|
+
ParseError = ::Nokogiri::XML::SyntaxError
|
|
18
17
|
|
|
19
18
|
# Parse XML from an IO object
|
|
20
19
|
#
|
|
21
20
|
# @api private
|
|
22
21
|
# @param io [IO] IO-like object containing XML
|
|
22
|
+
# @param namespaces [Symbol] Namespace handling mode
|
|
23
23
|
# @return [Hash] Parsed XML as a hash
|
|
24
24
|
# @raise [Nokogiri::XML::SyntaxError] if XML is malformed
|
|
25
|
-
def parse(io)
|
|
25
|
+
def parse(io, namespaces: :strip)
|
|
26
26
|
doc = ::Nokogiri::XML(io)
|
|
27
27
|
raise doc.errors.first unless doc.errors.empty?
|
|
28
28
|
|
|
29
|
-
node_to_hash(doc.root)
|
|
29
|
+
node_to_hash(doc.root, mode: namespaces)
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
private
|
|
33
33
|
|
|
34
34
|
# Iterate over child nodes
|
|
35
35
|
#
|
|
36
|
+
# @api private
|
|
36
37
|
# @param node [Nokogiri::XML::Node] Parent node
|
|
37
38
|
# @return [void]
|
|
38
39
|
def each_child(node, &) = node.children.each(&)
|
|
39
40
|
|
|
40
|
-
# Iterate over attribute nodes
|
|
41
|
+
# Iterate over attribute nodes (excludes xmlns declarations)
|
|
42
|
+
#
|
|
43
|
+
# @api private
|
|
44
|
+
# @param node [Nokogiri::XML::Node] Element node
|
|
45
|
+
# @return [void]
|
|
46
|
+
def each_element_attr(node, &) = node.attribute_nodes.each(&)
|
|
47
|
+
|
|
48
|
+
# Yield each xmlns declaration on this element
|
|
41
49
|
#
|
|
50
|
+
# @api private
|
|
42
51
|
# @param node [Nokogiri::XML::Node] Element node
|
|
43
52
|
# @return [void]
|
|
44
|
-
def
|
|
53
|
+
def each_namespace_decl(node)
|
|
54
|
+
node.namespace_definitions.each { |ns| yield ns.prefix, ns.href }
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Return [prefix, local] for an element
|
|
58
|
+
#
|
|
59
|
+
# @api private
|
|
60
|
+
# @param node [Nokogiri::XML::Node] Element node
|
|
61
|
+
# @return [Array<String, nil>] prefix and local name
|
|
62
|
+
def element_parts(node)
|
|
63
|
+
[node.namespace&.prefix, node.name]
|
|
64
|
+
end
|
|
45
65
|
|
|
46
|
-
#
|
|
66
|
+
# Return [prefix, local] for an attribute
|
|
47
67
|
#
|
|
48
|
-
# @
|
|
49
|
-
# @
|
|
50
|
-
|
|
68
|
+
# @api private
|
|
69
|
+
# @param attr [Nokogiri::XML::Attr] Attribute node
|
|
70
|
+
# @return [Array<String, nil>] prefix and local name
|
|
71
|
+
def attr_parts(attr)
|
|
72
|
+
[attr.namespace&.prefix, attr.name]
|
|
73
|
+
end
|
|
51
74
|
end
|
|
52
75
|
end
|
|
53
76
|
end
|