multi_xml 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +16 -0
- data/.rubocop.yml +14 -5
- data/CHANGELOG.md +8 -0
- data/Gemfile +12 -9
- data/README.md +1 -1
- data/Rakefile +35 -7
- data/Steepfile +22 -0
- data/lib/multi_xml/constants.rb +134 -0
- data/lib/multi_xml/errors.rb +93 -0
- data/lib/multi_xml/file_like.rb +62 -0
- data/lib/multi_xml/helpers.rb +228 -0
- data/lib/multi_xml/parsers/dom_parser.rb +97 -0
- data/lib/multi_xml/parsers/libxml.rb +35 -18
- data/lib/multi_xml/parsers/libxml_sax.rb +103 -0
- data/lib/multi_xml/parsers/nokogiri.rb +39 -22
- data/lib/multi_xml/parsers/nokogiri_sax.rb +102 -0
- data/lib/multi_xml/parsers/oga.rb +48 -51
- data/lib/multi_xml/parsers/ox.rb +99 -57
- data/lib/multi_xml/parsers/rexml.rb +84 -78
- data/lib/multi_xml/parsers/sax_handler.rb +117 -0
- data/lib/multi_xml/version.rb +5 -1
- data/lib/multi_xml.rb +173 -269
- data/sig/multi_xml.rbs +227 -0
- metadata +21 -5
- data/lib/multi_xml/parsers/libxml2_parser.rb +0 -70
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
module MultiXml
  # Methods for transforming parsed XML hash structures
  #
  # These helper methods handle key transformation and type casting
  # of parsed XML data structures. Every method is defined under
  # +module_function+, so each one is available both as a singleton method
  # on +Helpers+ and as a private instance method for modules that
  # include or extend +Helpers+.
  #
  # @api public
  module Helpers
    module_function

    # Recursively convert all hash keys to symbols
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @return [Hash, Array, Object] Transformed data with symbolized keys
    # @example Symbolize hash keys
    #   symbolize_keys({"name" => "John"}) #=> {name: "John"}
    def symbolize_keys(data)
      transform_keys(data, &:to_sym)
    end

    # Recursively convert dashes in hash keys to underscores
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @return [Hash, Array, Object] Transformed data with undasherized keys
    # @example Convert dashed keys
    #   undasherize_keys({"first-name" => "John"}) #=> {"first_name" => "John"}
    def undasherize_keys(data)
      transform_keys(data) { |key| key.tr("-", "_") }
    end

    # Recursively typecast XML values based on type attributes
    #
    # Hashes and arrays are dispatched to their specific typecasters;
    # any other value is returned unchanged.
    #
    # @api private
    # @param value [Hash, Array, Object] Value to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Typecasted value
    # @raise [DisallowedTypeError] if a disallowed type is encountered
    # @example Typecast integer value
    #   typecast_xml_value({"__content__" => "42", "type" => "integer"})
    #   #=> 42
    def typecast_xml_value(value, disallowed_types = DISALLOWED_TYPES)
      case value
      when Hash then typecast_hash(value, disallowed_types)
      when Array then typecast_array(value, disallowed_types)
      else value
      end
    end

    # Typecast array elements and unwrap single-element arrays
    #
    # The array is typecast in place (+map!+). The length check uses
    # +size+ rather than +one?+: +one?+ counts truthy elements, which
    # would collapse +[nil, "x"]+ to +nil+ and leave +[false]+ wrapped.
    #
    # @api private
    # @param array [Array] Array to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object, Array] Typecasted array or single element
    def typecast_array(array, disallowed_types)
      array.map! { |item| typecast_xml_value(item, disallowed_types) }
      (array.size == 1) ? array.first : array
    end

    # Typecast a hash based on its type attribute
    #
    # @api private
    # @param hash [Hash] Hash to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Typecasted value
    # @raise [DisallowedTypeError] if type is disallowed
    def typecast_hash(hash, disallowed_types)
      type = hash["type"]
      raise DisallowedTypeError, type if disallowed_type?(type, disallowed_types)

      convert_hash(hash, type, disallowed_types)
    end

    # Check if a type is in the disallowed list
    #
    # A Hash under the "type" key is never treated as a type attribute
    # and is therefore never disallowed.
    #
    # @api private
    # @param type [String, nil] Type to check
    # @param disallowed_types [Array<String>] Disallowed type list
    # @return [Boolean] true if type is disallowed
    def disallowed_type?(type, disallowed_types)
      type && !type.is_a?(Hash) && disallowed_types.include?(type)
    end

    # Convert a hash based on its type and content
    #
    # The checks run in a fixed order: array extraction, text content
    # conversion, empty string, nil, and finally recursive typecasting
    # of the children.
    #
    # @api private
    # @param hash [Hash] Hash to convert
    # @param type [String, nil] Type attribute value
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Converted value
    def convert_hash(hash, type, disallowed_types)
      return extract_array_entries(hash, disallowed_types) if type == "array"
      return convert_text_content(hash) if hash.key?(TEXT_CONTENT_KEY)
      return "" if type == "string" && !hash["nil"].eql?("true")
      return nil if empty_value?(hash, type)

      typecast_children(hash, disallowed_types)
    end

    # Typecast all child values in a hash
    #
    # @api private
    # @param hash [Hash] Hash with children to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Hash, StringIO] Typecasted hash or unwrapped file
    def typecast_children(hash, disallowed_types)
      result = hash.transform_values { |v| typecast_xml_value(v, disallowed_types) }
      unwrap_file_if_present(result)
    end

    # Extract array entries from element with type="array"
    #
    # @api private
    # @param hash [Hash] Hash containing array entries
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Array] Extracted and typecasted entries
    # @see https://github.com/jnunemaker/httparty/issues/102
    def extract_array_entries(hash, disallowed_types)
      entries = find_array_entries(hash)
      return [] unless entries

      wrap_and_typecast(entries, disallowed_types)
    end

    # Find array or hash entries in a hash, excluding the type key
    #
    # Returns the first Array or Hash value in iteration order.
    #
    # @api private
    # @param hash [Hash] Hash to search
    # @return [Array, Hash, nil] Found entries or nil
    def find_array_entries(hash)
      hash.each do |key, value|
        return value if !key.eql?("type") && (value.is_a?(Array) || value.is_a?(Hash))
      end
      nil
    end

    # Wrap hash in array if needed and typecast all entries
    #
    # @api private
    # @param entries [Array, Hash] Entries to process
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Array] Typecasted entries
    def wrap_and_typecast(entries, disallowed_types)
      entries = [entries] if entries.is_a?(Hash)
      entries.map { |entry| typecast_xml_value(entry, disallowed_types) }
    end

    # Convert text content using type converters
    #
    # @api private
    # @param hash [Hash] Hash containing text content and type
    # @return [Object] Converted value
    def convert_text_content(hash)
      content = hash.fetch(TEXT_CONTENT_KEY)
      converter = TYPE_CONVERTERS[hash["type"]]

      return unwrap_if_simple(hash, content) unless converter

      apply_converter(hash, content, converter)
    end

    # Unwrap value if hash has no other significant keys
    #
    # @api private
    # @param hash [Hash] Original hash
    # @param value [Object] Converted value
    # @return [Object, Hash] Value or hash with merged content
    def unwrap_if_simple(hash, value)
      (hash.size > 1) ? hash.merge(TEXT_CONTENT_KEY => value) : value
    end

    # Check if a hash represents an empty value
    #
    # @api private
    # @param hash [Hash] Hash to check
    # @param type [String, nil] Type attribute value
    # @return [Boolean] true if value should be nil
    def empty_value?(hash, type)
      hash.empty? ||
        hash["nil"] == "true" ||
        (type && hash.size == 1 && !type.is_a?(Hash))
    end

    # NOTE: the helpers below are deliberately still defined under
    # +module_function+. A bare +private+ here would switch module-function
    # mode off, leaving them without singleton copies and breaking calls
    # such as +Helpers.symbolize_keys+. They are hidden from outside
    # callers with +private_class_method+ at the end of the module.

    # Recursively transform hash keys using a block
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @return [Hash, Array, Object] Transformed data
    def transform_keys(data, &block)
      case data
      when Hash then data.each_with_object(
        {} #: Hash[Symbol, MultiXml::xmlValue] # rubocop:disable Layout/LeadingCommentSpace
      ) { |(key, value), acc| acc[yield(key)] = transform_keys(value, &block) }
      when Array then data.map { |item| transform_keys(item, &block) }
      else data
      end
    end

    # Unwrap a file object from the result hash if present
    #
    # @api private
    # @param result [Hash] Hash that may contain a file
    # @return [Hash, StringIO] The file if present, otherwise the hash
    def unwrap_file_if_present(result)
      file = result["file"]
      file.is_a?(StringIO) ? file : result
    end

    # Apply a type converter to content
    #
    # One-argument converters have the "type" key removed from +hash+
    # (in place) before the converted value is unwrapped.
    #
    # @api private
    # @param hash [Hash] Original hash with type info
    # @param content [String] Content to convert
    # @param converter [Proc] Converter to apply
    # @return [Object] Converted value
    def apply_converter(hash, content, converter)
      # Binary converters need access to entity attributes (e.g., encoding, name)
      return converter.call(content, hash) if converter.arity == 2

      hash.delete("type")
      unwrap_if_simple(hash, converter.call(content))
    end

    # Keep the internal helpers callable from the module functions above
    # while hiding them from external callers of +Helpers+.
    private_class_method :transform_keys, :unwrap_file_if_present, :apply_converter
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module MultiXml
  module Parsers
    # Shared DOM-walking code that turns an XML node tree into nested hashes
    #
    # Used by the Nokogiri, LibXML, and Oga parsers. A module mixing this
    # in has to supply three adapter methods:
    # - each_child(node) { |child| ... }
    # - each_attr(node) { |attr| ... }
    # - node_name(node) -> String
    #
    # @api private
    module DomParser
      # Build the hash representation of a node and store it in +hash+
      #
      # The node's own hash is registered under its name first and then
      # filled in with children, attributes, and text content.
      #
      # @api private
      # @param node [Object] XML node to convert
      # @param hash [Hash] Accumulator that receives the node's entry
      # @return [Hash] The accumulator hash
      def node_to_hash(node, hash = {})
        entry = {TEXT_CONTENT_KEY => +""}
        add_value(hash, node_name(node), entry)
        collect_children(node, entry)
        collect_attributes(node, entry)
        strip_whitespace_content(entry)
        hash
      end

      private

      # Store a value under a key, growing into an array on repeats
      #
      # @api private
      # @param hash [Hash] Target hash
      # @param key [String] Key to store under
      # @param value [Object] Value to store
      # @return [void]
      def add_value(hash, key, value)
        current = hash[key]
        hash[key] =
          if current.is_a?(Array)
            current << value
          elsif current.is_a?(Hash)
            [current, value]
          else
            value
          end
      end

      # Walk the children of a node, recursing into elements and
      # appending text/CDATA content to the node's text buffer
      #
      # @api private
      # @param node [Object] Parent node
      # @param node_hash [Hash] Hash to populate
      # @return [void]
      def collect_children(node, node_hash)
        each_child(node) do |child|
          next node_to_hash(child, node_hash) if child.element?

          node_hash[TEXT_CONTENT_KEY] << child.content if text_or_cdata?(child)
        end
      end

      # Tell whether a node carries text (plain text or CDATA)
      #
      # @api private
      # @param node [Object] Node to check
      # @return [Boolean] true if text or CDATA
      def text_or_cdata?(node)
        node.text? || node.cdata?
      end

      # Copy every attribute of a node into its hash
      #
      # When the name is already taken (truthy existing value), the entry
      # becomes a two-element array with the attribute value first.
      #
      # @api private
      # @param node [Object] Node with attributes
      # @param node_hash [Hash] Hash to populate
      # @return [void]
      def collect_attributes(node, node_hash)
        each_attr(node) do |attribute|
          key = node_name(attribute)
          previous = node_hash[key]
          node_hash[key] =
            if previous
              [attribute.value, previous]
            else
              attribute.value
            end
        end
      end

      # Drop the text entry when it is empty, or when it is only
      # whitespace and the hash holds other keys as well
      #
      # @api private
      # @param node_hash [Hash] Hash to clean up
      # @return [void]
      def strip_whitespace_content(node_hash)
        text = node_hash[TEXT_CONTENT_KEY]
        return unless text.empty? || (node_hash.size > 1 && text.strip.empty?)

        node_hash.delete(TEXT_CONTENT_KEY)
      end
    end
  end
end
|
|
@@ -1,33 +1,50 @@
|
|
|
1
|
-
require "libxml"
|
|
2
|
-
|
|
1
|
+
require "libxml"
|
|
2
|
+
require_relative "dom_parser"
|
|
3
3
|
|
|
4
4
|
module MultiXml
  module Parsers
    # DOM parser backed by the LibXML library
    #
    # Supplies the node adapter methods that DomParser relies on.
    #
    # @api private
    module Libxml
      include DomParser
      extend self

      # Error class callers should rescue for this parser
      #
      # @api private
      # @return [Class] LibXML::XML::Error
      def parse_error
        ::LibXML::XML::Error
      end

      # Parse XML read from an IO object into a hash
      #
      # @api private
      # @param io [IO] IO-like object containing XML
      # @return [Hash] Parsed XML as a hash
      # @raise [LibXML::XML::Error] if XML is malformed
      def parse(io)
        document = LibXML::XML::Parser.io(io).parse
        node_to_hash(document.root)
      end

      private

      # Yield each child of a node
      #
      # @param node [LibXML::XML::Node] Parent node
      # @return [void]
      def each_child(node, &block)
        node.each_child(&block)
      end

      # Yield each attribute of a node
      #
      # @param node [LibXML::XML::Node] Element node
      # @return [void]
      def each_attr(node, &block)
        node.each_attr(&block)
      end

      # Name of a node or attribute
      #
      # @param node [LibXML::XML::Node] Node to get name from
      # @return [String] Node name
      def node_name(node)
        node.name
      end
    end
  end
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
require "libxml"
|
|
2
|
+
require "stringio"
|
|
3
|
+
require_relative "sax_handler"
|
|
4
|
+
|
|
5
|
+
module MultiXml
  module Parsers
    # SAX-based parser using LibXML (faster for large documents)
    #
    # @api private
    module LibxmlSax
      module_function

      # Error class callers should rescue for this parser
      #
      # @api private
      # @return [Class] LibXML::XML::Error
      def parse_error
        ::LibXML::XML::Error
      end

      # Parse XML given as a string or an IO object
      #
      # Anything that does not respond to +read+ is wrapped in a StringIO.
      # Exhausted input yields an empty hash without invoking the parser.
      #
      # @api private
      # @param xml [String, IO] XML content
      # @return [Hash] Parsed XML as a hash
      # @raise [LibXML::XML::Error] if XML is malformed
      def parse(xml)
        io =
          if xml.respond_to?(:read)
            xml
          else
            StringIO.new(xml)
          end
        return {} if io.eof?

        # Install LibXML's quiet error handler before parsing.
        LibXML::XML::Error.set_handler(&LibXML::XML::Error::QUIET_HANDLER)
        handler = Handler.new
        sax_parser = ::LibXML::XML::SaxParser.io(io)
        sax_parser.callbacks = handler
        sax_parser.parse
        handler.result
      end

      # LibXML SAX callback object that builds a hash tree while parsing
      #
      # The tree-building methods it calls (+initialize_handler+,
      # +handle_start_element+, +handle_end_element+, +append_text+) are
      # not defined here; they are expected to come from SaxHandler.
      #
      # @api private
      class Handler
        include ::LibXML::XML::SaxParser::Callbacks
        include SaxHandler

        # Set up a fresh handler
        #
        # @api private
        # @return [Handler] new handler instance
        def initialize
          initialize_handler
        end

        # Document start callback (no-op)
        #
        # @api private
        # @return [void]
        def on_start_document; end

        # Document end callback (no-op)
        #
        # @api private
        # @return [void]
        def on_end_document; end

        # Parse error callback (no-op, LibXML raises directly)
        #
        # @api private
        # @param _error [String] Error message (unused)
        # @return [void]
        def on_error(_error); end

        # Element start callback
        #
        # @api private
        # @param name [String] Element name
        # @param attrs [Hash] Element attributes
        # @return [void]
        def on_start_element(name, attrs = {})
          handle_start_element(name, attrs)
        end

        # Element end callback
        #
        # @api private
        # @param _name [String] Element name (unused)
        # @return [void]
        def on_end_element(_name)
          handle_end_element
        end

        # Character data callback; CDATA blocks are handled the same way
        #
        # @api private
        # @param text [String] Text content
        # @return [void]
        def on_characters(text)
          append_text(text)
        end
        alias on_cdata_block on_characters
      end
    end
  end
end
|
|
@@ -1,36 +1,53 @@
|
|
|
1
|
-
require "nokogiri"
|
|
2
|
-
|
|
1
|
+
require "nokogiri"
|
|
2
|
+
require_relative "dom_parser"
|
|
3
3
|
|
|
4
4
|
module MultiXml
  module Parsers
    # DOM parser backed by the Nokogiri library
    #
    # Supplies the node adapter methods that DomParser relies on.
    #
    # @api private
    module Nokogiri
      include DomParser
      extend self

      # Error class callers should rescue for this parser
      #
      # @api private
      # @return [Class] Nokogiri::XML::SyntaxError
      def parse_error
        ::Nokogiri::XML::SyntaxError
      end

      # Parse XML read from an IO object into a hash
      #
      # The first error recorded on the parsed document, if any, is raised.
      #
      # @api private
      # @param io [IO] IO-like object containing XML
      # @return [Hash] Parsed XML as a hash
      # @raise [Nokogiri::XML::SyntaxError] if XML is malformed
      def parse(io)
        document = ::Nokogiri::XML(io)
        problems = document.errors
        raise problems.first unless problems.empty?

        node_to_hash(document.root)
      end

      private

      # Yield each child of a node
      #
      # @param node [Nokogiri::XML::Node] Parent node
      # @return [void]
      def each_child(node, &block)
        node.children.each(&block)
      end

      # Yield each attribute node of an element
      #
      # @param node [Nokogiri::XML::Node] Element node
      # @return [void]
      def each_attr(node, &block)
        node.attribute_nodes.each(&block)
      end

      # Name of a node or attribute
      #
      # @param node [Nokogiri::XML::Node] Node to get name from
      # @return [String] Node name
      def node_name(node)
        node.node_name
      end
    end
  end
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
require "nokogiri"
|
|
2
|
+
require "stringio"
|
|
3
|
+
require_relative "sax_handler"
|
|
4
|
+
|
|
5
|
+
module MultiXml
  module Parsers
    # SAX-based parser using Nokogiri (faster for large documents)
    #
    # @api private
    module NokogiriSax
      module_function

      # Error class callers should rescue for this parser
      #
      # @api private
      # @return [Class] Nokogiri::XML::SyntaxError
      def parse_error
        ::Nokogiri::XML::SyntaxError
      end

      # Parse XML given as a string or an IO object
      #
      # Anything that does not respond to +read+ is wrapped in a StringIO.
      # Exhausted input yields an empty hash without invoking the parser.
      #
      # @api private
      # @param xml [String, IO] XML content
      # @return [Hash] Parsed XML as a hash
      # @raise [Nokogiri::XML::SyntaxError] if XML is malformed
      def parse(xml)
        io =
          if xml.respond_to?(:read)
            xml
          else
            StringIO.new(xml)
          end
        return {} if io.eof?

        handler = Handler.new
        sax_parser = ::Nokogiri::XML::SAX::Parser.new(handler)
        sax_parser.parse(io)
        handler.result
      end

      # Nokogiri SAX document that builds a hash tree while parsing
      #
      # The tree-building methods it calls (+initialize_handler+,
      # +handle_start_element+, +handle_end_element+, +append_text+) are
      # not defined here; they are expected to come from SaxHandler.
      #
      # @api private
      class Handler < ::Nokogiri::XML::SAX::Document
        include SaxHandler

        # Set up a fresh handler
        #
        # @api private
        # @return [Handler] new handler instance
        def initialize
          super
          initialize_handler
        end

        # Document start callback (no-op)
        #
        # @api private
        # @return [void]
        def start_document; end

        # Document end callback (no-op)
        #
        # @api private
        # @return [void]
        def end_document; end

        # Parse error callback; turns the message into an exception
        #
        # @api private
        # @param message [String] Error message
        # @return [void]
        # @raise [Nokogiri::XML::SyntaxError] always
        def error(message)
          raise ::Nokogiri::XML::SyntaxError, message
        end

        # Element start callback
        #
        # @api private
        # @param name [String] Element name
        # @param attrs [Array] Element attributes as pairs
        # @return [void]
        def start_element(name, attrs = [])
          handle_start_element(name, attrs)
        end

        # Element end callback
        #
        # @api private
        # @param _name [String] Element name (unused)
        # @return [void]
        def end_element(_name)
          handle_end_element
        end

        # Character data callback; CDATA blocks are handled the same way
        #
        # @api private
        # @param text [String] Text content
        # @return [void]
        def characters(text)
          append_text(text)
        end
        alias cdata_block characters
      end
    end
  end
end
|