multi_xml 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,228 @@
1
module MultiXml
  # Methods for transforming parsed XML hash structures
  #
  # These helper methods handle key transformation and type casting
  # of parsed XML data structures. Every method is defined under
  # +module_function+, so each one is available both as a singleton
  # method on the module (+Helpers.symbolize_keys(...)+) and as a private
  # instance method when the module is mixed in.
  #
  # @api public
  module Helpers
    module_function

    # Recursively convert all hash keys to symbols
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @return [Hash, Array, Object] Transformed data with symbolized keys
    # @example Symbolize hash keys
    #   symbolize_keys({"name" => "John"}) #=> {name: "John"}
    def symbolize_keys(data)
      transform_keys(data, &:to_sym)
    end

    # Recursively convert dashes in hash keys to underscores
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @return [Hash, Array, Object] Transformed data with undasherized keys
    # @example Convert dashed keys
    #   undasherize_keys({"first-name" => "John"}) #=> {"first_name" => "John"}
    def undasherize_keys(data)
      transform_keys(data) { |key| key.tr("-", "_") }
    end

    # Recursively typecast XML values based on type attributes
    #
    # Hashes and arrays are walked; every other value is returned untouched.
    #
    # @api private
    # @param value [Hash, Array, Object] Value to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Typecasted value
    # @raise [DisallowedTypeError] if a disallowed type is encountered
    # @example Typecast integer value
    #   typecast_xml_value({"__content__" => "42", "type" => "integer"})
    #   #=> 42
    def typecast_xml_value(value, disallowed_types = DISALLOWED_TYPES)
      case value
      when Hash then typecast_hash(value, disallowed_types)
      when Array then typecast_array(value, disallowed_types)
      else value
      end
    end

    # Typecast array elements and unwrap single-element arrays
    #
    # The array is typecast in place (+map!+), so the caller's array is
    # modified.
    #
    # @api private
    # @param array [Array] Array to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object, Array] Typecasted array or single element
    def typecast_array(array, disallowed_types)
      array.map! { |item| typecast_xml_value(item, disallowed_types) }
      # Compare the length explicitly: Enumerable#one? counts *truthy*
      # elements, so it would collapse [nil, "text"] to nil and would fail
      # to unwrap [false] or [nil].
      (array.size == 1) ? array.first : array
    end

    # Typecast a hash based on its type attribute
    #
    # @api private
    # @param hash [Hash] Hash to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Typecasted value
    # @raise [DisallowedTypeError] if type is disallowed
    def typecast_hash(hash, disallowed_types)
      type = hash["type"]
      raise DisallowedTypeError, type if disallowed_type?(type, disallowed_types)

      convert_hash(hash, type, disallowed_types)
    end

    # Check if a type is in the disallowed list
    #
    # A Hash under the "type" key is a child element named "type", not a
    # type attribute, so it is never treated as disallowed.
    #
    # @api private
    # @param type [String, nil] Type to check
    # @param disallowed_types [Array<String>] Disallowed type list
    # @return [Boolean] truthy if type is disallowed
    def disallowed_type?(type, disallowed_types)
      type && !type.is_a?(Hash) && disallowed_types.include?(type)
    end

    # Convert a hash based on its type and content
    #
    # The checks run in priority order: explicit arrays, text content,
    # empty strings, nil/empty values, and finally nested children.
    #
    # @api private
    # @param hash [Hash] Hash to convert
    # @param type [String, nil] Type attribute value
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Object] Converted value
    def convert_hash(hash, type, disallowed_types)
      return extract_array_entries(hash, disallowed_types) if type == "array"
      return convert_text_content(hash) if hash.key?(TEXT_CONTENT_KEY)
      return "" if type == "string" && !hash["nil"].eql?("true")
      return nil if empty_value?(hash, type)

      typecast_children(hash, disallowed_types)
    end

    # Typecast all child values in a hash
    #
    # @api private
    # @param hash [Hash] Hash with children to typecast
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Hash, StringIO] Typecasted hash or unwrapped file
    def typecast_children(hash, disallowed_types)
      result = hash.transform_values { |v| typecast_xml_value(v, disallowed_types) }
      unwrap_file_if_present(result)
    end

    # Extract array entries from element with type="array"
    #
    # @api private
    # @param hash [Hash] Hash containing array entries
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Array] Extracted and typecasted entries
    # @see https://github.com/jnunemaker/httparty/issues/102
    def extract_array_entries(hash, disallowed_types)
      entries = find_array_entries(hash)
      return [] unless entries

      wrap_and_typecast(entries, disallowed_types)
    end

    # Find array or hash entries in a hash, excluding the type key
    #
    # Returns the first Array or Hash value in iteration order.
    #
    # @api private
    # @param hash [Hash] Hash to search
    # @return [Array, Hash, nil] Found entries or nil
    def find_array_entries(hash)
      hash.each do |key, value|
        return value if !key.eql?("type") && (value.is_a?(Array) || value.is_a?(Hash))
      end
      nil
    end

    # Wrap hash in array if needed and typecast all entries
    #
    # @api private
    # @param entries [Array, Hash] Entries to process
    # @param disallowed_types [Array<String>] Types to reject
    # @return [Array] Typecasted entries
    def wrap_and_typecast(entries, disallowed_types)
      entries = [entries] if entries.is_a?(Hash)
      entries.map { |entry| typecast_xml_value(entry, disallowed_types) }
    end

    # Convert text content using type converters
    #
    # @api private
    # @param hash [Hash] Hash containing text content and type
    # @return [Object] Converted value
    def convert_text_content(hash)
      content = hash.fetch(TEXT_CONTENT_KEY)
      converter = TYPE_CONVERTERS[hash["type"]]

      return unwrap_if_simple(hash, content) unless converter

      apply_converter(hash, content, converter)
    end

    # Unwrap value if hash has no other significant keys
    #
    # @api private
    # @param hash [Hash] Original hash
    # @param value [Object] Converted value
    # @return [Object, Hash] Value or hash with merged content
    def unwrap_if_simple(hash, value)
      (hash.size > 1) ? hash.merge(TEXT_CONTENT_KEY => value) : value
    end

    # Check if a hash represents an empty value
    #
    # @api private
    # @param hash [Hash] Hash to check
    # @param type [String, nil] Type attribute value
    # @return [Boolean] truthy if value should be nil
    def empty_value?(hash, type)
      hash.empty? ||
        hash["nil"] == "true" ||
        (type && hash.size == 1 && !type.is_a?(Hash))
    end

    # NOTE: there is deliberately no bare `private` here. Calling `private`
    # without arguments switches off module_function mode, which would leave
    # the helpers below without singleton copies and make calls such as
    # `Helpers.symbolize_keys` fail. module_function already makes the
    # instance-side methods private; the singleton copies are hidden by the
    # `private_class_method` call at the end of the module.

    # Recursively transform hash keys using a block
    #
    # @api private
    # @param data [Hash, Array, Object] Data to transform
    # @yieldparam key [Object] Original hash key
    # @yieldreturn [Object] Replacement key
    # @return [Hash, Array, Object] Transformed data
    def transform_keys(data, &block)
      case data
      when Hash then data.each_with_object(
        {} #: Hash[Symbol, MultiXml::xmlValue] # rubocop:disable Layout/LeadingCommentSpace
      ) { |(key, value), acc| acc[yield(key)] = transform_keys(value, &block) }
      when Array then data.map { |item| transform_keys(item, &block) }
      else data
      end
    end

    # Unwrap a file object from the result hash if present
    #
    # @api private
    # @param result [Hash] Hash that may contain a file
    # @return [Hash, StringIO] The file if present, otherwise the hash
    def unwrap_file_if_present(result)
      file = result["file"]
      file.is_a?(StringIO) ? file : result
    end

    # Apply a type converter to content
    #
    # For one-argument converters the "type" key is removed from +hash+
    # in place before the converted value is unwrapped.
    #
    # @api private
    # @param hash [Hash] Original hash with type info
    # @param content [String] Content to convert
    # @param converter [Proc] Converter to apply
    # @return [Object] Converted value
    def apply_converter(hash, content, converter)
      # Binary converters need access to entity attributes (e.g., encoding, name)
      return converter.call(content, hash) if converter.arity == 2

      hash.delete("type")
      unwrap_if_simple(hash, converter.call(content))
    end

    # Keep the internal helpers out of the module's public singleton API
    # while still letting the public module functions call them.
    private_class_method :transform_keys, :unwrap_file_if_present, :apply_converter
  end
end
@@ -0,0 +1,97 @@
1
module MultiXml
  module Parsers
    # Shared tree-walking code that turns DOM nodes into nested hashes
    #
    # Used by Nokogiri, LibXML, and Oga parsers.
    # A module that includes this one has to supply:
    # - each_child(node) { |child| ... }
    # - each_attr(node) { |attr| ... }
    # - node_name(node) -> String
    #
    # @api private
    module DomParser
      # Build the hash form of +node+ and store it in +hash+ under the
      # node's name
      #
      # @api private
      # @param node [Object] XML node to convert
      # @param hash [Hash] Accumulator that receives the converted node
      # @return [Hash] The accumulator passed in (or a new one)
      def node_to_hash(node, hash = {})
        current = {TEXT_CONTENT_KEY => +""}
        # Register the node first, then fill its hash in place.
        add_value(hash, node_name(node), current)
        collect_children(node, current)
        collect_attributes(node, current)
        strip_whitespace_content(current)
        hash
      end

      private

      # Store +value+ under +key+, collecting repeated keys into an array
      #
      # @api private
      # @param hash [Hash] Target hash
      # @param key [String] Key to add
      # @param value [Object] Value to add
      # @return [void]
      def add_value(hash, key, value)
        previous = hash[key]
        hash[key] =
          if previous.is_a?(Array)
            previous << value
          elsif previous.is_a?(Hash)
            [previous, value]
          else
            value
          end
      end

      # Convert element children and accumulate text/CDATA children
      #
      # @api private
      # @param node [Object] Parent node
      # @param node_hash [Hash] Hash to populate
      # @return [void]
      def collect_children(node, node_hash)
        each_child(node) do |child|
          next node_to_hash(child, node_hash) if child.element?
          next unless text_or_cdata?(child)

          node_hash[TEXT_CONTENT_KEY] << child.content
        end
      end

      # Tell whether a node carries character data (text or CDATA)
      #
      # @api private
      # @param node [Object] Node to check
      # @return [Boolean] true if text or CDATA
      def text_or_cdata?(node)
        node.text? || node.cdata?
      end

      # Copy a node's attributes into its hash
      #
      # When a key is already taken, the attribute value is placed in
      # front of the existing entry inside a two-element array.
      #
      # @api private
      # @param node [Object] Node with attributes
      # @param node_hash [Hash] Hash to populate
      # @return [void]
      def collect_attributes(node, node_hash)
        each_attr(node) do |attribute|
          key = node_name(attribute)
          prior = node_hash[key]
          attribute_value = attribute.value
          node_hash[key] =
            if prior
              [attribute_value, prior]
            else
              attribute_value
            end
        end
      end

      # Drop the text entry when it is empty, or when it is only
      # whitespace and the hash holds other keys
      #
      # @api private
      # @param node_hash [Hash] Hash to clean up
      # @return [void]
      def strip_whitespace_content(node_hash)
        text = node_hash[TEXT_CONTENT_KEY]
        return node_hash.delete(TEXT_CONTENT_KEY) if text.empty?

        node_hash.delete(TEXT_CONTENT_KEY) if node_hash.size > 1 && text.strip.empty?
      end
    end
  end
end
@@ -1,33 +1,50 @@
1
- require "libxml" unless defined?(LibXML)
2
- require "multi_xml/parsers/libxml2_parser"
1
+ require "libxml"
2
+ require_relative "dom_parser"
3
3
 
4
4
  module MultiXml
5
5
  module Parsers
6
- module Libxml # :nodoc:
7
- include Libxml2Parser
6
+ # XML parser using the LibXML library
7
+ #
8
+ # @api private
9
+ module Libxml
10
+ include DomParser
8
11
  extend self
9
12
 
10
- def parse_error
11
- ::LibXML::XML::Error
12
- end
13
+ # Get the parse error class for this parser
14
+ #
15
+ # @api private
16
+ # @return [Class] LibXML::XML::Error
17
+ def parse_error = ::LibXML::XML::Error
13
18
 
14
- def parse(xml)
15
- node_to_hash(LibXML::XML::Parser.io(xml).parse.root)
19
+ # Parse XML from an IO object
20
+ #
21
+ # @api private
22
+ # @param io [IO] IO-like object containing XML
23
+ # @return [Hash] Parsed XML as a hash
24
+ # @raise [LibXML::XML::Error] if XML is malformed
25
+ def parse(io)
26
+ node_to_hash(LibXML::XML::Parser.io(io).parse.root)
16
27
  end
17
28
 
18
29
  private
19
30
 
20
- def each_child(node, &)
21
- node.each_child(&)
22
- end
31
+ # Iterate over child nodes
32
+ #
33
+ # @param node [LibXML::XML::Node] Parent node
34
+ # @return [void]
35
+ def each_child(node, &) = node.each_child(&)
23
36
 
24
- def each_attr(node, &)
25
- node.each_attr(&)
26
- end
37
+ # Iterate over attribute nodes
38
+ #
39
+ # @param node [LibXML::XML::Node] Element node
40
+ # @return [void]
41
+ def each_attr(node, &) = node.each_attr(&)
27
42
 
28
- def node_name(node)
29
- node.name
30
- end
43
+ # Get the name of a node or attribute
44
+ #
45
+ # @param node [LibXML::XML::Node] Node to get name from
46
+ # @return [String] Node name
47
+ def node_name(node) = node.name
31
48
  end
32
49
  end
33
50
  end
@@ -0,0 +1,103 @@
1
+ require "libxml"
2
+ require "stringio"
3
+ require_relative "sax_handler"
4
+
5
module MultiXml
  module Parsers
    # Streaming (SAX) parser built on LibXML, intended for large documents
    #
    # @api private
    module LibxmlSax
      module_function

      # Error class raised by this parser for malformed XML
      #
      # @api private
      # @return [Class] LibXML::XML::Error
      def parse_error
        ::LibXML::XML::Error
      end

      # Parse XML given as a string or an IO-like object
      #
      # Anything that responds to +read+ is used as-is; any other input is
      # wrapped in a StringIO. Input already at EOF yields an empty hash.
      #
      # @api private
      # @param xml [String, IO] XML content
      # @return [Hash] Parsed XML as a hash
      # @raise [LibXML::XML::Error] if XML is malformed
      def parse(xml)
        source =
          if xml.respond_to?(:read)
            xml
          else
            StringIO.new(xml)
          end
        return {} if source.eof?

        # Install LibXML's quiet error handler before parsing.
        LibXML::XML::Error.set_handler(&LibXML::XML::Error::QUIET_HANDLER)
        Handler.new.tap do |callbacks|
          sax = ::LibXML::XML::SaxParser.io(source)
          sax.callbacks = callbacks
          sax.parse
        end.result
      end

      # SAX callback object that forwards LibXML events to SaxHandler
      #
      # @api private
      class Handler
        include ::LibXML::XML::SaxParser::Callbacks
        include SaxHandler

        # Set up the shared SaxHandler state
        #
        # @api private
        # @return [Handler] new handler instance
        def initialize
          initialize_handler
        end

        # Document start callback (intentionally empty)
        #
        # @api private
        # @return [void]
        def on_start_document; end

        # Document end callback (intentionally empty)
        #
        # @api private
        # @return [void]
        def on_end_document; end

        # Error callback (intentionally empty; LibXML is expected to raise
        # the error itself)
        #
        # @api private
        # @param _error [String] Error message (unused)
        # @return [void]
        def on_error(_error); end

        # Element start callback
        #
        # @api private
        # @param name [String] Element name
        # @param attrs [Hash] Element attributes
        # @return [void]
        def on_start_element(name, attrs = {})
          handle_start_element(name, attrs)
        end

        # Element end callback
        #
        # @api private
        # @param _name [String] Element name (unused)
        # @return [void]
        def on_end_element(_name)
          handle_end_element
        end

        # Character data callback; CDATA blocks are handled the same way
        #
        # @api private
        # @param text [String] Text content
        # @return [void]
        def on_characters(text)
          append_text(text)
        end
        alias on_cdata_block on_characters
      end
    end
  end
end
@@ -1,36 +1,53 @@
1
- require "nokogiri" unless defined?(Nokogiri)
2
- require "multi_xml/parsers/libxml2_parser"
1
+ require "nokogiri"
2
+ require_relative "dom_parser"
3
3
 
4
4
  module MultiXml
5
5
  module Parsers
6
- module Nokogiri # :nodoc:
7
- include Libxml2Parser
6
+ # XML parser using the Nokogiri library
7
+ #
8
+ # @api private
9
+ module Nokogiri
10
+ include DomParser
8
11
  extend self
9
12
 
10
- def parse_error
11
- ::Nokogiri::XML::SyntaxError
12
- end
13
-
14
- def parse(xml)
15
- doc = ::Nokogiri::XML(xml)
16
- raise(doc.errors.first) unless doc.errors.empty?
13
+ # Get the parse error class for this parser
14
+ #
15
+ # @api private
16
+ # @return [Class] Nokogiri::XML::SyntaxError
17
+ def parse_error = ::Nokogiri::XML::SyntaxError
18
+
19
+ # Parse XML from an IO object
20
+ #
21
+ # @api private
22
+ # @param io [IO] IO-like object containing XML
23
+ # @return [Hash] Parsed XML as a hash
24
+ # @raise [Nokogiri::XML::SyntaxError] if XML is malformed
25
+ def parse(io)
26
+ doc = ::Nokogiri::XML(io)
27
+ raise doc.errors.first unless doc.errors.empty?
17
28
 
18
29
  node_to_hash(doc.root)
19
30
  end
20
31
 
21
32
  private
22
33
 
23
- def each_child(node, &)
24
- node.children.each(&)
25
- end
26
-
27
- def each_attr(node, &)
28
- node.attribute_nodes.each(&)
29
- end
30
-
31
- def node_name(node)
32
- node.node_name
33
- end
34
+ # Iterate over child nodes
35
+ #
36
+ # @param node [Nokogiri::XML::Node] Parent node
37
+ # @return [void]
38
+ def each_child(node, &) = node.children.each(&)
39
+
40
+ # Iterate over attribute nodes
41
+ #
42
+ # @param node [Nokogiri::XML::Node] Element node
43
+ # @return [void]
44
+ def each_attr(node, &) = node.attribute_nodes.each(&)
45
+
46
+ # Get the name of a node or attribute
47
+ #
48
+ # @param node [Nokogiri::XML::Node] Node to get name from
49
+ # @return [String] Node name
50
+ def node_name(node) = node.node_name
34
51
  end
35
52
  end
36
53
  end
@@ -0,0 +1,102 @@
1
+ require "nokogiri"
2
+ require "stringio"
3
+ require_relative "sax_handler"
4
+
5
module MultiXml
  module Parsers
    # Streaming (SAX) parser built on Nokogiri, intended for large documents
    #
    # @api private
    module NokogiriSax
      module_function

      # Error class raised by this parser for malformed XML
      #
      # @api private
      # @return [Class] Nokogiri::XML::SyntaxError
      def parse_error
        ::Nokogiri::XML::SyntaxError
      end

      # Parse XML given as a string or an IO-like object
      #
      # Anything that responds to +read+ is used as-is; any other input is
      # wrapped in a StringIO. Input already at EOF yields an empty hash.
      #
      # @api private
      # @param xml [String, IO] XML content
      # @return [Hash] Parsed XML as a hash
      # @raise [Nokogiri::XML::SyntaxError] if XML is malformed
      def parse(xml)
        source =
          if xml.respond_to?(:read)
            xml
          else
            StringIO.new(xml)
          end
        return {} if source.eof?

        Handler.new.tap do |document|
          ::Nokogiri::XML::SAX::Parser.new(document).parse(source)
        end.result
      end

      # SAX document that forwards Nokogiri events to SaxHandler
      #
      # @api private
      class Handler < ::Nokogiri::XML::SAX::Document
        include SaxHandler

        # Run the superclass initializer, then set up SaxHandler state
        #
        # @api private
        # @return [Handler] new handler instance
        def initialize
          super
          initialize_handler
        end

        # Document start callback (intentionally empty)
        #
        # @api private
        # @return [void]
        def start_document; end

        # Document end callback (intentionally empty)
        #
        # @api private
        # @return [void]
        def end_document; end

        # Error callback: turn the reported message into an exception
        #
        # @api private
        # @param message [String] Error message
        # @return [void]
        # @raise [Nokogiri::XML::SyntaxError] always
        def error(message)
          raise ::Nokogiri::XML::SyntaxError, message
        end

        # Element start callback
        #
        # @api private
        # @param name [String] Element name
        # @param attrs [Array] Element attributes as pairs
        # @return [void]
        def start_element(name, attrs = [])
          handle_start_element(name, attrs)
        end

        # Element end callback
        #
        # @api private
        # @param _name [String] Element name (unused)
        # @return [void]
        def end_element(_name)
          handle_end_element
        end

        # Character data callback; CDATA blocks are handled the same way
        #
        # @api private
        # @param text [String] Text content
        # @return [void]
        def characters(text)
          append_text(text)
        end
        alias cdata_block characters
      end
    end
  end
end