multi_xml 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/multi_xml.rb CHANGED
@@ -3,309 +3,213 @@ require "date"
3
3
  require "stringio"
4
4
  require "time"
5
5
  require "yaml"
6
-
7
- module MultiXml # rubocop:disable Metrics/ModuleLength
8
- class ParseError < StandardError; end
9
-
10
- class NoParserError < StandardError; end
11
-
12
- class DisallowedTypeError < StandardError
13
- def initialize(type)
14
- super("Disallowed type attribute: #{type.inspect}")
15
- end
16
- end
17
-
18
- unless defined?(REQUIREMENT_MAP)
19
- REQUIREMENT_MAP = [
20
- ["ox", :ox],
21
- ["libxml", :libxml],
22
- ["nokogiri", :nokogiri],
23
- ["rexml/document", :rexml],
24
- ["oga", :oga]
25
- ].freeze
26
- end
27
-
28
- CONTENT_ROOT = "__content__".freeze unless defined?(CONTENT_ROOT)
29
-
30
- unless defined?(PARSING)
31
- float_proc = proc { |float| float.to_f }
32
- datetime_proc = proc { |time| Time.parse(time).utc rescue DateTime.parse(time).utc } # rubocop:disable Style/RescueModifier
33
-
34
- PARSING = {
35
- "symbol" => proc { |symbol| symbol.to_sym },
36
- "date" => proc { |date| Date.parse(date) },
37
- "datetime" => datetime_proc,
38
- "dateTime" => datetime_proc,
39
- "integer" => proc { |integer| integer.to_i },
40
- "float" => float_proc,
41
- "double" => float_proc,
42
- "decimal" => proc { |number| BigDecimal(number) },
43
- "boolean" => proc { |boolean| !%w[0 false].include?(boolean.strip) },
44
- "string" => proc { |string| string.to_s },
45
- "yaml" => proc { |yaml| YAML.load(yaml) rescue yaml }, # rubocop:disable Style/RescueModifier
46
- "base64Binary" => proc { |binary| base64_decode(binary) },
47
- "binary" => proc { |binary, entity| parse_binary(binary, entity) },
48
- "file" => proc { |file, entity| parse_file(file, entity) }
49
- }.freeze
50
- end
51
-
52
- unless defined?(TYPE_NAMES)
53
- TYPE_NAMES = {
54
- "Symbol" => "symbol",
55
- "Integer" => "integer",
56
- "BigDecimal" => "decimal",
57
- "Float" => "float",
58
- "TrueClass" => "boolean",
59
- "FalseClass" => "boolean",
60
- "Date" => "date",
61
- "DateTime" => "datetime",
62
- "Time" => "datetime",
63
- "Array" => "array",
64
- "Hash" => "hash"
65
- }.freeze
66
- end
67
-
68
- DISALLOWED_XML_TYPES = %w[symbol yaml].freeze
69
-
70
- DEFAULT_OPTIONS = {
71
- typecast_xml_value: true,
72
- disallowed_types: DISALLOWED_XML_TYPES,
73
- symbolize_keys: false
74
- }.freeze
75
-
6
+ require_relative "multi_xml/constants"
7
+ require_relative "multi_xml/errors"
8
+ require_relative "multi_xml/file_like"
9
+ require_relative "multi_xml/helpers"
10
+
11
+ # A generic swappable back-end for parsing XML
12
+ #
13
+ # MultiXml provides a unified interface for XML parsing across different
14
+ # parser libraries. It automatically selects the best available parser
15
+ # (Ox, LibXML, Nokogiri, Oga, or REXML) and converts XML to Ruby hashes.
16
+ #
17
+ # @api public
18
+ # @example Parse XML
19
+ # MultiXml.parse('<root><name>John</name></root>')
20
+ # #=> {"root"=>{"name"=>"John"}}
21
+ #
22
+ # @example Set the parser
23
+ # MultiXml.parser = :nokogiri
24
+ module MultiXml
76
25
  class << self
77
- # Get the current parser class.
78
- def parser
79
- return @parser if defined?(@parser)
80
-
81
- self.parser = default_parser
82
- @parser
83
- end
84
-
85
- # The default parser based on what you currently
86
- # have loaded and installed. First checks to see
87
- # if any parsers are already loaded, then checks
88
- # to see which are installed if none are loaded.
89
- def default_parser
90
- return :ox if defined?(::Ox)
91
- return :libxml if defined?(::LibXML)
92
- return :nokogiri if defined?(::Nokogiri)
93
- return :oga if defined?(::Oga)
26
+ include Helpers
94
27
 
95
- REQUIREMENT_MAP.each do |library, parser|
96
- require library
97
- return parser
98
- rescue LoadError
99
- next
100
- end
101
- raise(NoParserError,
102
- "No XML parser detected. If you're using Rubinius and Bundler, try adding an XML parser to your Gemfile (e.g. libxml-ruby, nokogiri, or rubysl-rexml). For more information, see https://github.com/sferik/multi_xml/issues/42.")
28
+ # Get the current XML parser module
29
+ #
30
+ # Returns the currently configured parser, auto-detecting one if not set.
31
+ # Parsers are checked in order of performance: Ox, LibXML, Nokogiri, Oga, REXML.
32
+ #
33
+ # @api public
34
+ # @return [Module] the current parser module
35
+ # @example Get current parser
36
+ # MultiXml.parser #=> MultiXml::Parsers::Ox
37
+ def parser
38
+ @parser ||= resolve_parser(detect_parser)
103
39
  end
104
40
 
105
- # Set the XML parser utilizing a symbol, string, or class.
106
- # Supported by default are:
41
+ # Set the XML parser to use
107
42
  #
108
- # * <tt>:libxml</tt>
109
- # * <tt>:nokogiri</tt>
110
- # * <tt>:ox</tt>
111
- # * <tt>:rexml</tt>
112
- # * <tt>:oga</tt>
43
+ # @api public
44
+ # @param new_parser [Symbol, String, Module] Parser specification
45
+ # - Symbol/String: :libxml, :nokogiri, :ox, :rexml, :oga
46
+ # - Module: Custom parser implementing parse(io) and parse_error
47
+ # @return [Module] the newly configured parser module
48
+ # @example Set parser by symbol
49
+ # MultiXml.parser = :nokogiri
50
+ # @example Set parser by module
51
+ # MultiXml.parser = MyCustomParser
113
52
  def parser=(new_parser)
114
- case new_parser
115
- when String, Symbol
116
- require "multi_xml/parsers/#{new_parser.to_s.downcase}"
117
- @parser = MultiXml::Parsers.const_get(new_parser.to_s.split("_").collect(&:capitalize).join.to_s)
118
- when Class, Module
119
- @parser = new_parser
120
- else
121
- raise("Did not recognize your parser specification. Please specify either a symbol or a class.")
122
- end
53
+ @parser = resolve_parser(new_parser)
123
54
  end
124
55
 
125
- # Parse an XML string or IO into Ruby.
126
- #
127
- # <b>Options</b>
56
+ # Parse XML into a Ruby Hash
128
57
  #
129
- # <tt>:symbolize_keys</tt> :: If true, will use symbols instead of strings for the keys.
130
- #
131
- # <tt>:disallowed_types</tt> :: Types to disallow from being typecasted. Defaults to `['yaml', 'symbol']`. Use `[]` to allow all types.
132
- #
133
- # <tt>:typecast_xml_value</tt> :: If true, won't typecast values for parsed document
134
- def parse(xml, options = {}) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
135
- xml ||= ""
136
-
58
+ # @api public
59
+ # @param xml [String, IO] XML content as a string or IO-like object
60
+ # @param options [Hash] Parsing options
61
+ # @option options [Symbol, String, Module] :parser Parser to use for this call
62
+ # @option options [Boolean] :symbolize_keys Convert keys to symbols (default: false)
63
+ # @option options [Array<String>] :disallowed_types Types to reject (default: ['yaml', 'symbol'])
64
+ # @option options [Boolean] :typecast_xml_value Apply type conversions (default: true)
65
+ # @return [Hash] Parsed XML as nested hash
66
+ # @raise [ParseError] if XML is malformed
67
+ # @raise [DisallowedTypeError] if XML contains a disallowed type attribute
68
+ # @example Parse simple XML
69
+ # MultiXml.parse('<root><name>John</name></root>')
70
+ # #=> {"root"=>{"name"=>"John"}}
71
+ # @example Parse with symbolized keys
72
+ # MultiXml.parse('<root><name>John</name></root>', symbolize_keys: true)
73
+ # #=> {root: {name: "John"}}
74
+ def parse(xml, options = {})
137
75
  options = DEFAULT_OPTIONS.merge(options)
76
+ xml_parser = options[:parser] ? resolve_parser(options.fetch(:parser)) : parser
138
77
 
139
- xml = xml.strip if xml.respond_to?(:strip)
140
- begin
141
- xml = StringIO.new(xml) unless xml.respond_to?(:read)
78
+ io = normalize_input(xml)
79
+ return {} if io.eof?
142
80
 
143
- char = xml.getc
144
- return {} if char.nil?
145
-
146
- xml.ungetc(char)
147
-
148
- hash = undasherize_keys(parser.parse(xml) || {})
149
- hash = typecast_xml_value(hash, options[:disallowed_types]) if options[:typecast_xml_value]
150
- rescue DisallowedTypeError
151
- raise
152
- rescue parser.parse_error => e
153
- raise(ParseError, e.message, e.backtrace)
154
- end
155
- hash = symbolize_keys(hash) if options[:symbolize_keys]
156
- hash
81
+ result = parse_with_error_handling(io, xml, xml_parser)
82
+ result = typecast_xml_value(result, options.fetch(:disallowed_types)) if options.fetch(:typecast_xml_value)
83
+ result = symbolize_keys(result) if options.fetch(:symbolize_keys)
84
+ result
157
85
  end
158
86
 
159
- # This module decorates files with the <tt>original_filename</tt>
160
- # and <tt>content_type</tt> methods.
161
- module FileLike # :nodoc:
162
- attr_writer :original_filename, :content_type
163
-
164
- def original_filename
165
- @original_filename || "untitled"
166
- end
87
+ private
167
88
 
168
- def content_type
169
- @content_type || "application/octet-stream"
89
+ # Resolve a parser specification to a module
90
+ #
91
+ # @api private
92
+ # @param spec [Symbol, String, Class, Module] Parser specification
93
+ # @return [Module] Resolved parser module
94
+ # @raise [RuntimeError] if spec is invalid
95
+ def resolve_parser(spec)
96
+ case spec
97
+ when String, Symbol then load_parser(spec)
98
+ when Module then spec
99
+ else raise "Invalid parser specification: expected Symbol, String, or Module"
170
100
  end
171
101
  end
172
102
 
173
- private
174
-
175
- # TODO: Add support for other encodings
176
- def parse_binary(binary, entity) # :nodoc:
177
- case entity["encoding"]
178
- when "base64"
179
- base64_decode(binary)
180
- else
181
- binary
182
- end
103
+ # Load a parser by name
104
+ #
105
+ # @api private
106
+ # @param name [Symbol, String] Parser name
107
+ # @return [Module] Loaded parser module
108
+ def load_parser(name)
109
+ name = name.to_s.downcase
110
+ require "multi_xml/parsers/#{name}"
111
+ Parsers.const_get(camelize(name))
183
112
  end
184
113
 
185
- def parse_file(file, entity)
186
- f = StringIO.new(base64_decode(file))
187
- f.extend(FileLike)
188
- f.original_filename = entity["name"]
189
- f.content_type = entity["content_type"]
190
- f
114
+ # Convert underscored string to CamelCase
115
+ #
116
+ # @api private
117
+ # @param name [String] Underscored string
118
+ # @return [String] CamelCased string
119
+ def camelize(name)
120
+ name.split("_").map(&:capitalize).join
191
121
  end
192
122
 
193
- def base64_decode(input)
194
- input.unpack1("m")
123
+ # Detect the best available parser
124
+ #
125
+ # @api private
126
+ # @return [Symbol] Parser name
127
+ # @raise [NoParserError] if no parser is available
128
+ def detect_parser
129
+ find_loaded_parser || find_available_parser || raise_no_parser_error
195
130
  end
196
131
 
197
- def symbolize_keys(params)
198
- case params
199
- when Hash
200
- params.inject({}) do |result, (key, value)|
201
- result.merge(key.to_sym => symbolize_keys(value))
202
- end
203
- when Array
204
- params.collect { |value| symbolize_keys(value) }
205
- else
206
- params
132
+ # Parser constant names mapped to their symbols, in preference order
133
+ #
134
+ # @api private
135
+ LOADED_PARSER_CHECKS = {
136
+ Ox: :ox,
137
+ LibXML: :libxml,
138
+ Nokogiri: :nokogiri,
139
+ Oga: :oga
140
+ }.freeze
141
+ private_constant :LOADED_PARSER_CHECKS
142
+
143
+ # Find an already-loaded parser library
144
+ #
145
+ # @api private
146
+ # @return [Symbol, nil] Parser name or nil if none loaded
147
+ def find_loaded_parser
148
+ LOADED_PARSER_CHECKS.each do |const_name, parser_name|
149
+ return parser_name if const_defined?(const_name)
207
150
  end
151
+ nil
208
152
  end
209
153
 
210
- def undasherize_keys(params)
211
- case params
212
- when Hash
213
- params.each_with_object({}) do |(key, value), hash|
214
- hash[key.to_s.tr("-", "_")] = undasherize_keys(value)
215
- hash
216
- end
217
- when Array
218
- params.collect { |value| undasherize_keys(value) }
219
- else
220
- params
154
+ # Try to load and find an available parser
155
+ #
156
+ # @api private
157
+ # @return [Symbol, nil] Parser name or nil if none available
158
+ def find_available_parser
159
+ PARSER_PREFERENCE.each do |library, parser_name|
160
+ return parser_name if try_require(library)
221
161
  end
162
+ nil
222
163
  end
223
164
 
224
- def typecast_xml_value(value, disallowed_types = nil) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
225
- disallowed_types ||= DISALLOWED_XML_TYPES
226
-
227
- case value
228
- when Hash
229
- if value.include?("type") && !value["type"].is_a?(Hash) && disallowed_types.include?(value["type"])
230
- raise(DisallowedTypeError, value["type"])
231
- end
232
-
233
- if value["type"] == "array"
234
-
235
- # this commented-out suggestion helps to avoid the multiple attribute
236
- # problem, but it breaks when there is only one item in the array.
237
- #
238
- # from: https://github.com/jnunemaker/httparty/issues/102
239
- #
240
- # _, entries = value.detect { |k, v| k != 'type' && v.is_a?(Array) }
241
-
242
- # This attempt fails to consider the order that the detect method
243
- # retrieves the entries.
244
- # _, entries = value.detect {|key, _| key != 'type'}
165
+ # Attempt to require a library
166
+ #
167
+ # @api private
168
+ # @param library [String] Library to require
169
+ # @return [Boolean] true if successful, false if LoadError
170
+ def try_require(library)
171
+ require library
172
+ true
173
+ rescue LoadError
174
+ false
175
+ end
245
176
 
246
- # This approach ignores attribute entries that are not convertable
247
- # to an Array which allows attributes to be ignored.
248
- _, entries = value.detect { |k, v| k != "type" && (v.is_a?(Array) || v.is_a?(Hash)) }
177
+ # Raise an error indicating no parser is available
178
+ #
179
+ # @api private
180
+ # @return [void]
181
+ # @raise [NoParserError] always
182
+ def raise_no_parser_error
183
+ raise NoParserError, <<~MSG.chomp
184
+ No XML parser detected. Install one of: ox, nokogiri, libxml-ruby, or oga.
185
+ See https://github.com/sferik/multi_xml for more information.
186
+ MSG
187
+ end
249
188
 
250
- case entries
251
- when NilClass
252
- []
253
- when String
254
- [] if entries.strip.empty?
255
- when Array
256
- entries.collect { |entry| typecast_xml_value(entry, disallowed_types) }
257
- when Hash
258
- [typecast_xml_value(entries, disallowed_types)]
259
- else
260
- raise("can't typecast #{entries.class.name}: #{entries.inspect}")
261
- end
189
+ # Normalize input to an IO-like object
190
+ #
191
+ # @api private
192
+ # @param xml [String, IO] Input to normalize
193
+ # @return [IO] IO-like object
194
+ def normalize_input(xml)
195
+ return xml if xml.respond_to?(:read)
262
196
 
263
- elsif value.key?(CONTENT_ROOT)
264
- content = value[CONTENT_ROOT]
265
- block = PARSING[value["type"]]
266
- if block
267
- if block.arity == 1
268
- value.delete("type") if PARSING[value["type"]]
269
- if value.keys.size > 1
270
- value[CONTENT_ROOT] = block.call(content)
271
- value
272
- else
273
- block.call(content)
274
- end
275
- else
276
- block.call(content, value)
277
- end
278
- else
279
- (value.keys.size > 1) ? value : content
280
- end
281
- elsif value["type"] == "string" && value["nil"] != "true"
282
- ""
283
- # blank or nil parsed values are represented by nil
284
- elsif value.empty? || value["nil"] == "true"
285
- nil
286
- # If the type is the only element which makes it then
287
- # this still makes the value nil, except if type is
288
- # a XML node(where type['value'] is a Hash)
289
- elsif value["type"] && value.size == 1 && !value["type"].is_a?(Hash)
290
- nil
291
- else
292
- xml_value = value.each_with_object({}) do |(k, v), hash|
293
- hash[k] = typecast_xml_value(v, disallowed_types)
294
- hash
295
- end
197
+ StringIO.new(xml.to_s.strip)
198
+ end
296
199
 
297
- # Turn {:files => {:file => #<StringIO>} into {:files => #<StringIO>} so it is compatible with
298
- # how multipart uploaded files from HTML appear
299
- (xml_value["file"].is_a?(StringIO)) ? xml_value["file"] : xml_value
300
- end
301
- when Array
302
- value.map! { |i| typecast_xml_value(i, disallowed_types) }
303
- (value.length > 1) ? value : value.first
304
- when String
305
- value
306
- else
307
- raise("can't typecast #{value.class.name}: #{value.inspect}")
308
- end
200
+ # Parse XML with error handling and key normalization
201
+ #
202
+ # @api private
203
+ # @param io [IO] IO-like object containing XML
204
+ # @param original_input [String, IO] Original input for error reporting
205
+ # @param xml_parser [Module] Parser to use
206
+ # @return [Hash] Parsed XML with undasherized keys
207
+ # @raise [ParseError] if XML is malformed
208
+ def parse_with_error_handling(io, original_input, xml_parser)
209
+ undasherize_keys(xml_parser.parse(io) || {})
210
+ rescue xml_parser.parse_error => e
211
+ xml_string = original_input.respond_to?(:read) ? original_input.tap(&:rewind).read : original_input.to_s
212
+ raise(ParseError.new(e, xml: xml_string, cause: e))
309
213
  end
310
214
  end
311
215
  end