rexml 3.2.6 → 3.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,20 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
128
184
  @closed = nil
185
+ @have_root = false
129
186
  @document_status = nil
130
187
  @tags = []
131
188
  @stack = []
132
189
  @entities = []
133
- @nsstack = []
190
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
191
+ @namespaces_restore_stack = []
134
192
  end
135
193
 
136
194
  def position
@@ -180,6 +238,8 @@ module REXML
180
238
 
181
239
  # Returns the next event. This is a +PullEvent+ object.
182
240
  def pull
241
+ @source.drop_parsed_content
242
+
183
243
  pull_event.tap do |event|
184
244
  @listeners.each do |listener|
185
245
  listener.receive event
@@ -192,236 +252,274 @@ module REXML
192
252
  x, @closed = @closed, nil
193
253
  return [ :end_element, x ]
194
254
  end
195
- return [ :end_document ] if empty?
255
+ if empty?
256
+ if @document_status == :in_doctype
257
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
258
+ end
259
+ unless @tags.empty?
260
+ path = "/" + @tags.join("/")
261
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
262
+ end
263
+ return [ :end_document ]
264
+ end
196
265
  return @stack.shift if @stack.size > 0
197
266
  #STDERR.puts @source.encoding
198
267
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
268
+
269
+ @source.ensure_buffer
199
270
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
271
+ start_position = @source.position
272
+ if @source.match("<?", true)
223
273
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
274
+ elsif @source.match("<!", true)
275
+ if @source.match("--", true)
276
+ md = @source.match(/(.*?)-->/um, true)
277
+ if md.nil?
278
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
279
  end
243
- if @source.match(/\A\s*\[/um, true)
280
+ if /--|-\z/.match?(md[1])
281
+ raise REXML::ParseException.new("Malformed comment", @source)
282
+ end
283
+ return [ :comment, md[1] ]
284
+ elsif @source.match("DOCTYPE", true)
285
+ base_error_message = "Malformed DOCTYPE"
286
+ unless @source.match(/\s+/um, true)
287
+ if @source.match(">")
288
+ message = "#{base_error_message}: name is missing"
289
+ else
290
+ message = "#{base_error_message}: invalid name"
291
+ end
292
+ @source.position = start_position
293
+ raise REXML::ParseException.new(message, @source)
294
+ end
295
+ name = parse_name(base_error_message)
296
+ if @source.match(/\s*\[/um, true)
297
+ id = [nil, nil, nil]
244
298
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
299
+ elsif @source.match(/\s*>/um, true)
300
+ id = [nil, nil, nil]
246
301
  @document_status = :after_doctype
302
+ @source.ensure_buffer
247
303
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
304
+ id = parse_id(base_error_message,
305
+ accept_external_id: true,
306
+ accept_public_id: false)
307
+ if id[0] == "SYSTEM"
308
+ # For backward compatibility
309
+ id[1], id[2] = id[2], nil
310
+ end
311
+ if @source.match(/\s*\[/um, true)
312
+ @document_status = :in_doctype
313
+ elsif @source.match(/\s*>/um, true)
314
+ @document_status = :after_doctype
315
+ @source.ensure_buffer
316
+ else
317
+ message = "#{base_error_message}: garbage after external ID"
318
+ raise REXML::ParseException.new(message, @source)
319
+ end
250
320
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
321
+ args = [:start_doctype, name, *id]
322
+ if @document_status == :after_doctype
323
+ @source.match(/\s*/um, true)
324
+ @stack << [ :end_doctype ]
325
+ end
326
+ return args
327
+ else
328
+ message = "Invalid XML"
329
+ raise REXML::ParseException.new(message, @source)
263
330
  end
264
331
  end
265
332
  end
266
333
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
334
+ @source.match(/\s*/um, true) # skip spaces
335
+ start_position = @source.position
336
+ if @source.match("<!", true)
337
+ if @source.match("ELEMENT", true)
338
+ md = @source.match(/(.*?)>/um, true)
339
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
340
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
341
+ elsif @source.match("ENTITY", true)
342
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
343
+ unless match_data
344
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
345
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
346
+ match = [:entitydecl, *match_data.captures.compact]
347
+ ref = false
348
+ if match[1] == '%'
349
+ ref = true
350
+ match.delete_at 1
351
+ end
352
+ # Now we have to sort out what kind of entity reference this is
353
+ if match[2] == 'SYSTEM'
354
+ # External reference
355
+ match[3] = match[3][1..-2] # PUBID
356
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
357
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
358
+ elsif match[2] == 'PUBLIC'
359
+ # External reference
360
+ match[3] = match[3][1..-2] # PUBID
361
+ match[4] = match[4][1..-2] # HREF
362
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
363
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
364
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
365
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
366
  else
329
- message = "#{base_error_message}: invalid declaration name"
367
+ match[2] = match[2][1..-2]
368
+ match.pop if match.size == 4
369
+ # match is [ :entity, name, value ]
330
370
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
371
+ match << '%' if ref
372
+ return match
373
+ elsif @source.match("ATTLIST", true)
374
+ md = @source.match(Private::ATTLISTDECL_END, true)
375
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
376
+ element = md[1]
377
+ contents = md[0]
378
+
379
+ pairs = {}
380
+ values = md[0].strip.scan( ATTDEF_RE )
381
+ values.each do |attdef|
382
+ unless attdef[3] == "#IMPLIED"
383
+ attdef.compact!
384
+ val = attdef[3]
385
+ val = attdef[4] if val == "#FIXED "
386
+ pairs[attdef[0]] = val
387
+ if attdef[0] =~ /^xmlns:(.*)/
388
+ @namespaces[$1] = val
389
+ end
390
+ end
391
+ end
392
+ return [ :attlistdecl, element, pairs, contents ]
393
+ elsif @source.match("NOTATION", true)
394
+ base_error_message = "Malformed notation declaration"
395
+ unless @source.match(/\s+/um, true)
396
+ if @source.match(">")
397
+ message = "#{base_error_message}: name is missing"
398
+ else
399
+ message = "#{base_error_message}: invalid name"
400
+ end
401
+ @source.position = start_position
402
+ raise REXML::ParseException.new(message, @source)
403
+ end
404
+ name = parse_name(base_error_message)
405
+ id = parse_id(base_error_message,
406
+ accept_external_id: true,
407
+ accept_public_id: true)
408
+ unless @source.match(/\s*>/um, true)
409
+ message = "#{base_error_message}: garbage before end >"
410
+ raise REXML::ParseException.new(message, @source)
411
+ end
412
+ return [:notationdecl, name, *id]
413
+ elsif md = @source.match(/--(.*?)-->/um, true)
414
+ case md[1]
415
+ when /--/, /-\z/
416
+ raise REXML::ParseException.new("Malformed comment", @source)
417
+ end
418
+ return [ :comment, md[1] ] if md
340
419
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
420
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
421
+ return [ :externalentity, match[1] ]
422
+ elsif @source.match(/\]\s*>/um, true)
343
423
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
424
  return [ :end_doctype ]
346
425
  end
426
+ if @document_status == :in_doctype
427
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
428
+ end
347
429
  end
348
430
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
431
+ @source.match(/\s*/um, true)
350
432
  end
351
433
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
434
+ start_position = @source.position
435
+ if @source.match("<", true)
436
+ # :text's read_until may remain only "<" in buffer. In the
437
+ # case, buffer is empty here. So we need to fill buffer
438
+ # here explicitly.
439
+ @source.ensure_buffer
440
+ if @source.match("/", true)
441
+ @namespaces_restore_stack.pop
356
442
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
443
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
444
  if md and !last_tag
359
445
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
446
  raise REXML::ParseException.new(message, @source)
361
447
  end
362
448
  if md.nil? or last_tag != md[1]
363
449
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
450
+ message += " (got '#{md[1]}')" if md
451
+ @source.position = start_position if md.nil?
365
452
  raise REXML::ParseException.new(message, @source)
366
453
  end
367
454
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
455
+ elsif @source.match("!", true)
456
+ md = @source.match(/([^>]*>)/um)
370
457
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
458
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
459
+ if md[0][0] == ?-
460
+ md = @source.match(/--(.*?)-->/um, true)
374
461
 
375
- case md[1]
376
- when /--/, /-\z/
462
+ if md.nil? || /--|-\z/.match?(md[1])
377
463
  raise REXML::ParseException.new("Malformed comment", @source)
378
464
  end
379
465
 
380
- return [ :comment, md[1] ] if md
466
+ return [ :comment, md[1] ]
381
467
  else
382
- md = @source.match( CDATA_PATTERN, true )
468
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
469
  return [ :cdata, md[1] ] if md
384
470
  end
385
471
  raise REXML::ParseException.new( "Declarations can only occur "+
386
472
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
473
+ elsif @source.match("?", true)
388
474
  return process_instruction
389
475
  else
390
476
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
477
+ md = @source.match(Private::TAG_PATTERN, true)
392
478
  unless md
479
+ @source.position = start_position
393
480
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
481
  end
482
+ tag = md[1]
395
483
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
484
+ @prefixes.clear
485
+ @prefixes << md[2] if md[2]
486
+ push_namespaces_restore
487
+ attributes, closed = parse_attributes(@prefixes)
400
488
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
489
+ for prefix in @prefixes
490
+ unless @namespaces.key?(prefix)
403
491
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
492
  end
405
493
  end
406
494
 
407
495
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
496
+ @closed = tag
497
+ pop_namespaces_restore
410
498
  else
411
- @tags.push( md[1] )
499
+ if @tags.empty? and @have_root
500
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
501
+ end
502
+ @tags.push( tag )
412
503
  end
413
- return [ :start_element, md[1], attributes ]
504
+ @have_root = true
505
+ return [ :start_element, tag, attributes ]
414
506
  end
415
507
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
508
+ text = @source.read_until("<")
509
+ if text.chomp!("<")
510
+ @source.position -= "<".bytesize
511
+ end
512
+ if @tags.empty?
513
+ unless /\A\s*\z/.match?(text)
514
+ if @have_root
515
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
516
+ else
517
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
518
+ end
519
+ end
520
+ return pull_event if @have_root
419
521
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
522
+ return [ :text, text ]
425
523
  end
426
524
  rescue REXML::UndefinedNamespaceException
427
525
  raise
@@ -436,13 +534,13 @@ module REXML
436
534
  private :pull_event
437
535
 
438
536
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
537
+ return unless entities
538
+
539
+ value = entities[ reference ]
540
+ return if value.nil?
541
+
542
+ record_entity_expansion
543
+ unnormalize( value, entities )
446
544
  end
447
545
 
448
546
  # Escapes all possible entities
@@ -463,35 +561,87 @@ module REXML
463
561
 
464
562
  # Unescapes all possible entities
465
563
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
564
+ if string.include?("\r")
565
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
566
+ else
567
+ rv = string.dup
568
+ end
468
569
  matches = rv.scan( REFERENCE_RE )
469
570
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
571
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
572
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
573
+ if m.start_with?("x")
574
+ code_point = Integer(m[1..-1], 16)
575
+ else
576
+ code_point = Integer(m, 10)
577
+ end
578
+ [code_point].pack('U*')
474
579
  }
475
580
  matches.collect!{|x|x[0]}.compact!
581
+ if filter
582
+ matches.reject! do |entity_reference|
583
+ filter.include?(entity_reference)
584
+ end
585
+ end
476
586
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
587
+ matches.tally.each do |entity_reference, n|
588
+ entity_expansion_count_before = @entity_expansion_count
589
+ entity_value = entity( entity_reference, entities )
590
+ if entity_value
591
+ if n > 1
592
+ entity_expansion_count_delta =
593
+ @entity_expansion_count - entity_expansion_count_before
594
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
595
+ end
596
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
597
+ rv.gsub!( re, entity_value )
598
+ if rv.bytesize > @entity_expansion_text_limit
599
+ raise "entity expansion has grown too large"
486
600
  end
601
+ else
602
+ er = DEFAULT_ENTITIES[entity_reference]
603
+ rv.gsub!( er[0], er[2] ) if er
487
604
  end
488
605
  end
489
- rv.gsub!( /&amp;/, '&' )
606
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
607
  end
491
608
  rv
492
609
  end
493
610
 
494
611
  private
612
+ def add_namespace(prefix, uri)
613
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
614
+ if uri.nil?
615
+ @namespaces.delete(prefix)
616
+ else
617
+ @namespaces[prefix] = uri
618
+ end
619
+ end
620
+
621
+ def push_namespaces_restore
622
+ namespaces_restore = {}
623
+ @namespaces_restore_stack.push(namespaces_restore)
624
+ namespaces_restore
625
+ end
626
+
627
+ def pop_namespaces_restore
628
+ namespaces_restore = @namespaces_restore_stack.pop
629
+ namespaces_restore.each do |prefix, uri|
630
+ if uri.nil?
631
+ @namespaces.delete(prefix)
632
+ else
633
+ @namespaces[prefix] = uri
634
+ end
635
+ end
636
+ end
637
+
638
+ def record_entity_expansion(delta=1)
639
+ @entity_expansion_count += delta
640
+ if @entity_expansion_count > @entity_expansion_limit
641
+ raise "number of entity expansions exceeded, processing aborted."
642
+ end
643
+ end
644
+
495
645
  def need_source_encoding_update?(xml_declaration_encoding)
496
646
  return false if xml_declaration_encoding.nil?
497
647
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +649,16 @@ module REXML
499
649
  end
500
650
 
501
651
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
652
+ md = @source.match(Private::NAME_PATTERN, true)
503
653
  unless md
504
- if @source.match(/\A\s*\S/um)
654
+ if @source.match(/\S/um)
505
655
  message = "#{base_error_message}: invalid name"
506
656
  else
507
657
  message = "#{base_error_message}: name is missing"
508
658
  end
509
659
  raise REXML::ParseException.new(message, @source)
510
660
  end
511
- md[1]
661
+ md[0]
512
662
  end
513
663
 
514
664
  def parse_id(base_error_message,
@@ -578,96 +728,114 @@ module REXML
578
728
  end
579
729
 
580
730
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
731
+ name = parse_name("Malformed XML: Invalid processing instruction node")
732
+ if @source.match(/\s+/um, true)
733
+ match_data = @source.match(/(.*?)\?>/um, true)
734
+ unless match_data
735
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736
+ end
737
+ content = match_data[1]
738
+ else
739
+ content = nil
740
+ unless @source.match("?>", true)
741
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
742
+ end
743
+ end
744
+ if name == "xml"
745
+ if @document_status
746
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
747
+ end
748
+ version = VERSION.match(content)
749
+ version = version[1] unless version.nil?
750
+ encoding = ENCODING.match(content)
751
+ encoding = encoding[1] unless encoding.nil?
752
+ if need_source_encoding_update?(encoding)
753
+ @source.encoding = encoding
754
+ end
755
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
756
+ encoding = "UTF-16"
757
+ end
758
+ standalone = STANDALONE.match(content)
759
+ standalone = standalone[1] unless standalone.nil?
760
+ return [ :xmldecl, version, encoding, standalone ]
585
761
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
762
+ [:processing_instruction, name, content]
587
763
  end
588
764
 
589
- def parse_attributes(prefixes, curr_ns)
765
+ def parse_attributes(prefixes)
590
766
  attributes = {}
767
+ expanded_names = {}
591
768
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
769
+ while true
770
+ if @source.match(">", true)
771
+ return attributes, closed
772
+ elsif @source.match("/>", true)
773
+ closed = true
774
+ return attributes, closed
775
+ elsif match = @source.match(QNAME, true)
776
+ name = match[1]
777
+ prefix = match[2]
778
+ local_part = match[3]
779
+
780
+ unless @source.match(/\s*=\s*/um, true)
618
781
  message = "Missing attribute equal: <#{name}>"
619
782
  raise REXML::ParseException.new(message, @source)
620
783
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
784
+ unless match = @source.match(/(['"])/, true)
623
785
  message = "Missing attribute value start quote: <#{name}>"
624
786
  raise REXML::ParseException.new(message, @source)
625
787
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
788
+ quote = match[1]
789
+ start_position = @source.position
790
+ value = @source.read_until(quote)
791
+ unless value.chomp!(quote)
792
+ @source.position = start_position
793
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
794
  raise REXML::ParseException.new(message, @source)
639
795
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
796
+ @source.match(/\s*/um, true)
797
+ if prefix == "xmlns"
798
+ if local_part == "xml"
799
+ if value != Private::XML_PREFIXED_NAMESPACE
800
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
801
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
802
+ raise REXML::ParseException.new( msg, @source, self )
803
+ end
804
+ elsif local_part == "xmlns"
805
+ msg = "The 'xmlns' prefix must not be declared "+
650
806
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
807
+ raise REXML::ParseException.new( msg, @source, self)
652
808
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
809
+ add_namespace(local_part, value)
810
+ elsif prefix
811
+ prefixes << prefix unless prefix == "xml"
657
812
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
813
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
814
+ if attributes[name]
815
+ msg = "Duplicate attribute #{name.inspect}"
816
+ raise REXML::ParseException.new(msg, @source, self)
817
+ end
667
818
 
668
- attributes[name] = value
819
+ unless prefix == "xmlns"
820
+ uri = @namespaces[prefix]
821
+ expanded_name = [uri, local_part]
822
+ existing_prefix = expanded_names[expanded_name]
823
+ if existing_prefix
824
+ message = "Namespace conflict in adding attribute " +
825
+ "\"#{local_part}\": " +
826
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
827
+ "prefix \"#{prefix}\" = \"#{uri}\""
828
+ raise REXML::ParseException.new(message, @source, self)
829
+ end
830
+ expanded_names[expanded_name] = prefix
831
+ end
832
+
833
+ attributes[name] = value
834
+ else
835
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
836
+ raise REXML::ParseException.new(message, @source)
837
+ end
669
838
  end
670
- return attributes, closed
671
839
  end
672
840
  end
673
841
  end