rexml 3.2.5 → 3.3.8

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,33 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
118
170
  end
119
171
 
120
172
  def add_listener( listener )
@@ -122,15 +174,20 @@ module REXML
122
174
  end
123
175
 
124
176
  attr_reader :source
177
+ attr_reader :entity_expansion_count
178
+ attr_writer :entity_expansion_limit
179
+ attr_writer :entity_expansion_text_limit
125
180
 
126
181
  def stream=( source )
127
182
  @source = SourceFactory.create_from( source )
128
183
  @closed = nil
184
+ @have_root = false
129
185
  @document_status = nil
130
186
  @tags = []
131
187
  @stack = []
132
188
  @entities = []
133
- @nsstack = []
189
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
190
+ @namespaces_restore_stack = []
134
191
  end
135
192
 
136
193
  def position
@@ -180,6 +237,8 @@ module REXML
180
237
 
181
238
  # Returns the next event. This is a +PullEvent+ object.
182
239
  def pull
240
+ @source.drop_parsed_content
241
+
183
242
  pull_event.tap do |event|
184
243
  @listeners.each do |listener|
185
244
  listener.receive event
@@ -192,236 +251,274 @@ module REXML
192
251
  x, @closed = @closed, nil
193
252
  return [ :end_element, x ]
194
253
  end
195
- return [ :end_document ] if empty?
254
+ if empty?
255
+ if @document_status == :in_doctype
256
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
257
+ end
258
+ unless @tags.empty?
259
+ path = "/" + @tags.join("/")
260
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
261
+ end
262
+ return [ :end_document ]
263
+ end
196
264
  return @stack.shift if @stack.size > 0
197
265
  #STDERR.puts @source.encoding
198
266
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
267
+
268
+ @source.ensure_buffer
199
269
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
270
+ start_position = @source.position
271
+ if @source.match("<?", true)
223
272
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
273
+ elsif @source.match("<!", true)
274
+ if @source.match("--", true)
275
+ md = @source.match(/(.*?)-->/um, true)
276
+ if md.nil?
277
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
278
  end
243
- if @source.match(/\A\s*\[/um, true)
279
+ if /--|-\z/.match?(md[1])
280
+ raise REXML::ParseException.new("Malformed comment", @source)
281
+ end
282
+ return [ :comment, md[1] ]
283
+ elsif @source.match("DOCTYPE", true)
284
+ base_error_message = "Malformed DOCTYPE"
285
+ unless @source.match(/\s+/um, true)
286
+ if @source.match(">")
287
+ message = "#{base_error_message}: name is missing"
288
+ else
289
+ message = "#{base_error_message}: invalid name"
290
+ end
291
+ @source.position = start_position
292
+ raise REXML::ParseException.new(message, @source)
293
+ end
294
+ name = parse_name(base_error_message)
295
+ if @source.match(/\s*\[/um, true)
296
+ id = [nil, nil, nil]
244
297
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
298
+ elsif @source.match(/\s*>/um, true)
299
+ id = [nil, nil, nil]
246
300
  @document_status = :after_doctype
301
+ @source.ensure_buffer
247
302
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
303
+ id = parse_id(base_error_message,
304
+ accept_external_id: true,
305
+ accept_public_id: false)
306
+ if id[0] == "SYSTEM"
307
+ # For backward compatibility
308
+ id[1], id[2] = id[2], nil
309
+ end
310
+ if @source.match(/\s*\[/um, true)
311
+ @document_status = :in_doctype
312
+ elsif @source.match(/\s*>/um, true)
313
+ @document_status = :after_doctype
314
+ @source.ensure_buffer
315
+ else
316
+ message = "#{base_error_message}: garbage after external ID"
317
+ raise REXML::ParseException.new(message, @source)
318
+ end
250
319
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
320
+ args = [:start_doctype, name, *id]
321
+ if @document_status == :after_doctype
322
+ @source.match(/\s*/um, true)
323
+ @stack << [ :end_doctype ]
324
+ end
325
+ return args
326
+ else
327
+ message = "Invalid XML"
328
+ raise REXML::ParseException.new(message, @source)
263
329
  end
264
330
  end
265
331
  end
266
332
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
333
+ @source.match(/\s*/um, true) # skip spaces
334
+ start_position = @source.position
335
+ if @source.match("<!", true)
336
+ if @source.match("ELEMENT", true)
337
+ md = @source.match(/(.*?)>/um, true)
338
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
339
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
340
+ elsif @source.match("ENTITY", true)
341
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
342
+ unless match_data
343
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
344
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
345
+ match = [:entitydecl, *match_data.captures.compact]
346
+ ref = false
347
+ if match[1] == '%'
348
+ ref = true
349
+ match.delete_at 1
350
+ end
351
+ # Now we have to sort out what kind of entity reference this is
352
+ if match[2] == 'SYSTEM'
353
+ # External reference
354
+ match[3] = match[3][1..-2] # PUBID
355
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
356
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
357
+ elsif match[2] == 'PUBLIC'
358
+ # External reference
359
+ match[3] = match[3][1..-2] # PUBID
360
+ match[4] = match[4][1..-2] # HREF
361
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
362
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
363
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
364
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
365
  else
329
- message = "#{base_error_message}: invalid declaration name"
366
+ match[2] = match[2][1..-2]
367
+ match.pop if match.size == 4
368
+ # match is [ :entity, name, value ]
330
369
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
370
+ match << '%' if ref
371
+ return match
372
+ elsif @source.match("ATTLIST", true)
373
+ md = @source.match(Private::ATTLISTDECL_END, true)
374
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
375
+ element = md[1]
376
+ contents = md[0]
377
+
378
+ pairs = {}
379
+ values = md[0].strip.scan( ATTDEF_RE )
380
+ values.each do |attdef|
381
+ unless attdef[3] == "#IMPLIED"
382
+ attdef.compact!
383
+ val = attdef[3]
384
+ val = attdef[4] if val == "#FIXED "
385
+ pairs[attdef[0]] = val
386
+ if attdef[0] =~ /^xmlns:(.*)/
387
+ @namespaces[$1] = val
388
+ end
389
+ end
390
+ end
391
+ return [ :attlistdecl, element, pairs, contents ]
392
+ elsif @source.match("NOTATION", true)
393
+ base_error_message = "Malformed notation declaration"
394
+ unless @source.match(/\s+/um, true)
395
+ if @source.match(">")
396
+ message = "#{base_error_message}: name is missing"
397
+ else
398
+ message = "#{base_error_message}: invalid name"
399
+ end
400
+ @source.position = start_position
401
+ raise REXML::ParseException.new(message, @source)
402
+ end
403
+ name = parse_name(base_error_message)
404
+ id = parse_id(base_error_message,
405
+ accept_external_id: true,
406
+ accept_public_id: true)
407
+ unless @source.match(/\s*>/um, true)
408
+ message = "#{base_error_message}: garbage before end >"
409
+ raise REXML::ParseException.new(message, @source)
410
+ end
411
+ return [:notationdecl, name, *id]
412
+ elsif md = @source.match(/--(.*?)-->/um, true)
413
+ case md[1]
414
+ when /--/, /-\z/
415
+ raise REXML::ParseException.new("Malformed comment", @source)
416
+ end
417
+ return [ :comment, md[1] ] if md
340
418
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
419
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
420
+ return [ :externalentity, match[1] ]
421
+ elsif @source.match(/\]\s*>/um, true)
343
422
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
423
  return [ :end_doctype ]
346
424
  end
425
+ if @document_status == :in_doctype
426
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
427
+ end
347
428
  end
348
429
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
430
+ @source.match(/\s*/um, true)
350
431
  end
351
432
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
433
+ start_position = @source.position
434
+ if @source.match("<", true)
435
+ # :text's read_until may remain only "<" in buffer. In the
436
+ # case, buffer is empty here. So we need to fill buffer
437
+ # here explicitly.
438
+ @source.ensure_buffer
439
+ if @source.match("/", true)
440
+ @namespaces_restore_stack.pop
356
441
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
442
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
443
  if md and !last_tag
359
444
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
445
  raise REXML::ParseException.new(message, @source)
361
446
  end
362
447
  if md.nil? or last_tag != md[1]
363
448
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
449
+ message += " (got '#{md[1]}')" if md
450
+ @source.position = start_position if md.nil?
365
451
  raise REXML::ParseException.new(message, @source)
366
452
  end
367
453
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
454
+ elsif @source.match("!", true)
455
+ md = @source.match(/([^>]*>)/um)
370
456
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
457
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
458
+ if md[0][0] == ?-
459
+ md = @source.match(/--(.*?)-->/um, true)
374
460
 
375
- case md[1]
376
- when /--/, /-\z/
461
+ if md.nil? || /--|-\z/.match?(md[1])
377
462
  raise REXML::ParseException.new("Malformed comment", @source)
378
463
  end
379
464
 
380
- return [ :comment, md[1] ] if md
465
+ return [ :comment, md[1] ]
381
466
  else
382
- md = @source.match( CDATA_PATTERN, true )
467
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
468
  return [ :cdata, md[1] ] if md
384
469
  end
385
470
  raise REXML::ParseException.new( "Declarations can only occur "+
386
471
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
472
+ elsif @source.match("?", true)
388
473
  return process_instruction
389
474
  else
390
475
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
476
+ md = @source.match(Private::TAG_PATTERN, true)
392
477
  unless md
478
+ @source.position = start_position
393
479
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
480
  end
481
+ tag = md[1]
395
482
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
483
+ @prefixes.clear
484
+ @prefixes << md[2] if md[2]
485
+ push_namespaces_restore
486
+ attributes, closed = parse_attributes(@prefixes)
400
487
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
488
+ for prefix in @prefixes
489
+ unless @namespaces.key?(prefix)
403
490
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
491
  end
405
492
  end
406
493
 
407
494
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
495
+ @closed = tag
496
+ pop_namespaces_restore
410
497
  else
411
- @tags.push( md[1] )
498
+ if @tags.empty? and @have_root
499
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
500
+ end
501
+ @tags.push( tag )
412
502
  end
413
- return [ :start_element, md[1], attributes ]
503
+ @have_root = true
504
+ return [ :start_element, tag, attributes ]
414
505
  end
415
506
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
507
+ text = @source.read_until("<")
508
+ if text.chomp!("<")
509
+ @source.position -= "<".bytesize
419
510
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
511
+ if @tags.empty?
512
+ unless /\A\s*\z/.match?(text)
513
+ if @have_root
514
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
515
+ else
516
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
517
+ end
518
+ end
519
+ return pull_event if @have_root
520
+ end
521
+ return [ :text, text ]
425
522
  end
426
523
  rescue REXML::UndefinedNamespaceException
427
524
  raise
@@ -436,13 +533,13 @@ module REXML
436
533
  private :pull_event
437
534
 
438
535
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
536
+ return unless entities
537
+
538
+ value = entities[ reference ]
539
+ return if value.nil?
540
+
541
+ record_entity_expansion
542
+ unnormalize( value, entities )
446
543
  end
447
544
 
448
545
  # Escapes all possible entities
@@ -463,35 +560,83 @@ module REXML
463
560
 
464
561
  # Unescapes all possible entities
465
562
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
563
+ if string.include?("\r")
564
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
565
+ else
566
+ rv = string.dup
567
+ end
468
568
  matches = rv.scan( REFERENCE_RE )
469
569
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
570
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
571
  m=$1
472
572
  m = "0#{m}" if m[0] == ?x
473
573
  [Integer(m)].pack('U*')
474
574
  }
475
575
  matches.collect!{|x|x[0]}.compact!
576
+ if filter
577
+ matches.reject! do |entity_reference|
578
+ filter.include?(entity_reference)
579
+ end
580
+ end
476
581
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
582
+ matches.tally.each do |entity_reference, n|
583
+ entity_expansion_count_before = @entity_expansion_count
584
+ entity_value = entity( entity_reference, entities )
585
+ if entity_value
586
+ if n > 1
587
+ entity_expansion_count_delta =
588
+ @entity_expansion_count - entity_expansion_count_before
589
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
590
+ end
591
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
592
+ rv.gsub!( re, entity_value )
593
+ if rv.bytesize > @entity_expansion_text_limit
594
+ raise "entity expansion has grown too large"
486
595
  end
596
+ else
597
+ er = DEFAULT_ENTITIES[entity_reference]
598
+ rv.gsub!( er[0], er[2] ) if er
487
599
  end
488
600
  end
489
- rv.gsub!( /&amp;/, '&' )
601
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
602
  end
491
603
  rv
492
604
  end
493
605
 
494
606
  private
607
+ def add_namespace(prefix, uri)
608
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
609
+ if uri.nil?
610
+ @namespaces.delete(prefix)
611
+ else
612
+ @namespaces[prefix] = uri
613
+ end
614
+ end
615
+
616
+ def push_namespaces_restore
617
+ namespaces_restore = {}
618
+ @namespaces_restore_stack.push(namespaces_restore)
619
+ namespaces_restore
620
+ end
621
+
622
+ def pop_namespaces_restore
623
+ namespaces_restore = @namespaces_restore_stack.pop
624
+ namespaces_restore.each do |prefix, uri|
625
+ if uri.nil?
626
+ @namespaces.delete(prefix)
627
+ else
628
+ @namespaces[prefix] = uri
629
+ end
630
+ end
631
+ end
632
+
633
+ def record_entity_expansion(delta=1)
634
+ @entity_expansion_count += delta
635
+ if @entity_expansion_count > @entity_expansion_limit
636
+ raise "number of entity expansions exceeded, processing aborted."
637
+ end
638
+ end
639
+
495
640
  def need_source_encoding_update?(xml_declaration_encoding)
496
641
  return false if xml_declaration_encoding.nil?
497
642
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +644,16 @@ module REXML
499
644
  end
500
645
 
501
646
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
647
+ md = @source.match(Private::NAME_PATTERN, true)
503
648
  unless md
504
- if @source.match(/\A\s*\S/um)
649
+ if @source.match(/\S/um)
505
650
  message = "#{base_error_message}: invalid name"
506
651
  else
507
652
  message = "#{base_error_message}: name is missing"
508
653
  end
509
654
  raise REXML::ParseException.new(message, @source)
510
655
  end
511
- md[1]
656
+ md[0]
512
657
  end
513
658
 
514
659
  def parse_id(base_error_message,
@@ -578,96 +723,114 @@ module REXML
578
723
  end
579
724
 
580
725
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
726
+ name = parse_name("Malformed XML: Invalid processing instruction node")
727
+ if @source.match(/\s+/um, true)
728
+ match_data = @source.match(/(.*?)\?>/um, true)
729
+ unless match_data
730
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
731
+ end
732
+ content = match_data[1]
733
+ else
734
+ content = nil
735
+ unless @source.match("?>", true)
736
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
737
+ end
738
+ end
739
+ if name == "xml"
740
+ if @document_status
741
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
742
+ end
743
+ version = VERSION.match(content)
744
+ version = version[1] unless version.nil?
745
+ encoding = ENCODING.match(content)
746
+ encoding = encoding[1] unless encoding.nil?
747
+ if need_source_encoding_update?(encoding)
748
+ @source.encoding = encoding
749
+ end
750
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
751
+ encoding = "UTF-16"
752
+ end
753
+ standalone = STANDALONE.match(content)
754
+ standalone = standalone[1] unless standalone.nil?
755
+ return [ :xmldecl, version, encoding, standalone ]
585
756
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
757
+ [:processing_instruction, name, content]
587
758
  end
588
759
 
589
- def parse_attributes(prefixes, curr_ns)
760
+ def parse_attributes(prefixes)
590
761
  attributes = {}
762
+ expanded_names = {}
591
763
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
764
+ while true
765
+ if @source.match(">", true)
766
+ return attributes, closed
767
+ elsif @source.match("/>", true)
768
+ closed = true
769
+ return attributes, closed
770
+ elsif match = @source.match(QNAME, true)
771
+ name = match[1]
772
+ prefix = match[2]
773
+ local_part = match[3]
774
+
775
+ unless @source.match(/\s*=\s*/um, true)
618
776
  message = "Missing attribute equal: <#{name}>"
619
777
  raise REXML::ParseException.new(message, @source)
620
778
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
779
+ unless match = @source.match(/(['"])/, true)
623
780
  message = "Missing attribute value start quote: <#{name}>"
624
781
  raise REXML::ParseException.new(message, @source)
625
782
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
783
+ quote = match[1]
784
+ start_position = @source.position
785
+ value = @source.read_until(quote)
786
+ unless value.chomp!(quote)
787
+ @source.position = start_position
788
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
789
  raise REXML::ParseException.new(message, @source)
639
790
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
791
+ @source.match(/\s*/um, true)
792
+ if prefix == "xmlns"
793
+ if local_part == "xml"
794
+ if value != Private::XML_PREFIXED_NAMESPACE
795
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
796
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
797
+ raise REXML::ParseException.new( msg, @source, self )
798
+ end
799
+ elsif local_part == "xmlns"
800
+ msg = "The 'xmlns' prefix must not be declared "+
650
801
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
802
+ raise REXML::ParseException.new( msg, @source, self)
652
803
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
804
+ add_namespace(local_part, value)
805
+ elsif prefix
806
+ prefixes << prefix unless prefix == "xml"
657
807
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
808
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
809
+ if attributes[name]
810
+ msg = "Duplicate attribute #{name.inspect}"
811
+ raise REXML::ParseException.new(msg, @source, self)
812
+ end
667
813
 
668
- attributes[name] = value
814
+ unless prefix == "xmlns"
815
+ uri = @namespaces[prefix]
816
+ expanded_name = [uri, local_part]
817
+ existing_prefix = expanded_names[expanded_name]
818
+ if existing_prefix
819
+ message = "Namespace conflict in adding attribute " +
820
+ "\"#{local_part}\": " +
821
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
822
+ "prefix \"#{prefix}\" = \"#{uri}\""
823
+ raise REXML::ParseException.new(message, @source, self)
824
+ end
825
+ expanded_names[expanded_name] = prefix
826
+ end
827
+
828
+ attributes[name] = value
829
+ else
830
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
831
+ raise REXML::ParseException.new(message, @source)
832
+ end
669
833
  end
670
- return attributes, closed
671
834
  end
672
835
  end
673
836
  end