rexml 3.2.6 → 3.3.6

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,30 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ end
160
+ private_constant :Private
161
+
115
162
  def initialize( source )
116
163
  self.stream = source
117
164
  @listeners = []
165
+ @prefixes = Set.new
166
+ @entity_expansion_count = 0
118
167
  end
119
168
 
120
169
  def add_listener( listener )
@@ -122,15 +171,18 @@ module REXML
122
171
  end
123
172
 
124
173
  attr_reader :source
174
+ attr_reader :entity_expansion_count
125
175
 
126
176
  def stream=( source )
127
177
  @source = SourceFactory.create_from( source )
128
178
  @closed = nil
179
+ @have_root = false
129
180
  @document_status = nil
130
181
  @tags = []
131
182
  @stack = []
132
183
  @entities = []
133
- @nsstack = []
184
+ @namespaces = {}
185
+ @namespaces_restore_stack = []
134
186
  end
135
187
 
136
188
  def position
@@ -180,6 +232,8 @@ module REXML
180
232
 
181
233
  # Returns the next event. This is a +PullEvent+ object.
182
234
  def pull
235
+ @source.drop_parsed_content
236
+
183
237
  pull_event.tap do |event|
184
238
  @listeners.each do |listener|
185
239
  listener.receive event
@@ -192,236 +246,274 @@ module REXML
192
246
  x, @closed = @closed, nil
193
247
  return [ :end_element, x ]
194
248
  end
195
- return [ :end_document ] if empty?
249
+ if empty?
250
+ if @document_status == :in_doctype
251
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
252
+ end
253
+ unless @tags.empty?
254
+ path = "/" + @tags.join("/")
255
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
256
+ end
257
+ return [ :end_document ]
258
+ end
196
259
  return @stack.shift if @stack.size > 0
197
260
  #STDERR.puts @source.encoding
198
261
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
262
+
263
+ @source.ensure_buffer
199
264
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
265
+ start_position = @source.position
266
+ if @source.match("<?", true)
223
267
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
268
+ elsif @source.match("<!", true)
269
+ if @source.match("--", true)
270
+ md = @source.match(/(.*?)-->/um, true)
271
+ if md.nil?
272
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
273
  end
243
- if @source.match(/\A\s*\[/um, true)
274
+ if /--|-\z/.match?(md[1])
275
+ raise REXML::ParseException.new("Malformed comment", @source)
276
+ end
277
+ return [ :comment, md[1] ]
278
+ elsif @source.match("DOCTYPE", true)
279
+ base_error_message = "Malformed DOCTYPE"
280
+ unless @source.match(/\s+/um, true)
281
+ if @source.match(">")
282
+ message = "#{base_error_message}: name is missing"
283
+ else
284
+ message = "#{base_error_message}: invalid name"
285
+ end
286
+ @source.position = start_position
287
+ raise REXML::ParseException.new(message, @source)
288
+ end
289
+ name = parse_name(base_error_message)
290
+ if @source.match(/\s*\[/um, true)
291
+ id = [nil, nil, nil]
244
292
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
293
+ elsif @source.match(/\s*>/um, true)
294
+ id = [nil, nil, nil]
246
295
  @document_status = :after_doctype
296
+ @source.ensure_buffer
247
297
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
298
+ id = parse_id(base_error_message,
299
+ accept_external_id: true,
300
+ accept_public_id: false)
301
+ if id[0] == "SYSTEM"
302
+ # For backward compatibility
303
+ id[1], id[2] = id[2], nil
304
+ end
305
+ if @source.match(/\s*\[/um, true)
306
+ @document_status = :in_doctype
307
+ elsif @source.match(/\s*>/um, true)
308
+ @document_status = :after_doctype
309
+ @source.ensure_buffer
310
+ else
311
+ message = "#{base_error_message}: garbage after external ID"
312
+ raise REXML::ParseException.new(message, @source)
313
+ end
250
314
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
315
+ args = [:start_doctype, name, *id]
316
+ if @document_status == :after_doctype
317
+ @source.match(/\s*/um, true)
318
+ @stack << [ :end_doctype ]
319
+ end
320
+ return args
321
+ else
322
+ message = "Invalid XML"
323
+ raise REXML::ParseException.new(message, @source)
263
324
  end
264
325
  end
265
326
  end
266
327
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
328
+ @source.match(/\s*/um, true) # skip spaces
329
+ start_position = @source.position
330
+ if @source.match("<!", true)
331
+ if @source.match("ELEMENT", true)
332
+ md = @source.match(/(.*?)>/um, true)
333
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
334
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
335
+ elsif @source.match("ENTITY", true)
336
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
337
+ unless match_data
338
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
339
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
340
+ match = [:entitydecl, *match_data.captures.compact]
341
+ ref = false
342
+ if match[1] == '%'
343
+ ref = true
344
+ match.delete_at 1
345
+ end
346
+ # Now we have to sort out what kind of entity reference this is
347
+ if match[2] == 'SYSTEM'
348
+ # External reference
349
+ match[3] = match[3][1..-2] # PUBID
350
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
351
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
352
+ elsif match[2] == 'PUBLIC'
353
+ # External reference
354
+ match[3] = match[3][1..-2] # PUBID
355
+ match[4] = match[4][1..-2] # HREF
356
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
357
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
358
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
359
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
360
  else
329
- message = "#{base_error_message}: invalid declaration name"
361
+ match[2] = match[2][1..-2]
362
+ match.pop if match.size == 4
363
+ # match is [ :entity, name, value ]
330
364
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
365
+ match << '%' if ref
366
+ return match
367
+ elsif @source.match("ATTLIST", true)
368
+ md = @source.match(Private::ATTLISTDECL_END, true)
369
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
370
+ element = md[1]
371
+ contents = md[0]
372
+
373
+ pairs = {}
374
+ values = md[0].strip.scan( ATTDEF_RE )
375
+ values.each do |attdef|
376
+ unless attdef[3] == "#IMPLIED"
377
+ attdef.compact!
378
+ val = attdef[3]
379
+ val = attdef[4] if val == "#FIXED "
380
+ pairs[attdef[0]] = val
381
+ if attdef[0] =~ /^xmlns:(.*)/
382
+ @namespaces[$1] = val
383
+ end
384
+ end
385
+ end
386
+ return [ :attlistdecl, element, pairs, contents ]
387
+ elsif @source.match("NOTATION", true)
388
+ base_error_message = "Malformed notation declaration"
389
+ unless @source.match(/\s+/um, true)
390
+ if @source.match(">")
391
+ message = "#{base_error_message}: name is missing"
392
+ else
393
+ message = "#{base_error_message}: invalid name"
394
+ end
395
+ @source.position = start_position
396
+ raise REXML::ParseException.new(message, @source)
397
+ end
398
+ name = parse_name(base_error_message)
399
+ id = parse_id(base_error_message,
400
+ accept_external_id: true,
401
+ accept_public_id: true)
402
+ unless @source.match(/\s*>/um, true)
403
+ message = "#{base_error_message}: garbage before end >"
404
+ raise REXML::ParseException.new(message, @source)
405
+ end
406
+ return [:notationdecl, name, *id]
407
+ elsif md = @source.match(/--(.*?)-->/um, true)
408
+ case md[1]
409
+ when /--/, /-\z/
410
+ raise REXML::ParseException.new("Malformed comment", @source)
411
+ end
412
+ return [ :comment, md[1] ] if md
340
413
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
414
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
415
+ return [ :externalentity, match[1] ]
416
+ elsif @source.match(/\]\s*>/um, true)
343
417
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
418
  return [ :end_doctype ]
346
419
  end
420
+ if @document_status == :in_doctype
421
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
422
+ end
347
423
  end
348
424
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
425
+ @source.match(/\s*/um, true)
350
426
  end
351
427
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
428
+ start_position = @source.position
429
+ if @source.match("<", true)
430
+ # :text's read_until may leave only "<" in the buffer. In that
431
+ # case, the buffer is empty here once "<" has been consumed, so
432
+ # we need to fill the buffer explicitly.
433
+ @source.ensure_buffer
434
+ if @source.match("/", true)
435
+ @namespaces_restore_stack.pop
356
436
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
437
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
438
  if md and !last_tag
359
439
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
440
  raise REXML::ParseException.new(message, @source)
361
441
  end
362
442
  if md.nil? or last_tag != md[1]
363
443
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
444
+ message += " (got '#{md[1]}')" if md
445
+ @source.position = start_position if md.nil?
365
446
  raise REXML::ParseException.new(message, @source)
366
447
  end
367
448
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
449
+ elsif @source.match("!", true)
450
+ md = @source.match(/([^>]*>)/um)
370
451
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
452
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
453
+ if md[0][0] == ?-
454
+ md = @source.match(/--(.*?)-->/um, true)
374
455
 
375
- case md[1]
376
- when /--/, /-\z/
456
+ if md.nil? || /--|-\z/.match?(md[1])
377
457
  raise REXML::ParseException.new("Malformed comment", @source)
378
458
  end
379
459
 
380
- return [ :comment, md[1] ] if md
460
+ return [ :comment, md[1] ]
381
461
  else
382
- md = @source.match( CDATA_PATTERN, true )
462
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
463
  return [ :cdata, md[1] ] if md
384
464
  end
385
465
  raise REXML::ParseException.new( "Declarations can only occur "+
386
466
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
467
+ elsif @source.match("?", true)
388
468
  return process_instruction
389
469
  else
390
470
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
471
+ md = @source.match(Private::TAG_PATTERN, true)
392
472
  unless md
473
+ @source.position = start_position
393
474
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
475
  end
476
+ tag = md[1]
395
477
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
478
+ @prefixes.clear
479
+ @prefixes << md[2] if md[2]
480
+ push_namespaces_restore
481
+ attributes, closed = parse_attributes(@prefixes)
400
482
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
483
+ for prefix in @prefixes
484
+ unless @namespaces.key?(prefix)
403
485
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
486
  end
405
487
  end
406
488
 
407
489
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
490
+ @closed = tag
491
+ pop_namespaces_restore
410
492
  else
411
- @tags.push( md[1] )
493
+ if @tags.empty? and @have_root
494
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
495
+ end
496
+ @tags.push( tag )
412
497
  end
413
- return [ :start_element, md[1], attributes ]
498
+ @have_root = true
499
+ return [ :start_element, tag, attributes ]
414
500
  end
415
501
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
502
+ text = @source.read_until("<")
503
+ if text.chomp!("<")
504
+ @source.position -= "<".bytesize
419
505
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
506
+ if @tags.empty?
507
+ unless /\A\s*\z/.match?(text)
508
+ if @have_root
509
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
510
+ else
511
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
512
+ end
513
+ end
514
+ return pull_event if @have_root
515
+ end
516
+ return [ :text, text ]
425
517
  end
426
518
  rescue REXML::UndefinedNamespaceException
427
519
  raise
@@ -436,13 +528,13 @@ module REXML
436
528
  private :pull_event
437
529
 
438
530
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
531
+ return unless entities
532
+
533
+ value = entities[ reference ]
534
+ return if value.nil?
535
+
536
+ record_entity_expansion
537
+ unnormalize( value, entities )
446
538
  end
447
539
 
448
540
  # Escapes all possible entities
@@ -463,35 +555,83 @@ module REXML
463
555
 
464
556
  # Unescapes all possible entities
465
557
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
558
+ if string.include?("\r")
559
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
560
+ else
561
+ rv = string.dup
562
+ end
468
563
  matches = rv.scan( REFERENCE_RE )
469
564
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
565
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
566
  m=$1
472
567
  m = "0#{m}" if m[0] == ?x
473
568
  [Integer(m)].pack('U*')
474
569
  }
475
570
  matches.collect!{|x|x[0]}.compact!
571
+ if filter
572
+ matches.reject! do |entity_reference|
573
+ filter.include?(entity_reference)
574
+ end
575
+ end
476
576
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
577
+ matches.tally.each do |entity_reference, n|
578
+ entity_expansion_count_before = @entity_expansion_count
579
+ entity_value = entity( entity_reference, entities )
580
+ if entity_value
581
+ if n > 1
582
+ entity_expansion_count_delta =
583
+ @entity_expansion_count - entity_expansion_count_before
584
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
585
+ end
586
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
587
+ rv.gsub!( re, entity_value )
588
+ if rv.bytesize > Security.entity_expansion_text_limit
589
+ raise "entity expansion has grown too large"
486
590
  end
591
+ else
592
+ er = DEFAULT_ENTITIES[entity_reference]
593
+ rv.gsub!( er[0], er[2] ) if er
487
594
  end
488
595
  end
489
- rv.gsub!( /&amp;/, '&' )
596
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
597
  end
491
598
  rv
492
599
  end
493
600
 
494
601
  private
602
+ def add_namespace(prefix, uri)
603
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
604
+ if uri.nil?
605
+ @namespaces.delete(prefix)
606
+ else
607
+ @namespaces[prefix] = uri
608
+ end
609
+ end
610
+
611
+ def push_namespaces_restore
612
+ namespaces_restore = {}
613
+ @namespaces_restore_stack.push(namespaces_restore)
614
+ namespaces_restore
615
+ end
616
+
617
+ def pop_namespaces_restore
618
+ namespaces_restore = @namespaces_restore_stack.pop
619
+ namespaces_restore.each do |prefix, uri|
620
+ if uri.nil?
621
+ @namespaces.delete(prefix)
622
+ else
623
+ @namespaces[prefix] = uri
624
+ end
625
+ end
626
+ end
627
+
628
+ def record_entity_expansion(delta=1)
629
+ @entity_expansion_count += delta
630
+ if @entity_expansion_count > Security.entity_expansion_limit
631
+ raise "number of entity expansions exceeded, processing aborted."
632
+ end
633
+ end
634
+
495
635
  def need_source_encoding_update?(xml_declaration_encoding)
496
636
  return false if xml_declaration_encoding.nil?
497
637
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +639,16 @@ module REXML
499
639
  end
500
640
 
501
641
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
642
+ md = @source.match(Private::NAME_PATTERN, true)
503
643
  unless md
504
- if @source.match(/\A\s*\S/um)
644
+ if @source.match(/\S/um)
505
645
  message = "#{base_error_message}: invalid name"
506
646
  else
507
647
  message = "#{base_error_message}: name is missing"
508
648
  end
509
649
  raise REXML::ParseException.new(message, @source)
510
650
  end
511
- md[1]
651
+ md[0]
512
652
  end
513
653
 
514
654
  def parse_id(base_error_message,
@@ -578,96 +718,114 @@ module REXML
578
718
  end
579
719
 
580
720
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
721
+ name = parse_name("Malformed XML: Invalid processing instruction node")
722
+ if @source.match(/\s+/um, true)
723
+ match_data = @source.match(/(.*?)\?>/um, true)
724
+ unless match_data
725
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
726
+ end
727
+ content = match_data[1]
728
+ else
729
+ content = nil
730
+ unless @source.match("?>", true)
731
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
732
+ end
733
+ end
734
+ if name == "xml"
735
+ if @document_status
736
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
737
+ end
738
+ version = VERSION.match(content)
739
+ version = version[1] unless version.nil?
740
+ encoding = ENCODING.match(content)
741
+ encoding = encoding[1] unless encoding.nil?
742
+ if need_source_encoding_update?(encoding)
743
+ @source.encoding = encoding
744
+ end
745
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
746
+ encoding = "UTF-16"
747
+ end
748
+ standalone = STANDALONE.match(content)
749
+ standalone = standalone[1] unless standalone.nil?
750
+ return [ :xmldecl, version, encoding, standalone ]
585
751
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
752
+ [:processing_instruction, name, content]
587
753
  end
588
754
 
589
- def parse_attributes(prefixes, curr_ns)
755
+ def parse_attributes(prefixes)
590
756
  attributes = {}
757
+ expanded_names = {}
591
758
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
759
+ while true
760
+ if @source.match(">", true)
761
+ return attributes, closed
762
+ elsif @source.match("/>", true)
763
+ closed = true
764
+ return attributes, closed
765
+ elsif match = @source.match(QNAME, true)
766
+ name = match[1]
767
+ prefix = match[2]
768
+ local_part = match[3]
769
+
770
+ unless @source.match(/\s*=\s*/um, true)
618
771
  message = "Missing attribute equal: <#{name}>"
619
772
  raise REXML::ParseException.new(message, @source)
620
773
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
774
+ unless match = @source.match(/(['"])/, true)
623
775
  message = "Missing attribute value start quote: <#{name}>"
624
776
  raise REXML::ParseException.new(message, @source)
625
777
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
778
+ quote = match[1]
779
+ start_position = @source.position
780
+ value = @source.read_until(quote)
781
+ unless value.chomp!(quote)
782
+ @source.position = start_position
783
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
784
  raise REXML::ParseException.new(message, @source)
639
785
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
786
+ @source.match(/\s*/um, true)
787
+ if prefix == "xmlns"
788
+ if local_part == "xml"
789
+ if value != "http://www.w3.org/XML/1998/namespace"
790
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
791
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
792
+ raise REXML::ParseException.new( msg, @source, self )
793
+ end
794
+ elsif local_part == "xmlns"
795
+ msg = "The 'xmlns' prefix must not be declared "+
650
796
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
797
+ raise REXML::ParseException.new( msg, @source, self)
652
798
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
799
+ add_namespace(local_part, value)
800
+ elsif prefix
801
+ prefixes << prefix unless prefix == "xml"
657
802
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
803
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
804
+ if attributes[name]
805
+ msg = "Duplicate attribute #{name.inspect}"
806
+ raise REXML::ParseException.new(msg, @source, self)
807
+ end
667
808
 
668
- attributes[name] = value
809
+ unless prefix == "xmlns"
810
+ uri = @namespaces[prefix]
811
+ expanded_name = [uri, local_part]
812
+ existing_prefix = expanded_names[expanded_name]
813
+ if existing_prefix
814
+ message = "Namespace conflict in adding attribute " +
815
+ "\"#{local_part}\": " +
816
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
817
+ "prefix \"#{prefix}\" = \"#{uri}\""
818
+ raise REXML::ParseException.new(message, @source, self)
819
+ end
820
+ expanded_names[expanded_name] = prefix
821
+ end
822
+
823
+ attributes[name] = value
824
+ else
825
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
826
+ raise REXML::ParseException.new(message, @source)
827
+ end
669
828
  end
670
- return attributes, closed
671
829
  end
672
830
  end
673
831
  end