rexml 3.2.5 → 3.3.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,30 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ end
160
+ private_constant :Private
161
+
115
162
  def initialize( source )
116
163
  self.stream = source
117
164
  @listeners = []
165
+ @prefixes = Set.new
166
+ @entity_expansion_count = 0
118
167
  end
119
168
 
120
169
  def add_listener( listener )
@@ -122,15 +171,18 @@ module REXML
122
171
  end
123
172
 
124
173
  attr_reader :source
174
+ attr_reader :entity_expansion_count
125
175
 
126
176
  def stream=( source )
127
177
  @source = SourceFactory.create_from( source )
128
178
  @closed = nil
179
+ @have_root = false
129
180
  @document_status = nil
130
181
  @tags = []
131
182
  @stack = []
132
183
  @entities = []
133
- @nsstack = []
184
+ @namespaces = {}
185
+ @namespaces_restore_stack = []
134
186
  end
135
187
 
136
188
  def position
@@ -180,6 +232,8 @@ module REXML
180
232
 
181
233
  # Returns the next event. This is a +PullEvent+ object.
182
234
  def pull
235
+ @source.drop_parsed_content
236
+
183
237
  pull_event.tap do |event|
184
238
  @listeners.each do |listener|
185
239
  listener.receive event
@@ -192,236 +246,274 @@ module REXML
192
246
  x, @closed = @closed, nil
193
247
  return [ :end_element, x ]
194
248
  end
195
- return [ :end_document ] if empty?
249
+ if empty?
250
+ if @document_status == :in_doctype
251
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
252
+ end
253
+ unless @tags.empty?
254
+ path = "/" + @tags.join("/")
255
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
256
+ end
257
+ return [ :end_document ]
258
+ end
196
259
  return @stack.shift if @stack.size > 0
197
260
  #STDERR.puts @source.encoding
198
261
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
262
+
263
+ @source.ensure_buffer
199
264
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
265
+ start_position = @source.position
266
+ if @source.match("<?", true)
223
267
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
268
+ elsif @source.match("<!", true)
269
+ if @source.match("--", true)
270
+ md = @source.match(/(.*?)-->/um, true)
271
+ if md.nil?
272
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
273
  end
243
- if @source.match(/\A\s*\[/um, true)
274
+ if /--|-\z/.match?(md[1])
275
+ raise REXML::ParseException.new("Malformed comment", @source)
276
+ end
277
+ return [ :comment, md[1] ]
278
+ elsif @source.match("DOCTYPE", true)
279
+ base_error_message = "Malformed DOCTYPE"
280
+ unless @source.match(/\s+/um, true)
281
+ if @source.match(">")
282
+ message = "#{base_error_message}: name is missing"
283
+ else
284
+ message = "#{base_error_message}: invalid name"
285
+ end
286
+ @source.position = start_position
287
+ raise REXML::ParseException.new(message, @source)
288
+ end
289
+ name = parse_name(base_error_message)
290
+ if @source.match(/\s*\[/um, true)
291
+ id = [nil, nil, nil]
244
292
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
293
+ elsif @source.match(/\s*>/um, true)
294
+ id = [nil, nil, nil]
246
295
  @document_status = :after_doctype
296
+ @source.ensure_buffer
247
297
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
298
+ id = parse_id(base_error_message,
299
+ accept_external_id: true,
300
+ accept_public_id: false)
301
+ if id[0] == "SYSTEM"
302
+ # For backward compatibility
303
+ id[1], id[2] = id[2], nil
304
+ end
305
+ if @source.match(/\s*\[/um, true)
306
+ @document_status = :in_doctype
307
+ elsif @source.match(/\s*>/um, true)
308
+ @document_status = :after_doctype
309
+ @source.ensure_buffer
310
+ else
311
+ message = "#{base_error_message}: garbage after external ID"
312
+ raise REXML::ParseException.new(message, @source)
313
+ end
250
314
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
315
+ args = [:start_doctype, name, *id]
316
+ if @document_status == :after_doctype
317
+ @source.match(/\s*/um, true)
318
+ @stack << [ :end_doctype ]
319
+ end
320
+ return args
321
+ else
322
+ message = "Invalid XML"
323
+ raise REXML::ParseException.new(message, @source)
263
324
  end
264
325
  end
265
326
  end
266
327
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
328
+ @source.match(/\s*/um, true) # skip spaces
329
+ start_position = @source.position
330
+ if @source.match("<!", true)
331
+ if @source.match("ELEMENT", true)
332
+ md = @source.match(/(.*?)>/um, true)
333
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
334
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
335
+ elsif @source.match("ENTITY", true)
336
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
337
+ unless match_data
338
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
339
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
340
+ match = [:entitydecl, *match_data.captures.compact]
341
+ ref = false
342
+ if match[1] == '%'
343
+ ref = true
344
+ match.delete_at 1
345
+ end
346
+ # Now we have to sort out what kind of entity reference this is
347
+ if match[2] == 'SYSTEM'
348
+ # External reference
349
+ match[3] = match[3][1..-2] # PUBID
350
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
351
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
352
+ elsif match[2] == 'PUBLIC'
353
+ # External reference
354
+ match[3] = match[3][1..-2] # PUBID
355
+ match[4] = match[4][1..-2] # HREF
356
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
357
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
358
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
359
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
360
  else
329
- message = "#{base_error_message}: invalid declaration name"
361
+ match[2] = match[2][1..-2]
362
+ match.pop if match.size == 4
363
+ # match is [ :entity, name, value ]
330
364
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
365
+ match << '%' if ref
366
+ return match
367
+ elsif @source.match("ATTLIST", true)
368
+ md = @source.match(Private::ATTLISTDECL_END, true)
369
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
370
+ element = md[1]
371
+ contents = md[0]
372
+
373
+ pairs = {}
374
+ values = md[0].strip.scan( ATTDEF_RE )
375
+ values.each do |attdef|
376
+ unless attdef[3] == "#IMPLIED"
377
+ attdef.compact!
378
+ val = attdef[3]
379
+ val = attdef[4] if val == "#FIXED "
380
+ pairs[attdef[0]] = val
381
+ if attdef[0] =~ /^xmlns:(.*)/
382
+ @namespaces[$1] = val
383
+ end
384
+ end
385
+ end
386
+ return [ :attlistdecl, element, pairs, contents ]
387
+ elsif @source.match("NOTATION", true)
388
+ base_error_message = "Malformed notation declaration"
389
+ unless @source.match(/\s+/um, true)
390
+ if @source.match(">")
391
+ message = "#{base_error_message}: name is missing"
392
+ else
393
+ message = "#{base_error_message}: invalid name"
394
+ end
395
+ @source.position = start_position
396
+ raise REXML::ParseException.new(message, @source)
397
+ end
398
+ name = parse_name(base_error_message)
399
+ id = parse_id(base_error_message,
400
+ accept_external_id: true,
401
+ accept_public_id: true)
402
+ unless @source.match(/\s*>/um, true)
403
+ message = "#{base_error_message}: garbage before end >"
404
+ raise REXML::ParseException.new(message, @source)
405
+ end
406
+ return [:notationdecl, name, *id]
407
+ elsif md = @source.match(/--(.*?)-->/um, true)
408
+ case md[1]
409
+ when /--/, /-\z/
410
+ raise REXML::ParseException.new("Malformed comment", @source)
411
+ end
412
+ return [ :comment, md[1] ] if md
340
413
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
414
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
415
+ return [ :externalentity, match[1] ]
416
+ elsif @source.match(/\]\s*>/um, true)
343
417
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
418
  return [ :end_doctype ]
346
419
  end
420
+ if @document_status == :in_doctype
421
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
422
+ end
347
423
  end
348
424
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
425
+ @source.match(/\s*/um, true)
350
426
  end
351
427
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
428
+ start_position = @source.position
429
+ if @source.match("<", true)
430
+ # :text's read_until may remain only "<" in buffer. In the
431
+ # case, buffer is empty here. So we need to fill buffer
432
+ # here explicitly.
433
+ @source.ensure_buffer
434
+ if @source.match("/", true)
435
+ @namespaces_restore_stack.pop
356
436
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
437
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
438
  if md and !last_tag
359
439
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
440
  raise REXML::ParseException.new(message, @source)
361
441
  end
362
442
  if md.nil? or last_tag != md[1]
363
443
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
444
+ message += " (got '#{md[1]}')" if md
445
+ @source.position = start_position if md.nil?
365
446
  raise REXML::ParseException.new(message, @source)
366
447
  end
367
448
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
449
+ elsif @source.match("!", true)
450
+ md = @source.match(/([^>]*>)/um)
370
451
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
452
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
453
+ if md[0][0] == ?-
454
+ md = @source.match(/--(.*?)-->/um, true)
374
455
 
375
- case md[1]
376
- when /--/, /-\z/
456
+ if md.nil? || /--|-\z/.match?(md[1])
377
457
  raise REXML::ParseException.new("Malformed comment", @source)
378
458
  end
379
459
 
380
- return [ :comment, md[1] ] if md
460
+ return [ :comment, md[1] ]
381
461
  else
382
- md = @source.match( CDATA_PATTERN, true )
462
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
463
  return [ :cdata, md[1] ] if md
384
464
  end
385
465
  raise REXML::ParseException.new( "Declarations can only occur "+
386
466
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
467
+ elsif @source.match("?", true)
388
468
  return process_instruction
389
469
  else
390
470
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
471
+ md = @source.match(Private::TAG_PATTERN, true)
392
472
  unless md
473
+ @source.position = start_position
393
474
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
475
  end
476
+ tag = md[1]
395
477
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
478
+ @prefixes.clear
479
+ @prefixes << md[2] if md[2]
480
+ push_namespaces_restore
481
+ attributes, closed = parse_attributes(@prefixes)
400
482
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
483
+ for prefix in @prefixes
484
+ unless @namespaces.key?(prefix)
403
485
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
486
  end
405
487
  end
406
488
 
407
489
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
490
+ @closed = tag
491
+ pop_namespaces_restore
410
492
  else
411
- @tags.push( md[1] )
493
+ if @tags.empty? and @have_root
494
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
495
+ end
496
+ @tags.push( tag )
412
497
  end
413
- return [ :start_element, md[1], attributes ]
498
+ @have_root = true
499
+ return [ :start_element, tag, attributes ]
414
500
  end
415
501
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
502
+ text = @source.read_until("<")
503
+ if text.chomp!("<")
504
+ @source.position -= "<".bytesize
419
505
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
506
+ if @tags.empty?
507
+ unless /\A\s*\z/.match?(text)
508
+ if @have_root
509
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
510
+ else
511
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
512
+ end
513
+ end
514
+ return pull_event if @have_root
515
+ end
516
+ return [ :text, text ]
425
517
  end
426
518
  rescue REXML::UndefinedNamespaceException
427
519
  raise
@@ -436,13 +528,13 @@ module REXML
436
528
  private :pull_event
437
529
 
438
530
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
531
+ return unless entities
532
+
533
+ value = entities[ reference ]
534
+ return if value.nil?
535
+
536
+ record_entity_expansion
537
+ unnormalize( value, entities )
446
538
  end
447
539
 
448
540
  # Escapes all possible entities
@@ -463,35 +555,83 @@ module REXML
463
555
 
464
556
  # Unescapes all possible entities
465
557
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
558
+ if string.include?("\r")
559
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
560
+ else
561
+ rv = string.dup
562
+ end
468
563
  matches = rv.scan( REFERENCE_RE )
469
564
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
565
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
566
  m=$1
472
567
  m = "0#{m}" if m[0] == ?x
473
568
  [Integer(m)].pack('U*')
474
569
  }
475
570
  matches.collect!{|x|x[0]}.compact!
571
+ if filter
572
+ matches.reject! do |entity_reference|
573
+ filter.include?(entity_reference)
574
+ end
575
+ end
476
576
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
577
+ matches.tally.each do |entity_reference, n|
578
+ entity_expansion_count_before = @entity_expansion_count
579
+ entity_value = entity( entity_reference, entities )
580
+ if entity_value
581
+ if n > 1
582
+ entity_expansion_count_delta =
583
+ @entity_expansion_count - entity_expansion_count_before
584
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
585
+ end
586
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
587
+ rv.gsub!( re, entity_value )
588
+ if rv.bytesize > Security.entity_expansion_text_limit
589
+ raise "entity expansion has grown too large"
486
590
  end
591
+ else
592
+ er = DEFAULT_ENTITIES[entity_reference]
593
+ rv.gsub!( er[0], er[2] ) if er
487
594
  end
488
595
  end
489
- rv.gsub!( /&amp;/, '&' )
596
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
597
  end
491
598
  rv
492
599
  end
493
600
 
494
601
  private
602
+ def add_namespace(prefix, uri)
603
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
604
+ if uri.nil?
605
+ @namespaces.delete(prefix)
606
+ else
607
+ @namespaces[prefix] = uri
608
+ end
609
+ end
610
+
611
+ def push_namespaces_restore
612
+ namespaces_restore = {}
613
+ @namespaces_restore_stack.push(namespaces_restore)
614
+ namespaces_restore
615
+ end
616
+
617
+ def pop_namespaces_restore
618
+ namespaces_restore = @namespaces_restore_stack.pop
619
+ namespaces_restore.each do |prefix, uri|
620
+ if uri.nil?
621
+ @namespaces.delete(prefix)
622
+ else
623
+ @namespaces[prefix] = uri
624
+ end
625
+ end
626
+ end
627
+
628
+ def record_entity_expansion(delta=1)
629
+ @entity_expansion_count += delta
630
+ if @entity_expansion_count > Security.entity_expansion_limit
631
+ raise "number of entity expansions exceeded, processing aborted."
632
+ end
633
+ end
634
+
495
635
  def need_source_encoding_update?(xml_declaration_encoding)
496
636
  return false if xml_declaration_encoding.nil?
497
637
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +639,16 @@ module REXML
499
639
  end
500
640
 
501
641
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
642
+ md = @source.match(Private::NAME_PATTERN, true)
503
643
  unless md
504
- if @source.match(/\A\s*\S/um)
644
+ if @source.match(/\S/um)
505
645
  message = "#{base_error_message}: invalid name"
506
646
  else
507
647
  message = "#{base_error_message}: name is missing"
508
648
  end
509
649
  raise REXML::ParseException.new(message, @source)
510
650
  end
511
- md[1]
651
+ md[0]
512
652
  end
513
653
 
514
654
  def parse_id(base_error_message,
@@ -578,96 +718,114 @@ module REXML
578
718
  end
579
719
 
580
720
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
721
+ name = parse_name("Malformed XML: Invalid processing instruction node")
722
+ if @source.match(/\s+/um, true)
723
+ match_data = @source.match(/(.*?)\?>/um, true)
724
+ unless match_data
725
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
726
+ end
727
+ content = match_data[1]
728
+ else
729
+ content = nil
730
+ unless @source.match("?>", true)
731
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
732
+ end
733
+ end
734
+ if name == "xml"
735
+ if @document_status
736
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
737
+ end
738
+ version = VERSION.match(content)
739
+ version = version[1] unless version.nil?
740
+ encoding = ENCODING.match(content)
741
+ encoding = encoding[1] unless encoding.nil?
742
+ if need_source_encoding_update?(encoding)
743
+ @source.encoding = encoding
744
+ end
745
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
746
+ encoding = "UTF-16"
747
+ end
748
+ standalone = STANDALONE.match(content)
749
+ standalone = standalone[1] unless standalone.nil?
750
+ return [ :xmldecl, version, encoding, standalone ]
585
751
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
752
+ [:processing_instruction, name, content]
587
753
  end
588
754
 
589
- def parse_attributes(prefixes, curr_ns)
755
+ def parse_attributes(prefixes)
590
756
  attributes = {}
757
+ expanded_names = {}
591
758
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
759
+ while true
760
+ if @source.match(">", true)
761
+ return attributes, closed
762
+ elsif @source.match("/>", true)
763
+ closed = true
764
+ return attributes, closed
765
+ elsif match = @source.match(QNAME, true)
766
+ name = match[1]
767
+ prefix = match[2]
768
+ local_part = match[3]
769
+
770
+ unless @source.match(/\s*=\s*/um, true)
618
771
  message = "Missing attribute equal: <#{name}>"
619
772
  raise REXML::ParseException.new(message, @source)
620
773
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
774
+ unless match = @source.match(/(['"])/, true)
623
775
  message = "Missing attribute value start quote: <#{name}>"
624
776
  raise REXML::ParseException.new(message, @source)
625
777
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
778
+ quote = match[1]
779
+ start_position = @source.position
780
+ value = @source.read_until(quote)
781
+ unless value.chomp!(quote)
782
+ @source.position = start_position
783
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
784
  raise REXML::ParseException.new(message, @source)
639
785
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
786
+ @source.match(/\s*/um, true)
787
+ if prefix == "xmlns"
788
+ if local_part == "xml"
789
+ if value != "http://www.w3.org/XML/1998/namespace"
790
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
791
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
792
+ raise REXML::ParseException.new( msg, @source, self )
793
+ end
794
+ elsif local_part == "xmlns"
795
+ msg = "The 'xmlns' prefix must not be declared "+
650
796
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
797
+ raise REXML::ParseException.new( msg, @source, self)
652
798
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
799
+ add_namespace(local_part, value)
800
+ elsif prefix
801
+ prefixes << prefix unless prefix == "xml"
657
802
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
803
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
804
+ if attributes[name]
805
+ msg = "Duplicate attribute #{name.inspect}"
806
+ raise REXML::ParseException.new(msg, @source, self)
807
+ end
667
808
 
668
- attributes[name] = value
809
+ unless prefix == "xmlns"
810
+ uri = @namespaces[prefix]
811
+ expanded_name = [uri, local_part]
812
+ existing_prefix = expanded_names[expanded_name]
813
+ if existing_prefix
814
+ message = "Namespace conflict in adding attribute " +
815
+ "\"#{local_part}\": " +
816
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
817
+ "prefix \"#{prefix}\" = \"#{uri}\""
818
+ raise REXML::ParseException.new(message, @source, self)
819
+ end
820
+ expanded_names[expanded_name] = prefix
821
+ end
822
+
823
+ attributes[name] = value
824
+ else
825
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
826
+ raise REXML::ParseException.new(message, @source)
827
+ end
669
828
  end
670
- return attributes, closed
671
829
  end
672
830
  end
673
831
  end