rexml 3.2.6 → 3.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,20 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
128
184
  @closed = nil
185
+ @have_root = false
129
186
  @document_status = nil
130
187
  @tags = []
131
188
  @stack = []
132
189
  @entities = []
133
- @nsstack = []
190
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
191
+ @namespaces_restore_stack = []
134
192
  end
135
193
 
136
194
  def position
@@ -180,6 +238,8 @@ module REXML
180
238
 
181
239
  # Returns the next event. This is a +PullEvent+ object.
182
240
  def pull
241
+ @source.drop_parsed_content
242
+
183
243
  pull_event.tap do |event|
184
244
  @listeners.each do |listener|
185
245
  listener.receive event
@@ -192,236 +252,274 @@ module REXML
192
252
  x, @closed = @closed, nil
193
253
  return [ :end_element, x ]
194
254
  end
195
- return [ :end_document ] if empty?
255
+ if empty?
256
+ if @document_status == :in_doctype
257
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
258
+ end
259
+ unless @tags.empty?
260
+ path = "/" + @tags.join("/")
261
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
262
+ end
263
+ return [ :end_document ]
264
+ end
196
265
  return @stack.shift if @stack.size > 0
197
266
  #STDERR.puts @source.encoding
198
267
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
268
+
269
+ @source.ensure_buffer
199
270
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
271
+ start_position = @source.position
272
+ if @source.match("<?", true)
223
273
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
274
+ elsif @source.match("<!", true)
275
+ if @source.match("--", true)
276
+ md = @source.match(/(.*?)-->/um, true)
277
+ if md.nil?
278
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
279
  end
243
- if @source.match(/\A\s*\[/um, true)
280
+ if /--|-\z/.match?(md[1])
281
+ raise REXML::ParseException.new("Malformed comment", @source)
282
+ end
283
+ return [ :comment, md[1] ]
284
+ elsif @source.match("DOCTYPE", true)
285
+ base_error_message = "Malformed DOCTYPE"
286
+ unless @source.match(/\s+/um, true)
287
+ if @source.match(">")
288
+ message = "#{base_error_message}: name is missing"
289
+ else
290
+ message = "#{base_error_message}: invalid name"
291
+ end
292
+ @source.position = start_position
293
+ raise REXML::ParseException.new(message, @source)
294
+ end
295
+ name = parse_name(base_error_message)
296
+ if @source.match(/\s*\[/um, true)
297
+ id = [nil, nil, nil]
244
298
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
299
+ elsif @source.match(/\s*>/um, true)
300
+ id = [nil, nil, nil]
246
301
  @document_status = :after_doctype
302
+ @source.ensure_buffer
247
303
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
304
+ id = parse_id(base_error_message,
305
+ accept_external_id: true,
306
+ accept_public_id: false)
307
+ if id[0] == "SYSTEM"
308
+ # For backward compatibility
309
+ id[1], id[2] = id[2], nil
310
+ end
311
+ if @source.match(/\s*\[/um, true)
312
+ @document_status = :in_doctype
313
+ elsif @source.match(/\s*>/um, true)
314
+ @document_status = :after_doctype
315
+ @source.ensure_buffer
316
+ else
317
+ message = "#{base_error_message}: garbage after external ID"
318
+ raise REXML::ParseException.new(message, @source)
319
+ end
250
320
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
321
+ args = [:start_doctype, name, *id]
322
+ if @document_status == :after_doctype
323
+ @source.match(/\s*/um, true)
324
+ @stack << [ :end_doctype ]
325
+ end
326
+ return args
327
+ else
328
+ message = "Invalid XML"
329
+ raise REXML::ParseException.new(message, @source)
263
330
  end
264
331
  end
265
332
  end
266
333
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
334
+ @source.match(/\s*/um, true) # skip spaces
335
+ start_position = @source.position
336
+ if @source.match("<!", true)
337
+ if @source.match("ELEMENT", true)
338
+ md = @source.match(/(.*?)>/um, true)
339
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
340
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
341
+ elsif @source.match("ENTITY", true)
342
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
343
+ unless match_data
344
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
345
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
346
+ match = [:entitydecl, *match_data.captures.compact]
347
+ ref = false
348
+ if match[1] == '%'
349
+ ref = true
350
+ match.delete_at 1
351
+ end
352
+ # Now we have to sort out what kind of entity reference this is
353
+ if match[2] == 'SYSTEM'
354
+ # External reference
355
+ match[3] = match[3][1..-2] # PUBID
356
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
357
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
358
+ elsif match[2] == 'PUBLIC'
359
+ # External reference
360
+ match[3] = match[3][1..-2] # PUBID
361
+ match[4] = match[4][1..-2] # HREF
362
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
363
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
364
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
365
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
366
  else
329
- message = "#{base_error_message}: invalid declaration name"
367
+ match[2] = match[2][1..-2]
368
+ match.pop if match.size == 4
369
+ # match is [ :entity, name, value ]
330
370
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
371
+ match << '%' if ref
372
+ return match
373
+ elsif @source.match("ATTLIST", true)
374
+ md = @source.match(Private::ATTLISTDECL_END, true)
375
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
376
+ element = md[1]
377
+ contents = md[0]
378
+
379
+ pairs = {}
380
+ values = md[0].strip.scan( ATTDEF_RE )
381
+ values.each do |attdef|
382
+ unless attdef[3] == "#IMPLIED"
383
+ attdef.compact!
384
+ val = attdef[3]
385
+ val = attdef[4] if val == "#FIXED "
386
+ pairs[attdef[0]] = val
387
+ if attdef[0] =~ /^xmlns:(.*)/
388
+ @namespaces[$1] = val
389
+ end
390
+ end
391
+ end
392
+ return [ :attlistdecl, element, pairs, contents ]
393
+ elsif @source.match("NOTATION", true)
394
+ base_error_message = "Malformed notation declaration"
395
+ unless @source.match(/\s+/um, true)
396
+ if @source.match(">")
397
+ message = "#{base_error_message}: name is missing"
398
+ else
399
+ message = "#{base_error_message}: invalid name"
400
+ end
401
+ @source.position = start_position
402
+ raise REXML::ParseException.new(message, @source)
403
+ end
404
+ name = parse_name(base_error_message)
405
+ id = parse_id(base_error_message,
406
+ accept_external_id: true,
407
+ accept_public_id: true)
408
+ unless @source.match(/\s*>/um, true)
409
+ message = "#{base_error_message}: garbage before end >"
410
+ raise REXML::ParseException.new(message, @source)
411
+ end
412
+ return [:notationdecl, name, *id]
413
+ elsif md = @source.match(/--(.*?)-->/um, true)
414
+ case md[1]
415
+ when /--/, /-\z/
416
+ raise REXML::ParseException.new("Malformed comment", @source)
417
+ end
418
+ return [ :comment, md[1] ] if md
340
419
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
420
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
421
+ return [ :externalentity, match[1] ]
422
+ elsif @source.match(/\]\s*>/um, true)
343
423
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
424
  return [ :end_doctype ]
346
425
  end
426
+ if @document_status == :in_doctype
427
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
428
+ end
347
429
  end
348
430
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
431
+ @source.match(/\s*/um, true)
350
432
  end
351
433
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
434
+ start_position = @source.position
435
+ if @source.match("<", true)
436
+ # :text's read_until may remain only "<" in buffer. In the
437
+ # case, buffer is empty here. So we need to fill buffer
438
+ # here explicitly.
439
+ @source.ensure_buffer
440
+ if @source.match("/", true)
441
+ @namespaces_restore_stack.pop
356
442
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
443
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
444
  if md and !last_tag
359
445
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
446
  raise REXML::ParseException.new(message, @source)
361
447
  end
362
448
  if md.nil? or last_tag != md[1]
363
449
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
450
+ message += " (got '#{md[1]}')" if md
451
+ @source.position = start_position if md.nil?
365
452
  raise REXML::ParseException.new(message, @source)
366
453
  end
367
454
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
455
+ elsif @source.match("!", true)
456
+ md = @source.match(/([^>]*>)/um)
370
457
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
458
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
459
+ if md[0][0] == ?-
460
+ md = @source.match(/--(.*?)-->/um, true)
374
461
 
375
- case md[1]
376
- when /--/, /-\z/
462
+ if md.nil? || /--|-\z/.match?(md[1])
377
463
  raise REXML::ParseException.new("Malformed comment", @source)
378
464
  end
379
465
 
380
- return [ :comment, md[1] ] if md
466
+ return [ :comment, md[1] ]
381
467
  else
382
- md = @source.match( CDATA_PATTERN, true )
468
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
469
  return [ :cdata, md[1] ] if md
384
470
  end
385
471
  raise REXML::ParseException.new( "Declarations can only occur "+
386
472
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
473
+ elsif @source.match("?", true)
388
474
  return process_instruction
389
475
  else
390
476
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
477
+ md = @source.match(Private::TAG_PATTERN, true)
392
478
  unless md
479
+ @source.position = start_position
393
480
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
481
  end
482
+ tag = md[1]
395
483
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
484
+ @prefixes.clear
485
+ @prefixes << md[2] if md[2]
486
+ push_namespaces_restore
487
+ attributes, closed = parse_attributes(@prefixes)
400
488
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
489
+ for prefix in @prefixes
490
+ unless @namespaces.key?(prefix)
403
491
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
492
  end
405
493
  end
406
494
 
407
495
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
496
+ @closed = tag
497
+ pop_namespaces_restore
410
498
  else
411
- @tags.push( md[1] )
499
+ if @tags.empty? and @have_root
500
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
501
+ end
502
+ @tags.push( tag )
412
503
  end
413
- return [ :start_element, md[1], attributes ]
504
+ @have_root = true
505
+ return [ :start_element, tag, attributes ]
414
506
  end
415
507
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
508
+ text = @source.read_until("<")
509
+ if text.chomp!("<")
510
+ @source.position -= "<".bytesize
511
+ end
512
+ if @tags.empty?
513
+ unless /\A\s*\z/.match?(text)
514
+ if @have_root
515
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
516
+ else
517
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
518
+ end
519
+ end
520
+ return pull_event if @have_root
419
521
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
522
+ return [ :text, text ]
425
523
  end
426
524
  rescue REXML::UndefinedNamespaceException
427
525
  raise
@@ -436,13 +534,13 @@ module REXML
436
534
  private :pull_event
437
535
 
438
536
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
537
+ return unless entities
538
+
539
+ value = entities[ reference ]
540
+ return if value.nil?
541
+
542
+ record_entity_expansion
543
+ unnormalize( value, entities )
446
544
  end
447
545
 
448
546
  # Escapes all possible entities
@@ -463,35 +561,87 @@ module REXML
463
561
 
464
562
  # Unescapes all possible entities
465
563
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
564
+ if string.include?("\r")
565
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
566
+ else
567
+ rv = string.dup
568
+ end
468
569
  matches = rv.scan( REFERENCE_RE )
469
570
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
571
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
572
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
573
+ if m.start_with?("x")
574
+ code_point = Integer(m[1..-1], 16)
575
+ else
576
+ code_point = Integer(m, 10)
577
+ end
578
+ [code_point].pack('U*')
474
579
  }
475
580
  matches.collect!{|x|x[0]}.compact!
581
+ if filter
582
+ matches.reject! do |entity_reference|
583
+ filter.include?(entity_reference)
584
+ end
585
+ end
476
586
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
587
+ matches.tally.each do |entity_reference, n|
588
+ entity_expansion_count_before = @entity_expansion_count
589
+ entity_value = entity( entity_reference, entities )
590
+ if entity_value
591
+ if n > 1
592
+ entity_expansion_count_delta =
593
+ @entity_expansion_count - entity_expansion_count_before
594
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
595
+ end
596
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
597
+ rv.gsub!( re, entity_value )
598
+ if rv.bytesize > @entity_expansion_text_limit
599
+ raise "entity expansion has grown too large"
486
600
  end
601
+ else
602
+ er = DEFAULT_ENTITIES[entity_reference]
603
+ rv.gsub!( er[0], er[2] ) if er
487
604
  end
488
605
  end
489
- rv.gsub!( /&amp;/, '&' )
606
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
607
  end
491
608
  rv
492
609
  end
493
610
 
494
611
  private
612
+ def add_namespace(prefix, uri)
613
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
614
+ if uri.nil?
615
+ @namespaces.delete(prefix)
616
+ else
617
+ @namespaces[prefix] = uri
618
+ end
619
+ end
620
+
621
+ def push_namespaces_restore
622
+ namespaces_restore = {}
623
+ @namespaces_restore_stack.push(namespaces_restore)
624
+ namespaces_restore
625
+ end
626
+
627
+ def pop_namespaces_restore
628
+ namespaces_restore = @namespaces_restore_stack.pop
629
+ namespaces_restore.each do |prefix, uri|
630
+ if uri.nil?
631
+ @namespaces.delete(prefix)
632
+ else
633
+ @namespaces[prefix] = uri
634
+ end
635
+ end
636
+ end
637
+
638
+ def record_entity_expansion(delta=1)
639
+ @entity_expansion_count += delta
640
+ if @entity_expansion_count > @entity_expansion_limit
641
+ raise "number of entity expansions exceeded, processing aborted."
642
+ end
643
+ end
644
+
495
645
  def need_source_encoding_update?(xml_declaration_encoding)
496
646
  return false if xml_declaration_encoding.nil?
497
647
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +649,16 @@ module REXML
499
649
  end
500
650
 
501
651
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
652
+ md = @source.match(Private::NAME_PATTERN, true)
503
653
  unless md
504
- if @source.match(/\A\s*\S/um)
654
+ if @source.match(/\S/um)
505
655
  message = "#{base_error_message}: invalid name"
506
656
  else
507
657
  message = "#{base_error_message}: name is missing"
508
658
  end
509
659
  raise REXML::ParseException.new(message, @source)
510
660
  end
511
- md[1]
661
+ md[0]
512
662
  end
513
663
 
514
664
  def parse_id(base_error_message,
@@ -578,96 +728,114 @@ module REXML
578
728
  end
579
729
 
580
730
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
731
+ name = parse_name("Malformed XML: Invalid processing instruction node")
732
+ if @source.match(/\s+/um, true)
733
+ match_data = @source.match(/(.*?)\?>/um, true)
734
+ unless match_data
735
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736
+ end
737
+ content = match_data[1]
738
+ else
739
+ content = nil
740
+ unless @source.match("?>", true)
741
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
742
+ end
743
+ end
744
+ if name == "xml"
745
+ if @document_status
746
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
747
+ end
748
+ version = VERSION.match(content)
749
+ version = version[1] unless version.nil?
750
+ encoding = ENCODING.match(content)
751
+ encoding = encoding[1] unless encoding.nil?
752
+ if need_source_encoding_update?(encoding)
753
+ @source.encoding = encoding
754
+ end
755
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
756
+ encoding = "UTF-16"
757
+ end
758
+ standalone = STANDALONE.match(content)
759
+ standalone = standalone[1] unless standalone.nil?
760
+ return [ :xmldecl, version, encoding, standalone ]
585
761
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
762
+ [:processing_instruction, name, content]
587
763
  end
588
764
 
589
- def parse_attributes(prefixes, curr_ns)
765
+ def parse_attributes(prefixes)
590
766
  attributes = {}
767
+ expanded_names = {}
591
768
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
769
+ while true
770
+ if @source.match(">", true)
771
+ return attributes, closed
772
+ elsif @source.match("/>", true)
773
+ closed = true
774
+ return attributes, closed
775
+ elsif match = @source.match(QNAME, true)
776
+ name = match[1]
777
+ prefix = match[2]
778
+ local_part = match[3]
779
+
780
+ unless @source.match(/\s*=\s*/um, true)
618
781
  message = "Missing attribute equal: <#{name}>"
619
782
  raise REXML::ParseException.new(message, @source)
620
783
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
784
+ unless match = @source.match(/(['"])/, true)
623
785
  message = "Missing attribute value start quote: <#{name}>"
624
786
  raise REXML::ParseException.new(message, @source)
625
787
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
788
+ quote = match[1]
789
+ start_position = @source.position
790
+ value = @source.read_until(quote)
791
+ unless value.chomp!(quote)
792
+ @source.position = start_position
793
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
794
  raise REXML::ParseException.new(message, @source)
639
795
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
796
+ @source.match(/\s*/um, true)
797
+ if prefix == "xmlns"
798
+ if local_part == "xml"
799
+ if value != Private::XML_PREFIXED_NAMESPACE
800
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
801
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
802
+ raise REXML::ParseException.new( msg, @source, self )
803
+ end
804
+ elsif local_part == "xmlns"
805
+ msg = "The 'xmlns' prefix must not be declared "+
650
806
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
807
+ raise REXML::ParseException.new( msg, @source, self)
652
808
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
809
+ add_namespace(local_part, value)
810
+ elsif prefix
811
+ prefixes << prefix unless prefix == "xml"
657
812
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
813
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
814
+ if attributes[name]
815
+ msg = "Duplicate attribute #{name.inspect}"
816
+ raise REXML::ParseException.new(msg, @source, self)
817
+ end
667
818
 
668
- attributes[name] = value
819
+ unless prefix == "xmlns"
820
+ uri = @namespaces[prefix]
821
+ expanded_name = [uri, local_part]
822
+ existing_prefix = expanded_names[expanded_name]
823
+ if existing_prefix
824
+ message = "Namespace conflict in adding attribute " +
825
+ "\"#{local_part}\": " +
826
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
827
+ "prefix \"#{prefix}\" = \"#{uri}\""
828
+ raise REXML::ParseException.new(message, @source, self)
829
+ end
830
+ expanded_names[expanded_name] = prefix
831
+ end
832
+
833
+ attributes[name] = value
834
+ else
835
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
836
+ raise REXML::ParseException.new(message, @source)
837
+ end
669
838
  end
670
- return attributes, closed
671
839
  end
672
840
  end
673
841
  end