rexml 3.2.6 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,24 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
184
+ reset
185
+ end
186
+
187
+ def reset
128
188
  @closed = nil
189
+ @have_root = false
129
190
  @document_status = nil
130
191
  @tags = []
131
192
  @stack = []
132
193
  @entities = []
133
- @nsstack = []
194
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
195
+ @namespaces_restore_stack = []
134
196
  end
135
197
 
136
198
  def position
@@ -180,6 +242,8 @@ module REXML
180
242
 
181
243
  # Returns the next event. This is a +PullEvent+ object.
182
244
  def pull
245
+ @source.drop_parsed_content
246
+
183
247
  pull_event.tap do |event|
184
248
  @listeners.each do |listener|
185
249
  listener.receive event
@@ -192,236 +256,274 @@ module REXML
192
256
  x, @closed = @closed, nil
193
257
  return [ :end_element, x ]
194
258
  end
195
- return [ :end_document ] if empty?
259
+ if empty?
260
+ if @document_status == :in_doctype
261
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
262
+ end
263
+ unless @tags.empty?
264
+ path = "/" + @tags.join("/")
265
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
266
+ end
267
+ return [ :end_document ]
268
+ end
196
269
  return @stack.shift if @stack.size > 0
197
270
  #STDERR.puts @source.encoding
198
271
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
272
+
273
+ @source.ensure_buffer
199
274
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
275
+ start_position = @source.position
276
+ if @source.match?("<?", true)
223
277
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
278
+ elsif @source.match?("<!", true)
279
+ if @source.match?("--", true)
280
+ md = @source.match(/(.*?)-->/um, true)
281
+ if md.nil?
282
+ raise REXML::ParseException.new("Unclosed comment", @source)
283
+ end
284
+ if /--|-\z/.match?(md[1])
285
+ raise REXML::ParseException.new("Malformed comment", @source)
286
+ end
287
+ return [ :comment, md[1] ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.match?(/\s+/um, true)
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
242
298
  end
243
- if @source.match(/\A\s*\[/um, true)
299
+ name = parse_name(base_error_message)
300
+ if @source.match?(/\s*\[/um, true)
301
+ id = [nil, nil, nil]
244
302
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
303
+ elsif @source.match?(/\s*>/um, true)
304
+ id = [nil, nil, nil]
246
305
  @document_status = :after_doctype
306
+ @source.ensure_buffer
247
307
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
308
+ id = parse_id(base_error_message,
309
+ accept_external_id: true,
310
+ accept_public_id: false)
311
+ if id[0] == "SYSTEM"
312
+ # For backward compatibility
313
+ id[1], id[2] = id[2], nil
314
+ end
315
+ if @source.match?(/\s*\[/um, true)
316
+ @document_status = :in_doctype
317
+ elsif @source.match?(/\s*>/um, true)
318
+ @document_status = :after_doctype
319
+ @source.ensure_buffer
320
+ else
321
+ message = "#{base_error_message}: garbage after external ID"
322
+ raise REXML::ParseException.new(message, @source)
323
+ end
250
324
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
325
+ args = [:start_doctype, name, *id]
326
+ if @document_status == :after_doctype
327
+ @source.match?(/\s*/um, true)
328
+ @stack << [ :end_doctype ]
329
+ end
330
+ return args
331
+ else
332
+ message = "Invalid XML"
333
+ raise REXML::ParseException.new(message, @source)
263
334
  end
264
335
  end
265
336
  end
266
337
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
338
+ @source.match?(/\s*/um, true) # skip spaces
339
+ start_position = @source.position
340
+ if @source.match?("<!", true)
341
+ if @source.match?("ELEMENT", true)
342
+ md = @source.match(/(.*?)>/um, true)
343
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
344
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
345
+ elsif @source.match?("ENTITY", true)
346
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
347
+ unless match_data
348
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
349
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
350
+ match = [:entitydecl, *match_data.captures.compact]
351
+ ref = false
352
+ if match[1] == '%'
353
+ ref = true
354
+ match.delete_at 1
355
+ end
356
+ # Now we have to sort out what kind of entity reference this is
357
+ if match[2] == 'SYSTEM'
358
+ # External reference
359
+ match[3] = match[3][1..-2] # PUBID
360
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
361
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
362
+ elsif match[2] == 'PUBLIC'
363
+ # External reference
364
+ match[3] = match[3][1..-2] # PUBID
365
+ match[4] = match[4][1..-2] # HREF
366
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
367
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
368
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
369
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
370
  else
329
- message = "#{base_error_message}: invalid declaration name"
371
+ match[2] = match[2][1..-2]
372
+ match.pop if match.size == 4
373
+ # match is [ :entity, name, value ]
330
374
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
375
+ match << '%' if ref
376
+ return match
377
+ elsif @source.match?("ATTLIST", true)
378
+ md = @source.match(Private::ATTLISTDECL_END, true)
379
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
380
+ element = md[1]
381
+ contents = md[0]
382
+
383
+ pairs = {}
384
+ values = md[0].strip.scan( ATTDEF_RE )
385
+ values.each do |attdef|
386
+ unless attdef[3] == "#IMPLIED"
387
+ attdef.compact!
388
+ val = attdef[3]
389
+ val = attdef[4] if val == "#FIXED "
390
+ pairs[attdef[0]] = val
391
+ if attdef[0] =~ /^xmlns:(.*)/
392
+ @namespaces[$1] = val
393
+ end
394
+ end
395
+ end
396
+ return [ :attlistdecl, element, pairs, contents ]
397
+ elsif @source.match?("NOTATION", true)
398
+ base_error_message = "Malformed notation declaration"
399
+ unless @source.match?(/\s+/um, true)
400
+ if @source.match?(">")
401
+ message = "#{base_error_message}: name is missing"
402
+ else
403
+ message = "#{base_error_message}: invalid name"
404
+ end
405
+ @source.position = start_position
406
+ raise REXML::ParseException.new(message, @source)
407
+ end
408
+ name = parse_name(base_error_message)
409
+ id = parse_id(base_error_message,
410
+ accept_external_id: true,
411
+ accept_public_id: true)
412
+ unless @source.match?(/\s*>/um, true)
413
+ message = "#{base_error_message}: garbage before end >"
414
+ raise REXML::ParseException.new(message, @source)
415
+ end
416
+ return [:notationdecl, name, *id]
417
+ elsif md = @source.match(/--(.*?)-->/um, true)
418
+ case md[1]
419
+ when /--/, /-\z/
420
+ raise REXML::ParseException.new("Malformed comment", @source)
421
+ end
422
+ return [ :comment, md[1] ] if md
340
423
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
424
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
425
+ return [ :externalentity, match[1] ]
426
+ elsif @source.match?(/\]\s*>/um, true)
343
427
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
428
  return [ :end_doctype ]
346
429
  end
430
+ if @document_status == :in_doctype
431
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
432
+ end
347
433
  end
348
434
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
435
+ @source.match?(/\s*/um, true)
350
436
  end
351
437
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
438
+ start_position = @source.position
439
+ if @source.match?("<", true)
440
+ # :text's read_until may remain only "<" in buffer. In the
441
+ # case, buffer is empty here. So we need to fill buffer
442
+ # here explicitly.
443
+ @source.ensure_buffer
444
+ if @source.match?("/", true)
445
+ @namespaces_restore_stack.pop
356
446
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
447
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
448
  if md and !last_tag
359
449
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
450
  raise REXML::ParseException.new(message, @source)
361
451
  end
362
452
  if md.nil? or last_tag != md[1]
363
453
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
454
+ message += " (got '#{md[1]}')" if md
455
+ @source.position = start_position if md.nil?
365
456
  raise REXML::ParseException.new(message, @source)
366
457
  end
367
458
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
459
+ elsif @source.match?("!", true)
460
+ md = @source.match(/([^>]*>)/um)
370
461
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
462
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
463
+ if md[0][0] == ?-
464
+ md = @source.match(/--(.*?)-->/um, true)
374
465
 
375
- case md[1]
376
- when /--/, /-\z/
466
+ if md.nil? || /--|-\z/.match?(md[1])
377
467
  raise REXML::ParseException.new("Malformed comment", @source)
378
468
  end
379
469
 
380
- return [ :comment, md[1] ] if md
470
+ return [ :comment, md[1] ]
381
471
  else
382
- md = @source.match( CDATA_PATTERN, true )
472
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
473
  return [ :cdata, md[1] ] if md
384
474
  end
385
475
  raise REXML::ParseException.new( "Declarations can only occur "+
386
476
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
477
+ elsif @source.match?("?", true)
388
478
  return process_instruction
389
479
  else
390
480
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
481
+ md = @source.match(Private::TAG_PATTERN, true)
392
482
  unless md
483
+ @source.position = start_position
393
484
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
485
  end
486
+ tag = md[1]
395
487
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
488
+ @prefixes.clear
489
+ @prefixes << md[2] if md[2]
490
+ push_namespaces_restore
491
+ attributes, closed = parse_attributes(@prefixes)
400
492
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
493
+ for prefix in @prefixes
494
+ unless @namespaces.key?(prefix)
403
495
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
496
  end
405
497
  end
406
498
 
407
499
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
500
+ @closed = tag
501
+ pop_namespaces_restore
410
502
  else
411
- @tags.push( md[1] )
503
+ if @tags.empty? and @have_root
504
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
505
+ end
506
+ @tags.push( tag )
412
507
  end
413
- return [ :start_element, md[1], attributes ]
508
+ @have_root = true
509
+ return [ :start_element, tag, attributes ]
414
510
  end
415
511
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
512
+ text = @source.read_until("<")
513
+ if text.chomp!("<")
514
+ @source.position -= "<".bytesize
515
+ end
516
+ if @tags.empty?
517
+ unless /\A\s*\z/.match?(text)
518
+ if @have_root
519
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
520
+ else
521
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
522
+ end
523
+ end
524
+ return pull_event if @have_root
419
525
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
526
+ return [ :text, text ]
425
527
  end
426
528
  rescue REXML::UndefinedNamespaceException
427
529
  raise
@@ -436,13 +538,13 @@ module REXML
436
538
  private :pull_event
437
539
 
438
540
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
541
+ return unless entities
542
+
543
+ value = entities[ reference ]
544
+ return if value.nil?
545
+
546
+ record_entity_expansion
547
+ unnormalize( value, entities )
446
548
  end
447
549
 
448
550
  # Escapes all possible entities
@@ -463,35 +565,87 @@ module REXML
463
565
 
464
566
  # Unescapes all possible entities
465
567
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
568
+ if string.include?("\r")
569
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
570
+ else
571
+ rv = string.dup
572
+ end
468
573
  matches = rv.scan( REFERENCE_RE )
469
574
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
575
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
576
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
577
+ if m.start_with?("x")
578
+ code_point = Integer(m[1..-1], 16)
579
+ else
580
+ code_point = Integer(m, 10)
581
+ end
582
+ [code_point].pack('U*')
474
583
  }
475
584
  matches.collect!{|x|x[0]}.compact!
585
+ if filter
586
+ matches.reject! do |entity_reference|
587
+ filter.include?(entity_reference)
588
+ end
589
+ end
476
590
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
591
+ matches.tally.each do |entity_reference, n|
592
+ entity_expansion_count_before = @entity_expansion_count
593
+ entity_value = entity( entity_reference, entities )
594
+ if entity_value
595
+ if n > 1
596
+ entity_expansion_count_delta =
597
+ @entity_expansion_count - entity_expansion_count_before
598
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
486
599
  end
600
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
601
+ rv.gsub!( re, entity_value )
602
+ if rv.bytesize > @entity_expansion_text_limit
603
+ raise "entity expansion has grown too large"
604
+ end
605
+ else
606
+ er = DEFAULT_ENTITIES[entity_reference]
607
+ rv.gsub!( er[0], er[2] ) if er
487
608
  end
488
609
  end
489
- rv.gsub!( /&amp;/, '&' )
610
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
611
  end
491
612
  rv
492
613
  end
493
614
 
494
615
  private
616
+ def add_namespace(prefix, uri)
617
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
618
+ if uri.nil?
619
+ @namespaces.delete(prefix)
620
+ else
621
+ @namespaces[prefix] = uri
622
+ end
623
+ end
624
+
625
+ def push_namespaces_restore
626
+ namespaces_restore = {}
627
+ @namespaces_restore_stack.push(namespaces_restore)
628
+ namespaces_restore
629
+ end
630
+
631
+ def pop_namespaces_restore
632
+ namespaces_restore = @namespaces_restore_stack.pop
633
+ namespaces_restore.each do |prefix, uri|
634
+ if uri.nil?
635
+ @namespaces.delete(prefix)
636
+ else
637
+ @namespaces[prefix] = uri
638
+ end
639
+ end
640
+ end
641
+
642
+ def record_entity_expansion(delta=1)
643
+ @entity_expansion_count += delta
644
+ if @entity_expansion_count > @entity_expansion_limit
645
+ raise "number of entity expansions exceeded, processing aborted."
646
+ end
647
+ end
648
+
495
649
  def need_source_encoding_update?(xml_declaration_encoding)
496
650
  return false if xml_declaration_encoding.nil?
497
651
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +653,16 @@ module REXML
499
653
  end
500
654
 
501
655
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
656
+ md = @source.match(Private::NAME_PATTERN, true)
503
657
  unless md
504
- if @source.match(/\A\s*\S/um)
658
+ if @source.match?(/\S/um)
505
659
  message = "#{base_error_message}: invalid name"
506
660
  else
507
661
  message = "#{base_error_message}: name is missing"
508
662
  end
509
663
  raise REXML::ParseException.new(message, @source)
510
664
  end
511
- md[1]
665
+ md[0]
512
666
  end
513
667
 
514
668
  def parse_id(base_error_message,
@@ -543,34 +697,34 @@ module REXML
543
697
  accept_public_id:)
544
698
  public = /\A\s*PUBLIC/um
545
699
  system = /\A\s*SYSTEM/um
546
- if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
547
- if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
700
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
701
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
548
702
  return "public ID literal is missing"
549
703
  end
550
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
704
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
551
705
  return "invalid public ID literal"
552
706
  end
553
707
  if accept_public_id
554
- if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
708
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
555
709
  return "system ID literal is missing"
556
710
  end
557
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
711
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
558
712
  return "invalid system literal"
559
713
  end
560
714
  "garbage after system literal"
561
715
  else
562
716
  "garbage after public ID literal"
563
717
  end
564
- elsif accept_external_id and @source.match(/#{system}/um)
565
- if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
718
+ elsif accept_external_id and @source.match?(/#{system}/um)
719
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
566
720
  return "system literal is missing"
567
721
  end
568
- unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
722
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
569
723
  return "invalid system literal"
570
724
  end
571
725
  "garbage after system literal"
572
726
  else
573
- unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
727
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
574
728
  return "invalid ID type"
575
729
  end
576
730
  "ID type is missing"
@@ -578,96 +732,114 @@ module REXML
578
732
  end
579
733
 
580
734
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
735
+ name = parse_name("Malformed XML: Invalid processing instruction node")
736
+ if @source.match?(/\s+/um, true)
737
+ match_data = @source.match(/(.*?)\?>/um, true)
738
+ unless match_data
739
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
740
+ end
741
+ content = match_data[1]
742
+ else
743
+ content = nil
744
+ unless @source.match?("?>", true)
745
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
746
+ end
585
747
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
748
+ if name == "xml"
749
+ if @document_status
750
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
751
+ end
752
+ version = VERSION.match(content)
753
+ version = version[1] unless version.nil?
754
+ encoding = ENCODING.match(content)
755
+ encoding = encoding[1] unless encoding.nil?
756
+ if need_source_encoding_update?(encoding)
757
+ @source.encoding = encoding
758
+ end
759
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
760
+ encoding = "UTF-16"
761
+ end
762
+ standalone = STANDALONE.match(content)
763
+ standalone = standalone[1] unless standalone.nil?
764
+ return [ :xmldecl, version, encoding, standalone ]
765
+ end
766
+ [:processing_instruction, name, content]
587
767
  end
588
768
 
589
- def parse_attributes(prefixes, curr_ns)
769
+ def parse_attributes(prefixes)
590
770
  attributes = {}
771
+ expanded_names = {}
591
772
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
773
+ while true
774
+ if @source.match?(">", true)
775
+ return attributes, closed
776
+ elsif @source.match?("/>", true)
777
+ closed = true
778
+ return attributes, closed
779
+ elsif match = @source.match(QNAME, true)
780
+ name = match[1]
781
+ prefix = match[2]
782
+ local_part = match[3]
783
+
784
+ unless @source.match?(/\s*=\s*/um, true)
618
785
  message = "Missing attribute equal: <#{name}>"
619
786
  raise REXML::ParseException.new(message, @source)
620
787
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
788
+ unless match = @source.match(/(['"])/, true)
623
789
  message = "Missing attribute value start quote: <#{name}>"
624
790
  raise REXML::ParseException.new(message, @source)
625
791
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
792
+ quote = match[1]
793
+ start_position = @source.position
794
+ value = @source.read_until(quote)
795
+ unless value.chomp!(quote)
796
+ @source.position = start_position
797
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
798
  raise REXML::ParseException.new(message, @source)
639
799
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
800
+ @source.match?(/\s*/um, true)
801
+ if prefix == "xmlns"
802
+ if local_part == "xml"
803
+ if value != Private::XML_PREFIXED_NAMESPACE
804
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
805
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
806
+ raise REXML::ParseException.new( msg, @source, self )
807
+ end
808
+ elsif local_part == "xmlns"
809
+ msg = "The 'xmlns' prefix must not be declared "+
650
810
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
811
+ raise REXML::ParseException.new( msg, @source, self)
652
812
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
813
+ add_namespace(local_part, value)
814
+ elsif prefix
815
+ prefixes << prefix unless prefix == "xml"
657
816
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
817
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
818
+ if attributes[name]
819
+ msg = "Duplicate attribute #{name.inspect}"
820
+ raise REXML::ParseException.new(msg, @source, self)
821
+ end
667
822
 
668
- attributes[name] = value
823
+ unless prefix == "xmlns"
824
+ uri = @namespaces[prefix]
825
+ expanded_name = [uri, local_part]
826
+ existing_prefix = expanded_names[expanded_name]
827
+ if existing_prefix
828
+ message = "Namespace conflict in adding attribute " +
829
+ "\"#{local_part}\": " +
830
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
831
+ "prefix \"#{prefix}\" = \"#{uri}\""
832
+ raise REXML::ParseException.new(message, @source, self)
833
+ end
834
+ expanded_names[expanded_name] = prefix
835
+ end
836
+
837
+ attributes[name] = value
838
+ else
839
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
840
+ raise REXML::ParseException.new(message, @source)
841
+ end
669
842
  end
670
- return attributes, closed
671
843
  end
672
844
  end
673
845
  end