rexml 3.2.6 → 3.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,32 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ end
160
+ private_constant :Private
161
+
115
162
  def initialize( source )
116
163
  self.stream = source
117
164
  @listeners = []
165
+ @prefixes = Set.new
166
+ @entity_expansion_count = 0
167
+ @entity_expansion_limit = Security.entity_expansion_limit
168
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
118
169
  end
119
170
 
120
171
  def add_listener( listener )
@@ -122,15 +173,20 @@ module REXML
122
173
  end
123
174
 
124
175
  attr_reader :source
176
+ attr_reader :entity_expansion_count
177
+ attr_writer :entity_expansion_limit
178
+ attr_writer :entity_expansion_text_limit
125
179
 
126
180
  def stream=( source )
127
181
  @source = SourceFactory.create_from( source )
128
182
  @closed = nil
183
+ @have_root = false
129
184
  @document_status = nil
130
185
  @tags = []
131
186
  @stack = []
132
187
  @entities = []
133
- @nsstack = []
188
+ @namespaces = {}
189
+ @namespaces_restore_stack = []
134
190
  end
135
191
 
136
192
  def position
@@ -180,6 +236,8 @@ module REXML
180
236
 
181
237
  # Returns the next event. This is a +PullEvent+ object.
182
238
  def pull
239
+ @source.drop_parsed_content
240
+
183
241
  pull_event.tap do |event|
184
242
  @listeners.each do |listener|
185
243
  listener.receive event
@@ -192,236 +250,274 @@ module REXML
192
250
  x, @closed = @closed, nil
193
251
  return [ :end_element, x ]
194
252
  end
195
- return [ :end_document ] if empty?
253
+ if empty?
254
+ if @document_status == :in_doctype
255
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
256
+ end
257
+ unless @tags.empty?
258
+ path = "/" + @tags.join("/")
259
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
260
+ end
261
+ return [ :end_document ]
262
+ end
196
263
  return @stack.shift if @stack.size > 0
197
264
  #STDERR.puts @source.encoding
198
265
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
266
+
267
+ @source.ensure_buffer
199
268
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
269
+ start_position = @source.position
270
+ if @source.match("<?", true)
223
271
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
272
+ elsif @source.match("<!", true)
273
+ if @source.match("--", true)
274
+ md = @source.match(/(.*?)-->/um, true)
275
+ if md.nil?
276
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
277
  end
243
- if @source.match(/\A\s*\[/um, true)
278
+ if /--|-\z/.match?(md[1])
279
+ raise REXML::ParseException.new("Malformed comment", @source)
280
+ end
281
+ return [ :comment, md[1] ]
282
+ elsif @source.match("DOCTYPE", true)
283
+ base_error_message = "Malformed DOCTYPE"
284
+ unless @source.match(/\s+/um, true)
285
+ if @source.match(">")
286
+ message = "#{base_error_message}: name is missing"
287
+ else
288
+ message = "#{base_error_message}: invalid name"
289
+ end
290
+ @source.position = start_position
291
+ raise REXML::ParseException.new(message, @source)
292
+ end
293
+ name = parse_name(base_error_message)
294
+ if @source.match(/\s*\[/um, true)
295
+ id = [nil, nil, nil]
244
296
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
297
+ elsif @source.match(/\s*>/um, true)
298
+ id = [nil, nil, nil]
246
299
  @document_status = :after_doctype
300
+ @source.ensure_buffer
247
301
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
302
+ id = parse_id(base_error_message,
303
+ accept_external_id: true,
304
+ accept_public_id: false)
305
+ if id[0] == "SYSTEM"
306
+ # For backward compatibility
307
+ id[1], id[2] = id[2], nil
308
+ end
309
+ if @source.match(/\s*\[/um, true)
310
+ @document_status = :in_doctype
311
+ elsif @source.match(/\s*>/um, true)
312
+ @document_status = :after_doctype
313
+ @source.ensure_buffer
314
+ else
315
+ message = "#{base_error_message}: garbage after external ID"
316
+ raise REXML::ParseException.new(message, @source)
317
+ end
250
318
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
319
+ args = [:start_doctype, name, *id]
320
+ if @document_status == :after_doctype
321
+ @source.match(/\s*/um, true)
322
+ @stack << [ :end_doctype ]
323
+ end
324
+ return args
325
+ else
326
+ message = "Invalid XML"
327
+ raise REXML::ParseException.new(message, @source)
263
328
  end
264
329
  end
265
330
  end
266
331
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
332
+ @source.match(/\s*/um, true) # skip spaces
333
+ start_position = @source.position
334
+ if @source.match("<!", true)
335
+ if @source.match("ELEMENT", true)
336
+ md = @source.match(/(.*?)>/um, true)
337
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
338
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
339
+ elsif @source.match("ENTITY", true)
340
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
341
+ unless match_data
342
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
343
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
344
+ match = [:entitydecl, *match_data.captures.compact]
345
+ ref = false
346
+ if match[1] == '%'
347
+ ref = true
348
+ match.delete_at 1
349
+ end
350
+ # Now we have to sort out what kind of entity reference this is
351
+ if match[2] == 'SYSTEM'
352
+ # External reference
353
+ match[3] = match[3][1..-2] # PUBID
354
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
355
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
356
+ elsif match[2] == 'PUBLIC'
357
+ # External reference
358
+ match[3] = match[3][1..-2] # PUBID
359
+ match[4] = match[4][1..-2] # HREF
360
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
361
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
362
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
363
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
364
  else
329
- message = "#{base_error_message}: invalid declaration name"
365
+ match[2] = match[2][1..-2]
366
+ match.pop if match.size == 4
367
+ # match is [ :entity, name, value ]
330
368
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
369
+ match << '%' if ref
370
+ return match
371
+ elsif @source.match("ATTLIST", true)
372
+ md = @source.match(Private::ATTLISTDECL_END, true)
373
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
374
+ element = md[1]
375
+ contents = md[0]
376
+
377
+ pairs = {}
378
+ values = md[0].strip.scan( ATTDEF_RE )
379
+ values.each do |attdef|
380
+ unless attdef[3] == "#IMPLIED"
381
+ attdef.compact!
382
+ val = attdef[3]
383
+ val = attdef[4] if val == "#FIXED "
384
+ pairs[attdef[0]] = val
385
+ if attdef[0] =~ /^xmlns:(.*)/
386
+ @namespaces[$1] = val
387
+ end
388
+ end
389
+ end
390
+ return [ :attlistdecl, element, pairs, contents ]
391
+ elsif @source.match("NOTATION", true)
392
+ base_error_message = "Malformed notation declaration"
393
+ unless @source.match(/\s+/um, true)
394
+ if @source.match(">")
395
+ message = "#{base_error_message}: name is missing"
396
+ else
397
+ message = "#{base_error_message}: invalid name"
398
+ end
399
+ @source.position = start_position
400
+ raise REXML::ParseException.new(message, @source)
401
+ end
402
+ name = parse_name(base_error_message)
403
+ id = parse_id(base_error_message,
404
+ accept_external_id: true,
405
+ accept_public_id: true)
406
+ unless @source.match(/\s*>/um, true)
407
+ message = "#{base_error_message}: garbage before end >"
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ return [:notationdecl, name, *id]
411
+ elsif md = @source.match(/--(.*?)-->/um, true)
412
+ case md[1]
413
+ when /--/, /-\z/
414
+ raise REXML::ParseException.new("Malformed comment", @source)
415
+ end
416
+ return [ :comment, md[1] ] if md
340
417
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
418
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
419
+ return [ :externalentity, match[1] ]
420
+ elsif @source.match(/\]\s*>/um, true)
343
421
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
422
  return [ :end_doctype ]
346
423
  end
424
+ if @document_status == :in_doctype
425
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
426
+ end
347
427
  end
348
428
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
429
+ @source.match(/\s*/um, true)
350
430
  end
351
431
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
432
+ start_position = @source.position
433
+ if @source.match("<", true)
434
+ # :text's read_until may remain only "<" in buffer. In the
435
+ # case, buffer is empty here. So we need to fill buffer
436
+ # here explicitly.
437
+ @source.ensure_buffer
438
+ if @source.match("/", true)
439
+ @namespaces_restore_stack.pop
356
440
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
441
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
442
  if md and !last_tag
359
443
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
444
  raise REXML::ParseException.new(message, @source)
361
445
  end
362
446
  if md.nil? or last_tag != md[1]
363
447
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
448
+ message += " (got '#{md[1]}')" if md
449
+ @source.position = start_position if md.nil?
365
450
  raise REXML::ParseException.new(message, @source)
366
451
  end
367
452
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
453
+ elsif @source.match("!", true)
454
+ md = @source.match(/([^>]*>)/um)
370
455
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
456
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
457
+ if md[0][0] == ?-
458
+ md = @source.match(/--(.*?)-->/um, true)
374
459
 
375
- case md[1]
376
- when /--/, /-\z/
460
+ if md.nil? || /--|-\z/.match?(md[1])
377
461
  raise REXML::ParseException.new("Malformed comment", @source)
378
462
  end
379
463
 
380
- return [ :comment, md[1] ] if md
464
+ return [ :comment, md[1] ]
381
465
  else
382
- md = @source.match( CDATA_PATTERN, true )
466
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
467
  return [ :cdata, md[1] ] if md
384
468
  end
385
469
  raise REXML::ParseException.new( "Declarations can only occur "+
386
470
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
471
+ elsif @source.match("?", true)
388
472
  return process_instruction
389
473
  else
390
474
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
475
+ md = @source.match(Private::TAG_PATTERN, true)
392
476
  unless md
477
+ @source.position = start_position
393
478
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
479
  end
480
+ tag = md[1]
395
481
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
482
+ @prefixes.clear
483
+ @prefixes << md[2] if md[2]
484
+ push_namespaces_restore
485
+ attributes, closed = parse_attributes(@prefixes)
400
486
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
487
+ for prefix in @prefixes
488
+ unless @namespaces.key?(prefix)
403
489
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
490
  end
405
491
  end
406
492
 
407
493
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
494
+ @closed = tag
495
+ pop_namespaces_restore
410
496
  else
411
- @tags.push( md[1] )
497
+ if @tags.empty? and @have_root
498
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
499
+ end
500
+ @tags.push( tag )
412
501
  end
413
- return [ :start_element, md[1], attributes ]
502
+ @have_root = true
503
+ return [ :start_element, tag, attributes ]
414
504
  end
415
505
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
506
+ text = @source.read_until("<")
507
+ if text.chomp!("<")
508
+ @source.position -= "<".bytesize
419
509
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
510
+ if @tags.empty?
511
+ unless /\A\s*\z/.match?(text)
512
+ if @have_root
513
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
514
+ else
515
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
516
+ end
517
+ end
518
+ return pull_event if @have_root
519
+ end
520
+ return [ :text, text ]
425
521
  end
426
522
  rescue REXML::UndefinedNamespaceException
427
523
  raise
@@ -436,13 +532,13 @@ module REXML
436
532
  private :pull_event
437
533
 
438
534
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
535
+ return unless entities
536
+
537
+ value = entities[ reference ]
538
+ return if value.nil?
539
+
540
+ record_entity_expansion
541
+ unnormalize( value, entities )
446
542
  end
447
543
 
448
544
  # Escapes all possible entities
@@ -463,35 +559,83 @@ module REXML
463
559
 
464
560
  # Unescapes all possible entities
465
561
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
562
+ if string.include?("\r")
563
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
564
+ else
565
+ rv = string.dup
566
+ end
468
567
  matches = rv.scan( REFERENCE_RE )
469
568
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
569
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
570
  m=$1
472
571
  m = "0#{m}" if m[0] == ?x
473
572
  [Integer(m)].pack('U*')
474
573
  }
475
574
  matches.collect!{|x|x[0]}.compact!
575
+ if filter
576
+ matches.reject! do |entity_reference|
577
+ filter.include?(entity_reference)
578
+ end
579
+ end
476
580
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
581
+ matches.tally.each do |entity_reference, n|
582
+ entity_expansion_count_before = @entity_expansion_count
583
+ entity_value = entity( entity_reference, entities )
584
+ if entity_value
585
+ if n > 1
586
+ entity_expansion_count_delta =
587
+ @entity_expansion_count - entity_expansion_count_before
588
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
589
+ end
590
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
591
+ rv.gsub!( re, entity_value )
592
+ if rv.bytesize > @entity_expansion_text_limit
593
+ raise "entity expansion has grown too large"
486
594
  end
595
+ else
596
+ er = DEFAULT_ENTITIES[entity_reference]
597
+ rv.gsub!( er[0], er[2] ) if er
487
598
  end
488
599
  end
489
- rv.gsub!( /&amp;/, '&' )
600
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
601
  end
491
602
  rv
492
603
  end
493
604
 
494
605
  private
606
+ def add_namespace(prefix, uri)
607
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
608
+ if uri.nil?
609
+ @namespaces.delete(prefix)
610
+ else
611
+ @namespaces[prefix] = uri
612
+ end
613
+ end
614
+
615
+ def push_namespaces_restore
616
+ namespaces_restore = {}
617
+ @namespaces_restore_stack.push(namespaces_restore)
618
+ namespaces_restore
619
+ end
620
+
621
+ def pop_namespaces_restore
622
+ namespaces_restore = @namespaces_restore_stack.pop
623
+ namespaces_restore.each do |prefix, uri|
624
+ if uri.nil?
625
+ @namespaces.delete(prefix)
626
+ else
627
+ @namespaces[prefix] = uri
628
+ end
629
+ end
630
+ end
631
+
632
+ def record_entity_expansion(delta=1)
633
+ @entity_expansion_count += delta
634
+ if @entity_expansion_count > @entity_expansion_limit
635
+ raise "number of entity expansions exceeded, processing aborted."
636
+ end
637
+ end
638
+
495
639
  def need_source_encoding_update?(xml_declaration_encoding)
496
640
  return false if xml_declaration_encoding.nil?
497
641
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +643,16 @@ module REXML
499
643
  end
500
644
 
501
645
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
646
+ md = @source.match(Private::NAME_PATTERN, true)
503
647
  unless md
504
- if @source.match(/\A\s*\S/um)
648
+ if @source.match(/\S/um)
505
649
  message = "#{base_error_message}: invalid name"
506
650
  else
507
651
  message = "#{base_error_message}: name is missing"
508
652
  end
509
653
  raise REXML::ParseException.new(message, @source)
510
654
  end
511
- md[1]
655
+ md[0]
512
656
  end
513
657
 
514
658
  def parse_id(base_error_message,
@@ -578,96 +722,114 @@ module REXML
578
722
  end
579
723
 
580
724
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
725
+ name = parse_name("Malformed XML: Invalid processing instruction node")
726
+ if @source.match(/\s+/um, true)
727
+ match_data = @source.match(/(.*?)\?>/um, true)
728
+ unless match_data
729
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
730
+ end
731
+ content = match_data[1]
732
+ else
733
+ content = nil
734
+ unless @source.match("?>", true)
735
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736
+ end
737
+ end
738
+ if name == "xml"
739
+ if @document_status
740
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
741
+ end
742
+ version = VERSION.match(content)
743
+ version = version[1] unless version.nil?
744
+ encoding = ENCODING.match(content)
745
+ encoding = encoding[1] unless encoding.nil?
746
+ if need_source_encoding_update?(encoding)
747
+ @source.encoding = encoding
748
+ end
749
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
750
+ encoding = "UTF-16"
751
+ end
752
+ standalone = STANDALONE.match(content)
753
+ standalone = standalone[1] unless standalone.nil?
754
+ return [ :xmldecl, version, encoding, standalone ]
585
755
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
756
+ [:processing_instruction, name, content]
587
757
  end
588
758
 
589
- def parse_attributes(prefixes, curr_ns)
759
+ def parse_attributes(prefixes)
590
760
  attributes = {}
761
+ expanded_names = {}
591
762
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
763
+ while true
764
+ if @source.match(">", true)
765
+ return attributes, closed
766
+ elsif @source.match("/>", true)
767
+ closed = true
768
+ return attributes, closed
769
+ elsif match = @source.match(QNAME, true)
770
+ name = match[1]
771
+ prefix = match[2]
772
+ local_part = match[3]
773
+
774
+ unless @source.match(/\s*=\s*/um, true)
618
775
  message = "Missing attribute equal: <#{name}>"
619
776
  raise REXML::ParseException.new(message, @source)
620
777
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
778
+ unless match = @source.match(/(['"])/, true)
623
779
  message = "Missing attribute value start quote: <#{name}>"
624
780
  raise REXML::ParseException.new(message, @source)
625
781
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
782
+ quote = match[1]
783
+ start_position = @source.position
784
+ value = @source.read_until(quote)
785
+ unless value.chomp!(quote)
786
+ @source.position = start_position
787
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
788
  raise REXML::ParseException.new(message, @source)
639
789
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
790
+ @source.match(/\s*/um, true)
791
+ if prefix == "xmlns"
792
+ if local_part == "xml"
793
+ if value != "http://www.w3.org/XML/1998/namespace"
794
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
795
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
796
+ raise REXML::ParseException.new( msg, @source, self )
797
+ end
798
+ elsif local_part == "xmlns"
799
+ msg = "The 'xmlns' prefix must not be declared "+
650
800
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
801
+ raise REXML::ParseException.new( msg, @source, self)
652
802
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
803
+ add_namespace(local_part, value)
804
+ elsif prefix
805
+ prefixes << prefix unless prefix == "xml"
657
806
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
807
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
808
+ if attributes[name]
809
+ msg = "Duplicate attribute #{name.inspect}"
810
+ raise REXML::ParseException.new(msg, @source, self)
811
+ end
667
812
 
668
- attributes[name] = value
813
+ unless prefix == "xmlns"
814
+ uri = @namespaces[prefix]
815
+ expanded_name = [uri, local_part]
816
+ existing_prefix = expanded_names[expanded_name]
817
+ if existing_prefix
818
+ message = "Namespace conflict in adding attribute " +
819
+ "\"#{local_part}\": " +
820
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
821
+ "prefix \"#{prefix}\" = \"#{uri}\""
822
+ raise REXML::ParseException.new(message, @source, self)
823
+ end
824
+ expanded_names[expanded_name] = prefix
825
+ end
826
+
827
+ attributes[name] = value
828
+ else
829
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
830
+ raise REXML::ParseException.new(message, @source)
831
+ end
669
832
  end
670
- return attributes, closed
671
833
  end
672
834
  end
673
835
  end