rexml 3.2.6 → 3.4.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,24 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
184
+ reset
185
+ end
186
+
187
+ def reset
128
188
  @closed = nil
189
+ @have_root = false
129
190
  @document_status = nil
130
191
  @tags = []
131
192
  @stack = []
132
193
  @entities = []
133
- @nsstack = []
194
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
195
+ @namespaces_restore_stack = []
134
196
  end
135
197
 
136
198
  def position
@@ -180,6 +242,8 @@ module REXML
180
242
 
181
243
  # Returns the next event. This is a +PullEvent+ object.
182
244
  def pull
245
+ @source.drop_parsed_content
246
+
183
247
  pull_event.tap do |event|
184
248
  @listeners.each do |listener|
185
249
  listener.receive event
@@ -192,236 +256,277 @@ module REXML
192
256
  x, @closed = @closed, nil
193
257
  return [ :end_element, x ]
194
258
  end
195
- return [ :end_document ] if empty?
259
+ if empty?
260
+ if @document_status == :in_doctype
261
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
262
+ end
263
+ unless @tags.empty?
264
+ path = "/" + @tags.join("/")
265
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
266
+ end
267
+ return [ :end_document ]
268
+ end
196
269
  return @stack.shift if @stack.size > 0
197
270
  #STDERR.puts @source.encoding
198
271
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
272
+
273
+ @source.ensure_buffer
199
274
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
275
+ start_position = @source.position
276
+ if @source.match?("<?", true)
223
277
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
278
+ elsif @source.match?("<!", true)
279
+ if @source.match?("--", true)
280
+ md = @source.match(/(.*?)-->/um, true)
281
+ if md.nil?
282
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
283
  end
243
- if @source.match(/\A\s*\[/um, true)
284
+ if /--|-\z/.match?(md[1])
285
+ raise REXML::ParseException.new("Malformed comment", @source)
286
+ end
287
+ return [ :comment, md[1] ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.match?(/\s+/um, true)
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
298
+ end
299
+ name = parse_name(base_error_message)
300
+ @source.match?(/\s*/um, true) # skip spaces
301
+ if @source.match?("[", true)
302
+ id = [nil, nil, nil]
244
303
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
304
+ elsif @source.match?(">", true)
305
+ id = [nil, nil, nil]
246
306
  @document_status = :after_doctype
307
+ @source.ensure_buffer
247
308
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
309
+ id = parse_id(base_error_message,
310
+ accept_external_id: true,
311
+ accept_public_id: false)
312
+ if id[0] == "SYSTEM"
313
+ # For backward compatibility
314
+ id[1], id[2] = id[2], nil
315
+ end
316
+ @source.match?(/\s*/um, true) # skip spaces
317
+ if @source.match?("[", true)
318
+ @document_status = :in_doctype
319
+ elsif @source.match?(">", true)
320
+ @document_status = :after_doctype
321
+ @source.ensure_buffer
322
+ else
323
+ message = "#{base_error_message}: garbage after external ID"
324
+ raise REXML::ParseException.new(message, @source)
325
+ end
250
326
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
327
+ args = [:start_doctype, name, *id]
328
+ if @document_status == :after_doctype
329
+ @source.match?(/\s*/um, true)
330
+ @stack << [ :end_doctype ]
331
+ end
332
+ return args
333
+ else
334
+ message = "Invalid XML"
335
+ raise REXML::ParseException.new(message, @source)
263
336
  end
264
337
  end
265
338
  end
266
339
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
340
+ @source.match?(/\s*/um, true) # skip spaces
341
+ start_position = @source.position
342
+ if @source.match?("<!", true)
343
+ if @source.match?("ELEMENT", true)
344
+ md = @source.match(/(.*?)>/um, true)
345
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
346
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
347
+ elsif @source.match?("ENTITY", true)
348
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
349
+ unless match_data
350
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
351
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
352
+ match = [:entitydecl, *match_data.captures.compact]
353
+ ref = false
354
+ if match[1] == '%'
355
+ ref = true
356
+ match.delete_at 1
357
+ end
358
+ # Now we have to sort out what kind of entity reference this is
359
+ if match[2] == 'SYSTEM'
360
+ # External reference
361
+ match[3] = match[3][1..-2] # PUBID
362
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
363
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
364
+ elsif match[2] == 'PUBLIC'
365
+ # External reference
366
+ match[3] = match[3][1..-2] # PUBID
367
+ match[4] = match[4][1..-2] # HREF
368
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
369
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
370
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
371
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
372
  else
329
- message = "#{base_error_message}: invalid declaration name"
373
+ match[2] = match[2][1..-2]
374
+ match.pop if match.size == 4
375
+ # match is [ :entity, name, value ]
330
376
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
377
+ match << '%' if ref
378
+ return match
379
+ elsif @source.match?("ATTLIST", true)
380
+ md = @source.match(Private::ATTLISTDECL_END, true)
381
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
382
+ element = md[1]
383
+ contents = "<!ATTLIST" + md[0]
384
+
385
+ pairs = {}
386
+ values = md[0].strip.scan( ATTDEF_RE )
387
+ values.each do |attdef|
388
+ unless attdef[3] == "#IMPLIED"
389
+ attdef.compact!
390
+ val = attdef[3]
391
+ val = attdef[4] if val == "#FIXED "
392
+ pairs[attdef[0]] = val
393
+ if attdef[0] =~ /^xmlns:(.*)/
394
+ @namespaces[$1] = val
395
+ end
396
+ end
397
+ end
398
+ return [ :attlistdecl, element, pairs, contents ]
399
+ elsif @source.match?("NOTATION", true)
400
+ base_error_message = "Malformed notation declaration"
401
+ unless @source.match?(/\s+/um, true)
402
+ if @source.match?(">")
403
+ message = "#{base_error_message}: name is missing"
404
+ else
405
+ message = "#{base_error_message}: invalid name"
406
+ end
407
+ @source.position = start_position
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ name = parse_name(base_error_message)
411
+ id = parse_id(base_error_message,
412
+ accept_external_id: true,
413
+ accept_public_id: true)
414
+ @source.match?(/\s*/um, true) # skip spaces
415
+ unless @source.match?(">", true)
416
+ message = "#{base_error_message}: garbage before end >"
417
+ raise REXML::ParseException.new(message, @source)
418
+ end
419
+ return [:notationdecl, name, *id]
420
+ elsif md = @source.match(/--(.*?)-->/um, true)
421
+ case md[1]
422
+ when /--/, /-\z/
423
+ raise REXML::ParseException.new("Malformed comment", @source)
424
+ end
425
+ return [ :comment, md[1] ] if md
340
426
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
427
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
428
+ return [ :externalentity, match[1] ]
429
+ elsif @source.match?(/\]\s*>/um, true)
343
430
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
431
  return [ :end_doctype ]
346
432
  end
433
+ if @document_status == :in_doctype
434
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
435
+ end
347
436
  end
348
437
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
438
+ @source.match?(/\s*/um, true)
350
439
  end
351
440
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
441
+ start_position = @source.position
442
+ if @source.match?("<", true)
443
+ # :text's read_until may leave only "<" in the buffer. In that
444
+ # case, the buffer is empty here, so we need to fill the buffer
445
+ # here explicitly.
446
+ @source.ensure_buffer
447
+ if @source.match?("/", true)
448
+ @namespaces_restore_stack.pop
356
449
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
450
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
451
  if md and !last_tag
359
452
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
453
  raise REXML::ParseException.new(message, @source)
361
454
  end
362
455
  if md.nil? or last_tag != md[1]
363
456
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
457
+ message += " (got '#{md[1]}')" if md
458
+ @source.position = start_position if md.nil?
365
459
  raise REXML::ParseException.new(message, @source)
366
460
  end
367
461
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
462
+ elsif @source.match?("!", true)
463
+ md = @source.match(/([^>]*>)/um)
370
464
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
465
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
466
+ if md[0][0] == ?-
467
+ md = @source.match(/--(.*?)-->/um, true)
374
468
 
375
- case md[1]
376
- when /--/, /-\z/
469
+ if md.nil? || /--|-\z/.match?(md[1])
377
470
  raise REXML::ParseException.new("Malformed comment", @source)
378
471
  end
379
472
 
380
- return [ :comment, md[1] ] if md
473
+ return [ :comment, md[1] ]
381
474
  else
382
- md = @source.match( CDATA_PATTERN, true )
475
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
476
  return [ :cdata, md[1] ] if md
384
477
  end
385
478
  raise REXML::ParseException.new( "Declarations can only occur "+
386
479
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
480
+ elsif @source.match?("?", true)
388
481
  return process_instruction
389
482
  else
390
483
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
484
+ md = @source.match(Private::TAG_PATTERN, true)
392
485
  unless md
486
+ @source.position = start_position
393
487
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
488
  end
489
+ tag = md[1]
395
490
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
491
+ @prefixes.clear
492
+ @prefixes << md[2] if md[2]
493
+ push_namespaces_restore
494
+ attributes, closed = parse_attributes(@prefixes)
400
495
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
496
+ for prefix in @prefixes
497
+ unless @namespaces.key?(prefix)
403
498
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
499
  end
405
500
  end
406
501
 
407
502
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
503
+ @closed = tag
504
+ pop_namespaces_restore
410
505
  else
411
- @tags.push( md[1] )
506
+ if @tags.empty? and @have_root
507
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
508
+ end
509
+ @tags.push( tag )
412
510
  end
413
- return [ :start_element, md[1], attributes ]
511
+ @have_root = true
512
+ return [ :start_element, tag, attributes ]
414
513
  end
415
514
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
515
+ text = @source.read_until("<")
516
+ if text.chomp!("<")
517
+ @source.position -= "<".bytesize
419
518
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
519
+ if @tags.empty?
520
+ unless /\A\s*\z/.match?(text)
521
+ if @have_root
522
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
523
+ else
524
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
525
+ end
526
+ end
527
+ return pull_event if @have_root
528
+ end
529
+ return [ :text, text ]
425
530
  end
426
531
  rescue REXML::UndefinedNamespaceException
427
532
  raise
@@ -436,13 +541,13 @@ module REXML
436
541
  private :pull_event
437
542
 
438
543
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
544
+ return unless entities
545
+
546
+ value = entities[ reference ]
547
+ return if value.nil?
548
+
549
+ record_entity_expansion
550
+ unnormalize( value, entities )
446
551
  end
447
552
 
448
553
  # Escapes all possible entities
@@ -463,35 +568,87 @@ module REXML
463
568
 
464
569
  # Unescapes all possible entities
465
570
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
571
+ if string.include?("\r")
572
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
573
+ else
574
+ rv = string.dup
575
+ end
468
576
  matches = rv.scan( REFERENCE_RE )
469
577
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
578
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
579
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
580
+ if m.start_with?("x")
581
+ code_point = Integer(m[1..-1], 16)
582
+ else
583
+ code_point = Integer(m, 10)
584
+ end
585
+ [code_point].pack('U*')
474
586
  }
475
587
  matches.collect!{|x|x[0]}.compact!
588
+ if filter
589
+ matches.reject! do |entity_reference|
590
+ filter.include?(entity_reference)
591
+ end
592
+ end
476
593
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
594
+ matches.tally.each do |entity_reference, n|
595
+ entity_expansion_count_before = @entity_expansion_count
596
+ entity_value = entity( entity_reference, entities )
597
+ if entity_value
598
+ if n > 1
599
+ entity_expansion_count_delta =
600
+ @entity_expansion_count - entity_expansion_count_before
601
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
602
+ end
603
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
604
+ rv.gsub!( re, entity_value )
605
+ if rv.bytesize > @entity_expansion_text_limit
606
+ raise "entity expansion has grown too large"
486
607
  end
608
+ else
609
+ er = DEFAULT_ENTITIES[entity_reference]
610
+ rv.gsub!( er[0], er[2] ) if er
487
611
  end
488
612
  end
489
- rv.gsub!( /&amp;/, '&' )
613
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
614
  end
491
615
  rv
492
616
  end
493
617
 
494
618
  private
619
+ def add_namespace(prefix, uri)
620
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
621
+ if uri.nil?
622
+ @namespaces.delete(prefix)
623
+ else
624
+ @namespaces[prefix] = uri
625
+ end
626
+ end
627
+
628
+ def push_namespaces_restore
629
+ namespaces_restore = {}
630
+ @namespaces_restore_stack.push(namespaces_restore)
631
+ namespaces_restore
632
+ end
633
+
634
+ def pop_namespaces_restore
635
+ namespaces_restore = @namespaces_restore_stack.pop
636
+ namespaces_restore.each do |prefix, uri|
637
+ if uri.nil?
638
+ @namespaces.delete(prefix)
639
+ else
640
+ @namespaces[prefix] = uri
641
+ end
642
+ end
643
+ end
644
+
645
+ def record_entity_expansion(delta=1)
646
+ @entity_expansion_count += delta
647
+ if @entity_expansion_count > @entity_expansion_limit
648
+ raise "number of entity expansions exceeded, processing aborted."
649
+ end
650
+ end
651
+
495
652
  def need_source_encoding_update?(xml_declaration_encoding)
496
653
  return false if xml_declaration_encoding.nil?
497
654
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +656,16 @@ module REXML
499
656
  end
500
657
 
501
658
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
659
+ md = @source.match(Private::NAME_PATTERN, true)
503
660
  unless md
504
- if @source.match(/\A\s*\S/um)
661
+ if @source.match?(/\S/um)
505
662
  message = "#{base_error_message}: invalid name"
506
663
  else
507
664
  message = "#{base_error_message}: name is missing"
508
665
  end
509
666
  raise REXML::ParseException.new(message, @source)
510
667
  end
511
- md[1]
668
+ md[0]
512
669
  end
513
670
 
514
671
  def parse_id(base_error_message,
@@ -543,34 +700,34 @@ module REXML
543
700
  accept_public_id:)
544
701
  public = /\A\s*PUBLIC/um
545
702
  system = /\A\s*SYSTEM/um
546
- if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
547
- if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
703
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
704
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
548
705
  return "public ID literal is missing"
549
706
  end
550
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
707
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
551
708
  return "invalid public ID literal"
552
709
  end
553
710
  if accept_public_id
554
- if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
711
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
555
712
  return "system ID literal is missing"
556
713
  end
557
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
714
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
558
715
  return "invalid system literal"
559
716
  end
560
717
  "garbage after system literal"
561
718
  else
562
719
  "garbage after public ID literal"
563
720
  end
564
- elsif accept_external_id and @source.match(/#{system}/um)
565
- if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
721
+ elsif accept_external_id and @source.match?(/#{system}/um)
722
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
566
723
  return "system literal is missing"
567
724
  end
568
- unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
725
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
569
726
  return "invalid system literal"
570
727
  end
571
728
  "garbage after system literal"
572
729
  else
573
- unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
730
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
574
731
  return "invalid ID type"
575
732
  end
576
733
  "ID type is missing"
@@ -578,96 +735,132 @@ module REXML
578
735
  end
579
736
 
580
737
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
738
+ name = parse_name("Malformed XML: Invalid processing instruction node")
739
+ if @source.match?(/\s+/um, true)
740
+ match_data = @source.match(/(.*?)\?>/um, true)
741
+ unless match_data
742
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
743
+ end
744
+ content = match_data[1]
745
+ else
746
+ content = nil
747
+ unless @source.match?("?>", true)
748
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
749
+ end
750
+ end
751
+ if name == "xml"
752
+ if @document_status
753
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
754
+ end
755
+ version = VERSION.match(content)
756
+ version = version[1] unless version.nil?
757
+ encoding = ENCODING.match(content)
758
+ encoding = encoding[1] unless encoding.nil?
759
+ if need_source_encoding_update?(encoding)
760
+ @source.encoding = encoding
761
+ end
762
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
763
+ encoding = "UTF-16"
764
+ end
765
+ standalone = STANDALONE.match(content)
766
+ standalone = standalone[1] unless standalone.nil?
767
+ return [ :xmldecl, version, encoding, standalone ]
585
768
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
769
+ [:processing_instruction, name, content]
587
770
  end
588
771
 
589
- def parse_attributes(prefixes, curr_ns)
590
- attributes = {}
591
- closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
772
+ if StringScanner::Version < "3.1.1"
773
+ def scan_quote
774
+ @source.match(/(['"])/, true)&.[](1)
596
775
  end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
776
+ else
777
+ def scan_quote
778
+ case @source.peek_byte
779
+ when 34 # '"'.ord
780
+ @source.scan_byte
781
+ '"'
782
+ when 39 # "'".ord
783
+ @source.scan_byte
784
+ "'"
785
+ else
786
+ nil
607
787
  end
788
+ end
789
+ end
608
790
 
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
791
+ def parse_attributes(prefixes)
792
+ attributes = {}
793
+ expanded_names = {}
794
+ closed = false
795
+ while true
796
+ if @source.match?(">", true)
797
+ return attributes, closed
798
+ elsif @source.match?("/>", true)
799
+ closed = true
800
+ return attributes, closed
801
+ elsif match = @source.match(QNAME, true)
802
+ name = match[1]
803
+ prefix = match[2]
804
+ local_part = match[3]
805
+
806
+ unless @source.match?(/\s*=\s*/um, true)
618
807
  message = "Missing attribute equal: <#{name}>"
619
808
  raise REXML::ParseException.new(message, @source)
620
809
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
810
+ unless quote = scan_quote
623
811
  message = "Missing attribute value start quote: <#{name}>"
624
812
  raise REXML::ParseException.new(message, @source)
625
813
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
814
+ start_position = @source.position
815
+ value = @source.read_until(quote)
816
+ unless value.chomp!(quote)
817
+ @source.position = start_position
818
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
819
  raise REXML::ParseException.new(message, @source)
639
820
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
821
+ @source.match?(/\s*/um, true)
822
+ if prefix == "xmlns"
823
+ if local_part == "xml"
824
+ if value != Private::XML_PREFIXED_NAMESPACE
825
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
826
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
827
+ raise REXML::ParseException.new( msg, @source, self )
828
+ end
829
+ elsif local_part == "xmlns"
830
+ msg = "The 'xmlns' prefix must not be declared "+
650
831
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
832
+ raise REXML::ParseException.new( msg, @source, self)
652
833
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
834
+ add_namespace(local_part, value)
835
+ elsif prefix
836
+ prefixes << prefix unless prefix == "xml"
657
837
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
838
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
839
+ if attributes[name]
840
+ msg = "Duplicate attribute #{name.inspect}"
841
+ raise REXML::ParseException.new(msg, @source, self)
842
+ end
667
843
 
668
- attributes[name] = value
844
+ unless prefix == "xmlns"
845
+ uri = @namespaces[prefix]
846
+ expanded_name = [uri, local_part]
847
+ existing_prefix = expanded_names[expanded_name]
848
+ if existing_prefix
849
+ message = "Namespace conflict in adding attribute " +
850
+ "\"#{local_part}\": " +
851
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
852
+ "prefix \"#{prefix}\" = \"#{uri}\""
853
+ raise REXML::ParseException.new(message, @source, self)
854
+ end
855
+ expanded_names[expanded_name] = prefix
856
+ end
857
+
858
+ attributes[name] = value
859
+ else
860
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
861
+ raise REXML::ParseException.new(message, @source)
862
+ end
669
863
  end
670
- return attributes, closed
671
864
  end
672
865
  end
673
866
  end