rexml 3.2.6 → 3.4.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,36 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ EQUAL_PATTERN = /\s*=\s*/um
148
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
149
+ NAME_PATTERN = /#{NAME}/um
150
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
151
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
152
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
153
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
154
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
155
+ DEFAULT_ENTITIES_PATTERNS = {}
156
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
157
+ default_entities.each do |term|
158
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
159
+ end
160
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
161
+ end
162
+ private_constant :Private
163
+
115
164
  def initialize( source )
116
165
  self.stream = source
117
166
  @listeners = []
167
+ @prefixes = Set.new
168
+ @entity_expansion_count = 0
169
+ @entity_expansion_limit = Security.entity_expansion_limit
170
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
171
+ @source.ensure_buffer
172
+ @version = nil
118
173
  end
119
174
 
120
175
  def add_listener( listener )
@@ -122,15 +177,24 @@ module REXML
122
177
  end
123
178
 
124
179
  attr_reader :source
180
+ attr_reader :entity_expansion_count
181
+ attr_writer :entity_expansion_limit
182
+ attr_writer :entity_expansion_text_limit
125
183
 
126
184
  def stream=( source )
127
185
  @source = SourceFactory.create_from( source )
186
+ reset
187
+ end
188
+
189
+ def reset
128
190
  @closed = nil
191
+ @have_root = false
129
192
  @document_status = nil
130
193
  @tags = []
131
194
  @stack = []
132
195
  @entities = []
133
- @nsstack = []
196
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
197
+ @namespaces_restore_stack = []
134
198
  end
135
199
 
136
200
  def position
@@ -144,12 +208,12 @@ module REXML
144
208
 
145
209
  # Returns true if there are no more events
146
210
  def empty?
147
- return (@source.empty? and @stack.empty?)
211
+ (@source.empty? and @stack.empty?)
148
212
  end
149
213
 
150
214
  # Returns true if there are more events. Synonymous with !empty?
151
215
  def has_next?
152
- return !(@source.empty? and @stack.empty?)
216
+ !(@source.empty? and @stack.empty?)
153
217
  end
154
218
 
155
219
  # Push an event back on the head of the stream. This method
@@ -180,6 +244,8 @@ module REXML
180
244
 
181
245
  # Returns the next event. This is a +PullEvent+ object.
182
246
  def pull
247
+ @source.drop_parsed_content
248
+
183
249
  pull_event.tap do |event|
184
250
  @listeners.each do |listener|
185
251
  listener.receive event
@@ -192,236 +258,263 @@ module REXML
192
258
  x, @closed = @closed, nil
193
259
  return [ :end_element, x ]
194
260
  end
195
- return [ :end_document ] if empty?
261
+ if empty?
262
+ if @document_status == :in_doctype
263
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
264
+ end
265
+ unless @tags.empty?
266
+ path = "/" + @tags.join("/")
267
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
268
+ end
269
+ return [ :end_document ]
270
+ end
196
271
  return @stack.shift if @stack.size > 0
197
272
  #STDERR.puts @source.encoding
198
273
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
274
+
275
+ @source.ensure_buffer
199
276
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
277
+ start_position = @source.position
278
+ if @source.match?("<?", true)
223
279
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
280
+ elsif @source.match?("<!", true)
281
+ if @source.match?("--", true)
282
+ return [ :comment, process_comment ]
283
+ elsif @source.match?("DOCTYPE", true)
284
+ base_error_message = "Malformed DOCTYPE"
285
+ unless @source.skip_spaces
286
+ if @source.match?(">")
287
+ message = "#{base_error_message}: name is missing"
288
+ else
289
+ message = "#{base_error_message}: invalid name"
290
+ end
291
+ @source.position = start_position
292
+ raise REXML::ParseException.new(message, @source)
242
293
  end
243
- if @source.match(/\A\s*\[/um, true)
294
+ name = parse_name(base_error_message)
295
+ @source.skip_spaces
296
+ if @source.match?("[", true)
297
+ id = [nil, nil, nil]
244
298
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
299
+ elsif @source.match?(">", true)
300
+ id = [nil, nil, nil]
246
301
  @document_status = :after_doctype
302
+ @source.ensure_buffer
247
303
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
304
+ id = parse_id(base_error_message,
305
+ accept_external_id: true,
306
+ accept_public_id: false)
307
+ if id[0] == "SYSTEM"
308
+ # For backward compatibility
309
+ id[1], id[2] = id[2], nil
310
+ end
311
+ @source.skip_spaces
312
+ if @source.match?("[", true)
313
+ @document_status = :in_doctype
314
+ elsif @source.match?(">", true)
315
+ @document_status = :after_doctype
316
+ @source.ensure_buffer
317
+ else
318
+ message = "#{base_error_message}: garbage after external ID"
319
+ raise REXML::ParseException.new(message, @source)
320
+ end
250
321
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
322
+ args = [:start_doctype, name, *id]
323
+ if @document_status == :after_doctype
324
+ @source.skip_spaces
325
+ @stack << [ :end_doctype ]
326
+ end
327
+ return args
328
+ else
329
+ message = "Invalid XML"
330
+ raise REXML::ParseException.new(message, @source)
263
331
  end
264
332
  end
265
333
  end
266
334
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
335
+ @source.skip_spaces
336
+ start_position = @source.position
337
+ if @source.match?("<!", true)
338
+ if @source.match?("ELEMENT", true)
339
+ md = @source.match(/(.*?)>/um, true)
340
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
341
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
342
+ elsif @source.match?("ENTITY", true)
343
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
344
+ unless match_data
345
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
346
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
347
+ match = [:entitydecl, *match_data.captures.compact]
348
+ ref = false
349
+ if match[1] == '%'
350
+ ref = true
351
+ match.delete_at 1
352
+ end
353
+ # Now we have to sort out what kind of entity reference this is
354
+ if match[2] == 'SYSTEM'
355
+ # External reference
356
+ match[3] = match[3][1..-2] # PUBID
357
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
358
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
359
+ elsif match[2] == 'PUBLIC'
360
+ # External reference
361
+ match[3] = match[3][1..-2] # PUBID
362
+ match[4] = match[4][1..-2] # HREF
363
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
364
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
365
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
366
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
367
  else
329
- message = "#{base_error_message}: invalid declaration name"
368
+ match[2] = match[2][1..-2]
369
+ match.pop if match.size == 4
370
+ # match is [ :entity, name, value ]
330
371
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
372
+ match << '%' if ref
373
+ return match
374
+ elsif @source.match?("ATTLIST", true)
375
+ md = @source.match(Private::ATTLISTDECL_END, true)
376
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
377
+ element = md[1]
378
+ contents = "<!ATTLIST" + md[0]
379
+
380
+ pairs = {}
381
+ values = md[0].strip.scan( ATTDEF_RE )
382
+ values.each do |attdef|
383
+ unless attdef[3] == "#IMPLIED"
384
+ attdef.compact!
385
+ val = attdef[3]
386
+ val = attdef[4] if val == "#FIXED "
387
+ pairs[attdef[0]] = val
388
+ if attdef[0] =~ /^xmlns:(.*)/
389
+ @namespaces[$1] = val
390
+ end
391
+ end
392
+ end
393
+ return [ :attlistdecl, element, pairs, contents ]
394
+ elsif @source.match?("NOTATION", true)
395
+ base_error_message = "Malformed notation declaration"
396
+ unless @source.skip_spaces
397
+ if @source.match?(">")
398
+ message = "#{base_error_message}: name is missing"
399
+ else
400
+ message = "#{base_error_message}: invalid name"
401
+ end
402
+ @source.position = start_position
403
+ raise REXML::ParseException.new(message, @source)
404
+ end
405
+ name = parse_name(base_error_message)
406
+ id = parse_id(base_error_message,
407
+ accept_external_id: true,
408
+ accept_public_id: true)
409
+ @source.skip_spaces
410
+ unless @source.match?(">", true)
411
+ message = "#{base_error_message}: garbage before end >"
412
+ raise REXML::ParseException.new(message, @source)
413
+ end
414
+ return [:notationdecl, name, *id]
415
+ elsif @source.match?("--", true)
416
+ return [ :comment, process_comment ]
417
+ else
418
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor ELEMENT,ENTITY,ATTLIST,NOTATION", @source)
340
419
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
420
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
421
+ return [ :externalentity, match[1] ]
422
+ elsif @source.match?(/\]\s*>/um, true)
343
423
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
424
  return [ :end_doctype ]
425
+ else
426
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
346
427
  end
347
428
  end
348
429
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
430
+ @source.skip_spaces
350
431
  end
351
432
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
433
+ start_position = @source.position
434
+ if @source.match?("<", true)
435
+ # :text's read_until may leave only "<" in the buffer. In that
436
+ # case, the buffer is empty here, so we need to fill it
437
+ # here explicitly.
438
+ @source.ensure_buffer
439
+ if @source.match?("/", true)
440
+ @namespaces_restore_stack.pop
356
441
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
442
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
443
  if md and !last_tag
359
444
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
445
  raise REXML::ParseException.new(message, @source)
361
446
  end
362
447
  if md.nil? or last_tag != md[1]
363
448
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
449
+ message += " (got '#{md[1]}')" if md
450
+ @source.position = start_position if md.nil?
365
451
  raise REXML::ParseException.new(message, @source)
366
452
  end
367
453
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
454
+ elsif @source.match?("!", true)
370
455
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
- raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
374
-
375
- case md[1]
376
- when /--/, /-\z/
377
- raise REXML::ParseException.new("Malformed comment", @source)
456
+ if @source.match?("--", true)
457
+ return [ :comment, process_comment ]
458
+ elsif @source.match?("[CDATA[", true)
459
+ text = @source.read_until("]]>")
460
+ if text.chomp!("]]>")
461
+ return [ :cdata, text ]
462
+ else
463
+ raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source)
378
464
  end
379
-
380
- return [ :comment, md[1] ] if md
381
465
  else
382
- md = @source.match( CDATA_PATTERN, true )
383
- return [ :cdata, md[1] ] if md
466
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor CDATA", @source)
384
467
  end
385
- raise REXML::ParseException.new( "Declarations can only occur "+
386
- "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
468
+ elsif @source.match?("?", true)
388
469
  return process_instruction
389
470
  else
390
471
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
472
+ md = @source.match(Private::TAG_PATTERN, true)
392
473
  unless md
474
+ @source.position = start_position
393
475
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
476
  end
477
+ tag = md[1]
395
478
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
479
+ @prefixes.clear
480
+ @prefixes << md[2] if md[2]
481
+ push_namespaces_restore
482
+ attributes, closed = parse_attributes(@prefixes)
400
483
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
484
+ for prefix in @prefixes
485
+ unless @namespaces.key?(prefix)
403
486
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
487
  end
405
488
  end
406
489
 
407
490
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
491
+ @closed = tag
492
+ pop_namespaces_restore
410
493
  else
411
- @tags.push( md[1] )
494
+ if @tags.empty? and @have_root
495
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
496
+ end
497
+ @tags.push( tag )
412
498
  end
413
- return [ :start_element, md[1], attributes ]
499
+ @have_root = true
500
+ return [ :start_element, tag, attributes ]
414
501
  end
415
502
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
503
+ text = @source.read_until("<")
504
+ if text.chomp!("<")
505
+ @source.position -= "<".bytesize
506
+ end
507
+ if @tags.empty?
508
+ unless /\A\s*\z/.match?(text)
509
+ if @have_root
510
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
511
+ else
512
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
513
+ end
514
+ end
515
+ return pull_event if @have_root
419
516
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
517
+ return [ :text, text ]
425
518
  end
426
519
  rescue REXML::UndefinedNamespaceException
427
520
  raise
@@ -431,18 +524,19 @@ module REXML
431
524
  raise REXML::ParseException.new( "Exception parsing",
432
525
  @source, self, (error ? error : $!) )
433
526
  end
434
- return [ :dummy ]
527
+ # NOTE: The end of the method never runs, because it is unreachable.
528
+ # All branches of code above have explicit unconditional return or raise statements.
435
529
  end
436
530
  private :pull_event
437
531
 
438
532
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
533
+ return unless entities
534
+
535
+ value = entities[ reference ]
536
+ return if value.nil?
537
+
538
+ record_entity_expansion
539
+ unnormalize( value, entities )
446
540
  end
447
541
 
448
542
  # Escapes all possible entities
@@ -463,52 +557,108 @@ module REXML
463
557
 
464
558
  # Unescapes all possible entities
465
559
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
560
+ if string.include?("\r")
561
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
562
+ else
563
+ rv = string.dup
564
+ end
468
565
  matches = rv.scan( REFERENCE_RE )
469
566
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
567
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
568
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
569
+ if m.start_with?("x")
570
+ code_point = Integer(m[1..-1], 16)
571
+ else
572
+ code_point = Integer(m, 10)
573
+ end
574
+ [code_point].pack('U*')
474
575
  }
475
576
  matches.collect!{|x|x[0]}.compact!
577
+ if filter
578
+ matches.reject! do |entity_reference|
579
+ filter.include?(entity_reference)
580
+ end
581
+ end
476
582
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
583
+ matches.tally.each do |entity_reference, n|
584
+ entity_expansion_count_before = @entity_expansion_count
585
+ entity_value = entity( entity_reference, entities )
586
+ if entity_value
587
+ if n > 1
588
+ entity_expansion_count_delta =
589
+ @entity_expansion_count - entity_expansion_count_before
590
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
591
+ end
592
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
593
+ rv.gsub!( re, entity_value )
594
+ if rv.bytesize > @entity_expansion_text_limit
595
+ raise "entity expansion has grown too large"
486
596
  end
597
+ else
598
+ er = DEFAULT_ENTITIES[entity_reference]
599
+ rv.gsub!( er[0], er[2] ) if er
487
600
  end
488
601
  end
489
- rv.gsub!( /&amp;/, '&' )
602
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
603
  end
491
604
  rv
492
605
  end
493
606
 
494
607
  private
608
+ def add_namespace(prefix, uri)
609
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
610
+ if uri.nil?
611
+ @namespaces.delete(prefix)
612
+ else
613
+ @namespaces[prefix] = uri
614
+ end
615
+ end
616
+
617
+ def push_namespaces_restore
618
+ namespaces_restore = {}
619
+ @namespaces_restore_stack.push(namespaces_restore)
620
+ namespaces_restore
621
+ end
622
+
623
+ def pop_namespaces_restore
624
+ namespaces_restore = @namespaces_restore_stack.pop
625
+ namespaces_restore.each do |prefix, uri|
626
+ if uri.nil?
627
+ @namespaces.delete(prefix)
628
+ else
629
+ @namespaces[prefix] = uri
630
+ end
631
+ end
632
+ end
633
+
634
+ def record_entity_expansion(delta=1)
635
+ @entity_expansion_count += delta
636
+ if @entity_expansion_count > @entity_expansion_limit
637
+ raise "number of entity expansions exceeded, processing aborted."
638
+ end
639
+ end
640
+
495
641
  def need_source_encoding_update?(xml_declaration_encoding)
496
642
  return false if xml_declaration_encoding.nil?
497
643
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
498
644
  true
499
645
  end
500
646
 
647
+ def normalize_xml_declaration_encoding(xml_declaration_encoding)
648
+ /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil
649
+ end
650
+
501
651
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
652
+ md = @source.match(Private::NAME_PATTERN, true)
503
653
  unless md
504
- if @source.match(/\A\s*\S/um)
654
+ if @source.match?(/\S/um)
505
655
  message = "#{base_error_message}: invalid name"
506
656
  else
507
657
  message = "#{base_error_message}: name is missing"
508
658
  end
509
659
  raise REXML::ParseException.new(message, @source)
510
660
  end
511
- md[1]
661
+ md[0]
512
662
  end
513
663
 
514
664
  def parse_id(base_error_message,
@@ -543,131 +693,231 @@ module REXML
543
693
  accept_public_id:)
544
694
  public = /\A\s*PUBLIC/um
545
695
  system = /\A\s*SYSTEM/um
546
- if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
547
- if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
696
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
697
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
548
698
  return "public ID literal is missing"
549
699
  end
550
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
700
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
551
701
  return "invalid public ID literal"
552
702
  end
553
703
  if accept_public_id
554
- if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
704
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
555
705
  return "system ID literal is missing"
556
706
  end
557
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
707
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
558
708
  return "invalid system literal"
559
709
  end
560
710
  "garbage after system literal"
561
711
  else
562
712
  "garbage after public ID literal"
563
713
  end
564
- elsif accept_external_id and @source.match(/#{system}/um)
565
- if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
714
+ elsif accept_external_id and @source.match?(/#{system}/um)
715
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
566
716
  return "system literal is missing"
567
717
  end
568
- unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
718
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
569
719
  return "invalid system literal"
570
720
  end
571
721
  "garbage after system literal"
572
722
  else
573
- unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
723
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
574
724
  return "invalid ID type"
575
725
  end
576
726
  "ID type is missing"
577
727
  end
578
728
  end
579
729
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
730
# Reads a comment body from @source up to and including the closing "-->"
# and returns the body without that terminator.
#
# Raises REXML::ParseException when the terminator is absent, or when the
# body contains "--" or ends with "-".
def process_comment
  body = @source.read_until("-->")
  # chomp! yields nil when there was no trailing "-->" to strip.
  if body.chomp!("-->").nil?
    raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source)
  end

  malformed = body.end_with?("-") || body.include?("--")
  raise REXML::ParseException.new("Malformed comment", @source) if malformed

  body
end
597
741
 
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
742
# Parses what follows "<?" in @source.
#
# A target named exactly "xml" is handed to #xml_declaration. Any other
# target yields [:processing_instruction, name, content], where content is
# nil when "?>" follows the name directly and otherwise the text after the
# separating spaces, up to but excluding "?>".
#
# Raises ParseException when the closing "?>" cannot be found.
def process_instruction
  name = parse_name("Malformed XML: Invalid processing instruction node")
  return xml_declaration if name == "xml"

  # PITarget
  if @source.skip_spaces # e.g. <?name content?>
    rewind_to = @source.position
    content = @source.read_until("?>")
    if content.chomp!("?>").nil?
      # Put the position back before raising so the source passed to the
      # exception is at the start of the unterminated content.
      @source.position = rewind_to
      raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
    end
  else # e.g. <?name?>
    unless @source.match?("?>", true)
      raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
    end
    content = nil
  end

  [:processing_instruction, name, content]
end
602
763
 
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
764
# Parses the remainder of an XML declaration (the part after "<?xml") and
# returns [:xmldecl, version, encoding, standalone].
#
# The version is stored in @version. When an encoding pseudo-attribute is
# present and #need_source_encoding_update? accepts it, it is also applied
# to @source. When none is present, the reported encoding comes from
# #normalize_xml_declaration_encoding applied to @source.encoding.
#
# Raises ParseException when the declaration is duplicated, does not come
# first, lacks the space or the "version" keyword, has a standalone value
# other than "yes"/"no", or is not closed by "?>".
def xml_declaration
  unless @version.nil?
    raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
  end
  if @document_status
    raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
  end
  unless @source.skip_spaces
    raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
  end
  unless @source.match?("version", true)
    raise ParseException.new("Malformed XML: XML declaration misses version", @source)
  end

  @version = parse_attribute_value_with_equal("xml")
  encoding = nil
  standalone = nil

  # No space after the version value: only "?>" may follow.
  unless @source.skip_spaces
    unless @source.match?("?>", true)
      raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
    end
    # e.g. <?xml version="1.0"?>
    return [ :xmldecl, @version, normalize_xml_declaration_encoding(@source.encoding), nil ]
  end

  if @source.match?("encoding", true)
    encoding = parse_attribute_value_with_equal("xml")
    # No space after the encoding value: only "?>" may follow.
    unless @source.skip_spaces
      unless @source.match?("?>", true)
        raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
      end
      @source.encoding = encoding if need_source_encoding_update?(encoding)
      encoding ||= normalize_xml_declaration_encoding(@source.encoding)
      # e.g. <?xml version="1.1" encoding="UTF-8"?>
      return [ :xmldecl, @version, encoding, nil ]
    end
  end

  if @source.match?("standalone", true)
    standalone = parse_attribute_value_with_equal("xml")
    unless standalone == "yes" || standalone == "no"
      raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
    end
  end

  @source.skip_spaces
  unless @source.match?("?>", true)
    raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
  end

  @source.encoding = encoding if need_source_encoding_update?(encoding)
  encoding ||= normalize_xml_declaration_encoding(@source.encoding)

  # e.g. <?xml version="1.0" ?>
  #      <?xml version="1.1" encoding="UTF-8" ?>
  #      <?xml version="1.1" standalone="yes"?>
  #      <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
  [ :xmldecl, @version, encoding, standalone ]
end
824
+
825
# Consumes one attribute-value quote character (' or ") at the current
# position of @source and returns it as a one-character String, or returns
# nil (consuming nothing) when the next character is not a quote.
#
# The byte-level implementation is selected only when strscan is at least
# version 3.1.1; older versions use the regexp-based fallback. The version
# is compared numerically, component by component: comparing the version
# strings directly is lexicographic and would, for example, sort "10.0.0"
# before "3.1.1".
if (StringScanner::Version.split(".").map(&:to_i) <=> [3, 1, 1]) < 0
  def scan_quote
    @source.match(/(['"])/, true)&.[](1)
  end
else
  def scan_quote
    case @source.peek_byte
    when 34 # '"'.ord
      @source.scan_byte
      '"'
    when 39 # "'".ord
      @source.scan_byte
      "'"
    end
  end
end
843
+
844
# Consumes `= "value"` (or single-quoted) from @source and returns the
# text between the quotes. +name+ is only used in error messages.
#
# Raises REXML::ParseException when the equal sign, the opening quote or
# the matching closing quote is missing.
def parse_attribute_value_with_equal(name)
  unless @source.match?(Private::EQUAL_PATTERN, true)
    raise REXML::ParseException.new("Missing attribute equal: <#{name}>", @source)
  end

  quote = scan_quote
  unless quote
    raise REXML::ParseException.new("Missing attribute value start quote: <#{name}>", @source)
  end

  rewind_to = @source.position
  value = @source.read_until(quote)
  # chomp! returns the string itself (even when empty) once the closing
  # quote has been stripped, and nil when there was no closing quote.
  return value if value.chomp!(quote)

  # Put the position back before raising so the source passed to the
  # exception is at the start of the unterminated value.
  @source.position = rewind_to
  raise REXML::ParseException.new("Missing attribute value end quote: <#{name}>: <#{quote}>", @source)
end
862
+
863
# Parses the attribute list of a start tag from @source, through the
# closing ">" or "/>".
#
# Returns [attributes, closed]: attributes maps each qualified attribute
# name to its value, and closed is true when the tag ended with "/>".
#
# Side effects visible here:
# * every xmlns:NAME="URI" attribute is registered via #add_namespace;
# * every other attribute prefix except "xml" is appended to +prefixes+.
#
# Raises REXML::ParseException for an invalid attribute name, an illegal
# binding of the "xml"/"xmlns" prefixes, a duplicate attribute name, or two
# attributes that share the same [namespace URI, local name] pair.
def parse_attributes(prefixes)
  attributes = {}
  expanded_names = {}

  until @source.match?(">", true)
    return attributes, true if @source.match?("/>", true)

    match = @source.match(QNAME, true)
    unless match
      message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
      raise REXML::ParseException.new(message, @source)
    end

    # Read the captures before scanning any further.
    # NOTE(review): presumably later scans may invalidate +match+ - confirm.
    name = match[1]
    prefix = match[2]
    local_part = match[3]
    value = parse_attribute_value_with_equal(name)
    @source.skip_spaces

    if prefix == "xmlns"
      case local_part
      when "xml"
        if value != Private::XML_PREFIXED_NAMESPACE
          msg = "The 'xml' prefix must not be bound to any other namespace "+
            "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
          raise REXML::ParseException.new( msg, @source, self )
        end
      when "xmlns"
        msg = "The 'xmlns' prefix must not be declared "+
          "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new( msg, @source, self)
      end
      add_namespace(local_part, value)
    elsif prefix && prefix != "xml"
      prefixes << prefix
    end

    if attributes[name]
      raise REXML::ParseException.new("Duplicate attribute #{name.inspect}", @source, self)
    end

    # Namespace declarations themselves are exempt from the
    # expanded-name uniqueness check.
    unless prefix == "xmlns"
      uri = @namespaces[prefix]
      expanded_name = [uri, local_part]
      if (existing_prefix = expanded_names[expanded_name])
        message = "Namespace conflict in adding attribute " +
                  "\"#{local_part}\": " +
                  "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
                  "prefix \"#{prefix}\" = \"#{uri}\""
        raise REXML::ParseException.new(message, @source, self)
      end
      expanded_names[expanded_name] = prefix
    end

    attributes[name] = value
  end

  return attributes, false
end
672
922
  end
673
923
  end