rexml 3.2.5 → 3.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,36 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ EQUAL_PATTERN = /\s*=\s*/um
148
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
149
+ NAME_PATTERN = /#{NAME}/um
150
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
151
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
152
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
153
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
154
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
155
+ DEFAULT_ENTITIES_PATTERNS = {}
156
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
157
+ default_entities.each do |term|
158
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
159
+ end
160
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
161
+ end
162
+ private_constant :Private
163
+
115
164
  def initialize( source )
116
165
  self.stream = source
117
166
  @listeners = []
167
+ @prefixes = Set.new
168
+ @entity_expansion_count = 0
169
+ @entity_expansion_limit = Security.entity_expansion_limit
170
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
171
+ @source.ensure_buffer
172
+ @version = nil
118
173
  end
119
174
 
120
175
  def add_listener( listener )
@@ -122,15 +177,24 @@ module REXML
122
177
  end
123
178
 
124
179
  attr_reader :source
180
+ attr_reader :entity_expansion_count
181
+ attr_writer :entity_expansion_limit
182
+ attr_writer :entity_expansion_text_limit
125
183
 
126
184
  def stream=( source )
127
185
  @source = SourceFactory.create_from( source )
186
+ reset
187
+ end
188
+
189
+ def reset
128
190
  @closed = nil
191
+ @have_root = false
129
192
  @document_status = nil
130
193
  @tags = []
131
194
  @stack = []
132
195
  @entities = []
133
- @nsstack = []
196
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
197
+ @namespaces_restore_stack = []
134
198
  end
135
199
 
136
200
  def position
@@ -144,12 +208,12 @@ module REXML
144
208
 
145
209
  # Returns true if there are no more events
146
210
  def empty?
147
- return (@source.empty? and @stack.empty?)
211
+ (@source.empty? and @stack.empty?)
148
212
  end
149
213
 
150
214
  # Returns true if there are more events. Synonymous with !empty?
151
215
  def has_next?
152
- return !(@source.empty? and @stack.empty?)
216
+ !(@source.empty? and @stack.empty?)
153
217
  end
154
218
 
155
219
  # Push an event back on the head of the stream. This method
@@ -180,6 +244,8 @@ module REXML
180
244
 
181
245
  # Returns the next event. This is a +PullEvent+ object.
182
246
  def pull
247
+ @source.drop_parsed_content
248
+
183
249
  pull_event.tap do |event|
184
250
  @listeners.each do |listener|
185
251
  listener.receive event
@@ -192,236 +258,268 @@ module REXML
192
258
  x, @closed = @closed, nil
193
259
  return [ :end_element, x ]
194
260
  end
195
- return [ :end_document ] if empty?
261
+ if empty?
262
+ if @document_status == :in_doctype
263
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
264
+ end
265
+ unless @tags.empty?
266
+ path = "/" + @tags.join("/")
267
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
268
+ end
269
+
270
+ unless @document_status == :in_element
271
+ raise ParseException.new("Malformed XML: No root element", @source)
272
+ end
273
+
274
+ return [ :end_document ]
275
+ end
196
276
  return @stack.shift if @stack.size > 0
197
277
  #STDERR.puts @source.encoding
198
278
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
279
+
280
+ @source.ensure_buffer
199
281
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
282
+ start_position = @source.position
283
+ if @source.match?("<?", true)
223
284
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
285
+ elsif @source.match?("<!", true)
286
+ if @source.match?("--", true)
287
+ return [ :comment, process_comment ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.skip_spaces
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
242
298
  end
243
- if @source.match(/\A\s*\[/um, true)
299
+ name = parse_name(base_error_message)
300
+ @source.skip_spaces
301
+ if @source.match?("[", true)
302
+ id = [nil, nil, nil]
244
303
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
304
+ elsif @source.match?(">", true)
305
+ id = [nil, nil, nil]
246
306
  @document_status = :after_doctype
307
+ @source.ensure_buffer
247
308
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
309
+ id = parse_id(base_error_message,
310
+ accept_external_id: true,
311
+ accept_public_id: false)
312
+ if id[0] == "SYSTEM"
313
+ # For backward compatibility
314
+ id[1], id[2] = id[2], nil
315
+ end
316
+ @source.skip_spaces
317
+ if @source.match?("[", true)
318
+ @document_status = :in_doctype
319
+ elsif @source.match?(">", true)
320
+ @document_status = :after_doctype
321
+ @source.ensure_buffer
322
+ else
323
+ message = "#{base_error_message}: garbage after external ID"
324
+ raise REXML::ParseException.new(message, @source)
325
+ end
250
326
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
327
+ args = [:start_doctype, name, *id]
328
+ if @document_status == :after_doctype
329
+ @source.skip_spaces
330
+ @stack << [ :end_doctype ]
331
+ end
332
+ return args
333
+ else
334
+ message = "Invalid XML"
335
+ raise REXML::ParseException.new(message, @source)
263
336
  end
264
337
  end
265
338
  end
266
339
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
340
+ @source.skip_spaces
341
+ start_position = @source.position
342
+ if @source.match?("<!", true)
343
+ if @source.match?("ELEMENT", true)
344
+ md = @source.match(/(.*?)>/um, true)
345
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
346
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
347
+ elsif @source.match?("ENTITY", true)
348
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
349
+ unless match_data
350
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
351
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
352
+ match = [:entitydecl, *match_data.captures.compact]
353
+ ref = false
354
+ if match[1] == '%'
355
+ ref = true
356
+ match.delete_at 1
357
+ end
358
+ # Now we have to sort out what kind of entity reference this is
359
+ if match[2] == 'SYSTEM'
360
+ # External reference
361
+ match[3] = match[3][1..-2] # PUBID
362
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
363
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
364
+ elsif match[2] == 'PUBLIC'
365
+ # External reference
366
+ match[3] = match[3][1..-2] # PUBID
367
+ match[4] = match[4][1..-2] # HREF
368
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
369
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
370
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
371
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
372
  else
329
- message = "#{base_error_message}: invalid declaration name"
373
+ match[2] = match[2][1..-2]
374
+ match.pop if match.size == 4
375
+ # match is [ :entity, name, value ]
330
376
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
377
+ match << '%' if ref
378
+ return match
379
+ elsif @source.match?("ATTLIST", true)
380
+ md = @source.match(Private::ATTLISTDECL_END, true)
381
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
382
+ element = md[1]
383
+ contents = "<!ATTLIST" + md[0]
384
+
385
+ pairs = {}
386
+ values = md[0].strip.scan( ATTDEF_RE )
387
+ values.each do |attdef|
388
+ unless attdef[3] == "#IMPLIED"
389
+ attdef.compact!
390
+ val = attdef[3]
391
+ val = attdef[4] if val == "#FIXED "
392
+ pairs[attdef[0]] = val
393
+ if attdef[0] =~ /^xmlns:(.*)/
394
+ @namespaces[$1] = val
395
+ end
396
+ end
397
+ end
398
+ return [ :attlistdecl, element, pairs, contents ]
399
+ elsif @source.match?("NOTATION", true)
400
+ base_error_message = "Malformed notation declaration"
401
+ unless @source.skip_spaces
402
+ if @source.match?(">")
403
+ message = "#{base_error_message}: name is missing"
404
+ else
405
+ message = "#{base_error_message}: invalid name"
406
+ end
407
+ @source.position = start_position
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ name = parse_name(base_error_message)
411
+ id = parse_id(base_error_message,
412
+ accept_external_id: true,
413
+ accept_public_id: true)
414
+ @source.skip_spaces
415
+ unless @source.match?(">", true)
416
+ message = "#{base_error_message}: garbage before end >"
417
+ raise REXML::ParseException.new(message, @source)
418
+ end
419
+ return [:notationdecl, name, *id]
420
+ elsif @source.match?("--", true)
421
+ return [ :comment, process_comment ]
422
+ else
423
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor ELEMENT,ENTITY,ATTLIST,NOTATION", @source)
340
424
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
425
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
426
+ return [ :externalentity, match[1] ]
427
+ elsif @source.match?(/\]\s*>/um, true)
343
428
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
429
  return [ :end_doctype ]
430
+ else
431
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
346
432
  end
347
433
  end
348
434
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
435
+ @source.skip_spaces
350
436
  end
351
437
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
438
+ start_position = @source.position
439
+ if @source.match?("<", true)
440
+ # :text's read_until may remain only "<" in buffer. In the
441
+ # case, buffer is empty here. So we need to fill buffer
442
+ # here explicitly.
443
+ @source.ensure_buffer
444
+ if @source.match?("/", true)
445
+ @namespaces_restore_stack.pop
356
446
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
447
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
448
  if md and !last_tag
359
449
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
450
  raise REXML::ParseException.new(message, @source)
361
451
  end
362
452
  if md.nil? or last_tag != md[1]
363
453
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
454
+ message += " (got '#{md[1]}')" if md
455
+ @source.position = start_position if md.nil?
365
456
  raise REXML::ParseException.new(message, @source)
366
457
  end
367
458
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
459
+ elsif @source.match?("!", true)
370
460
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
- raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
374
-
375
- case md[1]
376
- when /--/, /-\z/
377
- raise REXML::ParseException.new("Malformed comment", @source)
461
+ if @source.match?("--", true)
462
+ return [ :comment, process_comment ]
463
+ elsif @source.match?("[CDATA[", true)
464
+ text = @source.read_until("]]>")
465
+ if text.chomp!("]]>")
466
+ return [ :cdata, text ]
467
+ else
468
+ raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source)
378
469
  end
379
-
380
- return [ :comment, md[1] ] if md
381
470
  else
382
- md = @source.match( CDATA_PATTERN, true )
383
- return [ :cdata, md[1] ] if md
471
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor CDATA", @source)
384
472
  end
385
- raise REXML::ParseException.new( "Declarations can only occur "+
386
- "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
473
+ elsif @source.match?("?", true)
388
474
  return process_instruction
389
475
  else
390
476
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
477
+ md = @source.match(Private::TAG_PATTERN, true)
392
478
  unless md
479
+ @source.position = start_position
393
480
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
481
  end
482
+ tag = md[1]
395
483
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
484
+ @prefixes.clear
485
+ @prefixes << md[2] if md[2]
486
+ push_namespaces_restore
487
+ attributes, closed = parse_attributes(@prefixes)
400
488
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
489
+ for prefix in @prefixes
490
+ unless @namespaces.key?(prefix)
403
491
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
492
  end
405
493
  end
406
494
 
407
495
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
496
+ @closed = tag
497
+ pop_namespaces_restore
410
498
  else
411
- @tags.push( md[1] )
499
+ if @tags.empty? and @have_root
500
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
501
+ end
502
+ @tags.push( tag )
412
503
  end
413
- return [ :start_element, md[1], attributes ]
504
+ @have_root = true
505
+ return [ :start_element, tag, attributes ]
414
506
  end
415
507
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
508
+ text = @source.read_until("<")
509
+ if text.chomp!("<")
510
+ @source.position -= "<".bytesize
419
511
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
512
+ if @tags.empty?
513
+ unless /\A\s*\z/.match?(text)
514
+ if @have_root
515
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
516
+ else
517
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
518
+ end
519
+ end
520
+ return pull_event if @have_root
521
+ end
522
+ return [ :text, text ]
425
523
  end
426
524
  rescue REXML::UndefinedNamespaceException
427
525
  raise
@@ -431,18 +529,19 @@ module REXML
431
529
  raise REXML::ParseException.new( "Exception parsing",
432
530
  @source, self, (error ? error : $!) )
433
531
  end
434
- return [ :dummy ]
532
+ # NOTE: The end of the method never runs, because it is unreachable.
533
+ # All branches of code above have explicit unconditional return or raise statements.
435
534
  end
436
535
  private :pull_event
437
536
 
438
537
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
538
+ return unless entities
539
+
540
+ value = entities[ reference ]
541
+ return if value.nil?
542
+
543
+ record_entity_expansion
544
+ unnormalize( value, entities )
446
545
  end
447
546
 
448
547
  # Escapes all possible entities
@@ -463,52 +562,108 @@ module REXML
463
562
 
464
563
  # Unescapes all possible entities
465
564
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
565
+ if string.include?("\r")
566
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
567
+ else
568
+ rv = string.dup
569
+ end
468
570
  matches = rv.scan( REFERENCE_RE )
469
571
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
572
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
573
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
574
+ if m.start_with?("x")
575
+ code_point = Integer(m[1..-1], 16)
576
+ else
577
+ code_point = Integer(m, 10)
578
+ end
579
+ [code_point].pack('U*')
474
580
  }
475
581
  matches.collect!{|x|x[0]}.compact!
582
+ if filter
583
+ matches.reject! do |entity_reference|
584
+ filter.include?(entity_reference)
585
+ end
586
+ end
476
587
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
588
+ matches.tally.each do |entity_reference, n|
589
+ entity_expansion_count_before = @entity_expansion_count
590
+ entity_value = entity( entity_reference, entities )
591
+ if entity_value
592
+ if n > 1
593
+ entity_expansion_count_delta =
594
+ @entity_expansion_count - entity_expansion_count_before
595
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
596
+ end
597
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
598
+ rv.gsub!( re, entity_value )
599
+ if rv.bytesize > @entity_expansion_text_limit
600
+ raise "entity expansion has grown too large"
486
601
  end
602
+ else
603
+ er = DEFAULT_ENTITIES[entity_reference]
604
+ rv.gsub!( er[0], er[2] ) if er
487
605
  end
488
606
  end
489
- rv.gsub!( /&amp;/, '&' )
607
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
608
  end
491
609
  rv
492
610
  end
493
611
 
494
612
  private
613
+ def add_namespace(prefix, uri)
614
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
615
+ if uri.nil?
616
+ @namespaces.delete(prefix)
617
+ else
618
+ @namespaces[prefix] = uri
619
+ end
620
+ end
621
+
622
+ def push_namespaces_restore
623
+ namespaces_restore = {}
624
+ @namespaces_restore_stack.push(namespaces_restore)
625
+ namespaces_restore
626
+ end
627
+
628
+ def pop_namespaces_restore
629
+ namespaces_restore = @namespaces_restore_stack.pop
630
+ namespaces_restore.each do |prefix, uri|
631
+ if uri.nil?
632
+ @namespaces.delete(prefix)
633
+ else
634
+ @namespaces[prefix] = uri
635
+ end
636
+ end
637
+ end
638
+
639
+ def record_entity_expansion(delta=1)
640
+ @entity_expansion_count += delta
641
+ if @entity_expansion_count > @entity_expansion_limit
642
+ raise "number of entity expansions exceeded, processing aborted."
643
+ end
644
+ end
645
+
495
646
  def need_source_encoding_update?(xml_declaration_encoding)
496
647
  return false if xml_declaration_encoding.nil?
497
648
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
498
649
  true
499
650
  end
500
651
 
652
+ def normalize_xml_declaration_encoding(xml_declaration_encoding)
653
+ /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil
654
+ end
655
+
501
656
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
657
+ md = @source.match(Private::NAME_PATTERN, true)
503
658
  unless md
504
- if @source.match(/\A\s*\S/um)
659
+ if @source.match?(/\S/um)
505
660
  message = "#{base_error_message}: invalid name"
506
661
  else
507
662
  message = "#{base_error_message}: name is missing"
508
663
  end
509
664
  raise REXML::ParseException.new(message, @source)
510
665
  end
511
- md[1]
666
+ md[0]
512
667
  end
513
668
 
514
669
  def parse_id(base_error_message,
@@ -543,131 +698,231 @@ module REXML
543
698
  accept_public_id:)
544
699
  public = /\A\s*PUBLIC/um
545
700
  system = /\A\s*SYSTEM/um
546
- if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
547
- if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
701
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
702
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
548
703
  return "public ID literal is missing"
549
704
  end
550
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
705
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
551
706
  return "invalid public ID literal"
552
707
  end
553
708
  if accept_public_id
554
- if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
709
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
555
710
  return "system ID literal is missing"
556
711
  end
557
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
712
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
558
713
  return "invalid system literal"
559
714
  end
560
715
  "garbage after system literal"
561
716
  else
562
717
  "garbage after public ID literal"
563
718
  end
564
- elsif accept_external_id and @source.match(/#{system}/um)
565
- if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
719
+ elsif accept_external_id and @source.match?(/#{system}/um)
720
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
566
721
  return "system literal is missing"
567
722
  end
568
- unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
723
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
569
724
  return "invalid system literal"
570
725
  end
571
726
  "garbage after system literal"
572
727
  else
573
- unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
728
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
574
729
  return "invalid ID type"
575
730
  end
576
731
  "ID type is missing"
577
732
  end
578
733
  end
579
734
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
735
+ def process_comment
736
+ text = @source.read_until("-->")
737
+ unless text.chomp!("-->")
738
+ raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source)
585
739
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
587
- end
588
740
 
589
- def parse_attributes(prefixes, curr_ns)
590
- attributes = {}
591
- closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
741
+ if text.include? "--" or text.end_with?("-")
742
+ raise REXML::ParseException.new("Malformed comment", @source)
596
743
  end
744
+ text
745
+ end
597
746
 
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
747
+ def process_instruction
748
+ name = parse_name("Malformed XML: Invalid processing instruction node")
749
+ if name == "xml"
750
+ xml_declaration
751
+ else # PITarget
752
+ if @source.skip_spaces # e.g. <?name content?>
753
+ start_position = @source.position
754
+ content = @source.read_until("?>")
755
+ unless content.chomp!("?>")
756
+ @source.position = start_position
757
+ raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
758
+ end
759
+ else # e.g. <?name?>
760
+ content = nil
761
+ unless @source.match?("?>", true)
762
+ raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
763
+ end
764
+ end
765
+ [:processing_instruction, name, content]
766
+ end
767
+ end
602
768
 
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
769
+ def xml_declaration
770
+ unless @version.nil?
771
+ raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
772
+ end
773
+ if @document_status
774
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
775
+ end
776
+ unless @source.skip_spaces
777
+ raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
778
+ end
779
+ unless @source.match?("version", true)
780
+ raise ParseException.new("Malformed XML: XML declaration misses version", @source)
781
+ end
782
+ @version = parse_attribute_value_with_equal("xml")
783
+ unless @source.skip_spaces
784
+ unless @source.match?("?>", true)
785
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
607
786
  end
787
+ encoding = normalize_xml_declaration_encoding(@source.encoding)
788
+ return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0"?>
789
+ end
608
790
 
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
618
- message = "Missing attribute equal: <#{name}>"
619
- raise REXML::ParseException.new(message, @source)
620
- end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
623
- message = "Missing attribute value start quote: <#{name}>"
624
- raise REXML::ParseException.new(message, @source)
791
+ if @source.match?("encoding", true)
792
+ encoding = parse_attribute_value_with_equal("xml")
793
+ unless @source.skip_spaces
794
+ unless @source.match?("?>", true)
795
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
625
796
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
- raise REXML::ParseException.new(message, @source)
797
+ if need_source_encoding_update?(encoding)
798
+ @source.encoding = encoding
639
799
  end
800
+ encoding ||= normalize_xml_declaration_encoding(@source.encoding)
801
+ return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.1" encoding="UTF-8"?>
802
+ end
803
+ end
804
+
805
+ if @source.match?("standalone", true)
806
+ standalone = parse_attribute_value_with_equal("xml")
807
+ case standalone
808
+ when "yes", "no"
809
+ else
810
+ raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
811
+ end
812
+ end
813
+ @source.skip_spaces
814
+ unless @source.match?("?>", true)
815
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
816
+ end
817
+
818
+ if need_source_encoding_update?(encoding)
819
+ @source.encoding = encoding
820
+ end
821
+ encoding ||= normalize_xml_declaration_encoding(@source.encoding)
822
+
823
+ # e.g. <?xml version="1.0" ?>
824
+ # <?xml version="1.1" encoding="UTF-8" ?>
825
+ # <?xml version="1.1" standalone="yes"?>
826
+ # <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
827
+ [ :xmldecl, @version, encoding, standalone ]
828
+ end
829
+
830
+ if StringScanner::Version < "3.1.1"
831
+ def scan_quote
832
+ @source.match(/(['"])/, true)&.[](1)
833
+ end
834
+ else
835
+ def scan_quote
836
+ case @source.peek_byte
837
+ when 34 # '"'.ord
838
+ @source.scan_byte
839
+ '"'
840
+ when 39 # "'".ord
841
+ @source.scan_byte
842
+ "'"
843
+ else
844
+ nil
640
845
  end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
846
+ end
847
+ end
848
+
849
+ def parse_attribute_value_with_equal(name)
850
+ unless @source.match?(Private::EQUAL_PATTERN, true)
851
+ message = "Missing attribute equal: <#{name}>"
852
+ raise REXML::ParseException.new(message, @source)
853
+ end
854
+ unless quote = scan_quote
855
+ message = "Missing attribute value start quote: <#{name}>"
856
+ raise REXML::ParseException.new(message, @source)
857
+ end
858
+ start_position = @source.position
859
+ value = @source.read_until(quote)
860
+ unless value.chomp!(quote)
861
+ @source.position = start_position
862
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
863
+ raise REXML::ParseException.new(message, @source)
864
+ end
865
+ value
866
+ end
867
+
868
+ def parse_attributes(prefixes)
869
+ attributes = {}
870
+ expanded_names = {}
871
+ closed = false
872
+ while true
873
+ if @source.match?(">", true)
874
+ return attributes, closed
875
+ elsif @source.match?("/>", true)
876
+ closed = true
877
+ return attributes, closed
878
+ elsif match = @source.match(QNAME, true)
879
+ name = match[1]
880
+ prefix = match[2]
881
+ local_part = match[3]
882
+ value = parse_attribute_value_with_equal(name)
883
+ @source.skip_spaces
884
+ if prefix == "xmlns"
885
+ if local_part == "xml"
886
+ if value != Private::XML_PREFIXED_NAMESPACE
887
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
888
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
889
+ raise REXML::ParseException.new( msg, @source, self )
890
+ end
891
+ elsif local_part == "xmlns"
892
+ msg = "The 'xmlns' prefix must not be declared "+
650
893
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
894
+ raise REXML::ParseException.new( msg, @source, self)
652
895
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
896
+ add_namespace(local_part, value)
897
+ elsif prefix
898
+ prefixes << prefix unless prefix == "xml"
657
899
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
900
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
901
+ if attributes[name]
902
+ msg = "Duplicate attribute #{name.inspect}"
903
+ raise REXML::ParseException.new(msg, @source, self)
904
+ end
667
905
 
668
- attributes[name] = value
906
+ unless prefix == "xmlns"
907
+ uri = @namespaces[prefix]
908
+ expanded_name = [uri, local_part]
909
+ existing_prefix = expanded_names[expanded_name]
910
+ if existing_prefix
911
+ message = "Namespace conflict in adding attribute " +
912
+ "\"#{local_part}\": " +
913
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
914
+ "prefix \"#{prefix}\" = \"#{uri}\""
915
+ raise REXML::ParseException.new(message, @source, self)
916
+ end
917
+ expanded_names[expanded_name] = prefix
918
+ end
919
+
920
+ attributes[name] = value
921
+ else
922
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
923
+ raise REXML::ParseException.new(message, @source)
924
+ end
669
925
  end
670
- return attributes, closed
671
926
  end
672
927
  end
673
928
  end