rexml 3.2.6 → 3.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,24 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
184
+ reset
185
+ end
186
+
187
+ def reset
128
188
  @closed = nil
189
+ @have_root = false
129
190
  @document_status = nil
130
191
  @tags = []
131
192
  @stack = []
132
193
  @entities = []
133
- @nsstack = []
194
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
195
+ @namespaces_restore_stack = []
134
196
  end
135
197
 
136
198
  def position
@@ -180,6 +242,8 @@ module REXML
180
242
 
181
243
  # Returns the next event. This is a +PullEvent+ object.
182
244
  def pull
245
+ @source.drop_parsed_content
246
+
183
247
  pull_event.tap do |event|
184
248
  @listeners.each do |listener|
185
249
  listener.receive event
@@ -192,236 +256,274 @@ module REXML
192
256
  x, @closed = @closed, nil
193
257
  return [ :end_element, x ]
194
258
  end
195
- return [ :end_document ] if empty?
259
+ if empty?
260
+ if @document_status == :in_doctype
261
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
262
+ end
263
+ unless @tags.empty?
264
+ path = "/" + @tags.join("/")
265
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
266
+ end
267
+ return [ :end_document ]
268
+ end
196
269
  return @stack.shift if @stack.size > 0
197
270
  #STDERR.puts @source.encoding
198
271
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
272
+
273
+ @source.ensure_buffer
199
274
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
275
+ start_position = @source.position
276
+ if @source.match?("<?", true)
223
277
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
278
+ elsif @source.match?("<!", true)
279
+ if @source.match?("--", true)
280
+ md = @source.match(/(.*?)-->/um, true)
281
+ if md.nil?
282
+ raise REXML::ParseException.new("Unclosed comment", @source)
283
+ end
284
+ if /--|-\z/.match?(md[1])
285
+ raise REXML::ParseException.new("Malformed comment", @source)
286
+ end
287
+ return [ :comment, md[1] ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.match?(/\s+/um, true)
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
242
298
  end
243
- if @source.match(/\A\s*\[/um, true)
299
+ name = parse_name(base_error_message)
300
+ if @source.match?(/\s*\[/um, true)
301
+ id = [nil, nil, nil]
244
302
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
303
+ elsif @source.match?(/\s*>/um, true)
304
+ id = [nil, nil, nil]
246
305
  @document_status = :after_doctype
306
+ @source.ensure_buffer
247
307
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
308
+ id = parse_id(base_error_message,
309
+ accept_external_id: true,
310
+ accept_public_id: false)
311
+ if id[0] == "SYSTEM"
312
+ # For backward compatibility
313
+ id[1], id[2] = id[2], nil
314
+ end
315
+ if @source.match?(/\s*\[/um, true)
316
+ @document_status = :in_doctype
317
+ elsif @source.match?(/\s*>/um, true)
318
+ @document_status = :after_doctype
319
+ @source.ensure_buffer
320
+ else
321
+ message = "#{base_error_message}: garbage after external ID"
322
+ raise REXML::ParseException.new(message, @source)
323
+ end
250
324
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
325
+ args = [:start_doctype, name, *id]
326
+ if @document_status == :after_doctype
327
+ @source.match?(/\s*/um, true)
328
+ @stack << [ :end_doctype ]
329
+ end
330
+ return args
331
+ else
332
+ message = "Invalid XML"
333
+ raise REXML::ParseException.new(message, @source)
263
334
  end
264
335
  end
265
336
  end
266
337
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
338
+ @source.match?(/\s*/um, true) # skip spaces
339
+ start_position = @source.position
340
+ if @source.match?("<!", true)
341
+ if @source.match?("ELEMENT", true)
342
+ md = @source.match(/(.*?)>/um, true)
343
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
344
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
345
+ elsif @source.match?("ENTITY", true)
346
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
347
+ unless match_data
348
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
349
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
350
+ match = [:entitydecl, *match_data.captures.compact]
351
+ ref = false
352
+ if match[1] == '%'
353
+ ref = true
354
+ match.delete_at 1
355
+ end
356
+ # Now we have to sort out what kind of entity reference this is
357
+ if match[2] == 'SYSTEM'
358
+ # External reference
359
+ match[3] = match[3][1..-2] # PUBID
360
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
361
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
362
+ elsif match[2] == 'PUBLIC'
363
+ # External reference
364
+ match[3] = match[3][1..-2] # PUBID
365
+ match[4] = match[4][1..-2] # HREF
366
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
367
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
368
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
369
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
370
  else
329
- message = "#{base_error_message}: invalid declaration name"
371
+ match[2] = match[2][1..-2]
372
+ match.pop if match.size == 4
373
+ # match is [ :entity, name, value ]
330
374
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
375
+ match << '%' if ref
376
+ return match
377
+ elsif @source.match?("ATTLIST", true)
378
+ md = @source.match(Private::ATTLISTDECL_END, true)
379
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
380
+ element = md[1]
381
+ contents = md[0]
382
+
383
+ pairs = {}
384
+ values = md[0].strip.scan( ATTDEF_RE )
385
+ values.each do |attdef|
386
+ unless attdef[3] == "#IMPLIED"
387
+ attdef.compact!
388
+ val = attdef[3]
389
+ val = attdef[4] if val == "#FIXED "
390
+ pairs[attdef[0]] = val
391
+ if attdef[0] =~ /^xmlns:(.*)/
392
+ @namespaces[$1] = val
393
+ end
394
+ end
395
+ end
396
+ return [ :attlistdecl, element, pairs, contents ]
397
+ elsif @source.match?("NOTATION", true)
398
+ base_error_message = "Malformed notation declaration"
399
+ unless @source.match?(/\s+/um, true)
400
+ if @source.match?(">")
401
+ message = "#{base_error_message}: name is missing"
402
+ else
403
+ message = "#{base_error_message}: invalid name"
404
+ end
405
+ @source.position = start_position
406
+ raise REXML::ParseException.new(message, @source)
407
+ end
408
+ name = parse_name(base_error_message)
409
+ id = parse_id(base_error_message,
410
+ accept_external_id: true,
411
+ accept_public_id: true)
412
+ unless @source.match?(/\s*>/um, true)
413
+ message = "#{base_error_message}: garbage before end >"
414
+ raise REXML::ParseException.new(message, @source)
415
+ end
416
+ return [:notationdecl, name, *id]
417
+ elsif md = @source.match(/--(.*?)-->/um, true)
418
+ case md[1]
419
+ when /--/, /-\z/
420
+ raise REXML::ParseException.new("Malformed comment", @source)
421
+ end
422
+ return [ :comment, md[1] ] if md
340
423
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
424
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
425
+ return [ :externalentity, match[1] ]
426
+ elsif @source.match?(/\]\s*>/um, true)
343
427
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
428
  return [ :end_doctype ]
346
429
  end
430
+ if @document_status == :in_doctype
431
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
432
+ end
347
433
  end
348
434
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
435
+ @source.match?(/\s*/um, true)
350
436
  end
351
437
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
438
+ start_position = @source.position
439
+ if @source.match?("<", true)
440
+ # :text's read_until may remain only "<" in buffer. In the
441
+ # case, buffer is empty here. So we need to fill buffer
442
+ # here explicitly.
443
+ @source.ensure_buffer
444
+ if @source.match?("/", true)
445
+ @namespaces_restore_stack.pop
356
446
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
447
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
448
  if md and !last_tag
359
449
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
450
  raise REXML::ParseException.new(message, @source)
361
451
  end
362
452
  if md.nil? or last_tag != md[1]
363
453
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
454
+ message += " (got '#{md[1]}')" if md
455
+ @source.position = start_position if md.nil?
365
456
  raise REXML::ParseException.new(message, @source)
366
457
  end
367
458
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
459
+ elsif @source.match?("!", true)
460
+ md = @source.match(/([^>]*>)/um)
370
461
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
462
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
463
+ if md[0][0] == ?-
464
+ md = @source.match(/--(.*?)-->/um, true)
374
465
 
375
- case md[1]
376
- when /--/, /-\z/
466
+ if md.nil? || /--|-\z/.match?(md[1])
377
467
  raise REXML::ParseException.new("Malformed comment", @source)
378
468
  end
379
469
 
380
- return [ :comment, md[1] ] if md
470
+ return [ :comment, md[1] ]
381
471
  else
382
- md = @source.match( CDATA_PATTERN, true )
472
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
473
  return [ :cdata, md[1] ] if md
384
474
  end
385
475
  raise REXML::ParseException.new( "Declarations can only occur "+
386
476
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
477
+ elsif @source.match?("?", true)
388
478
  return process_instruction
389
479
  else
390
480
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
481
+ md = @source.match(Private::TAG_PATTERN, true)
392
482
  unless md
483
+ @source.position = start_position
393
484
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
485
  end
486
+ tag = md[1]
395
487
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
488
+ @prefixes.clear
489
+ @prefixes << md[2] if md[2]
490
+ push_namespaces_restore
491
+ attributes, closed = parse_attributes(@prefixes)
400
492
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
493
+ for prefix in @prefixes
494
+ unless @namespaces.key?(prefix)
403
495
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
496
  end
405
497
  end
406
498
 
407
499
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
500
+ @closed = tag
501
+ pop_namespaces_restore
410
502
  else
411
- @tags.push( md[1] )
503
+ if @tags.empty? and @have_root
504
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
505
+ end
506
+ @tags.push( tag )
412
507
  end
413
- return [ :start_element, md[1], attributes ]
508
+ @have_root = true
509
+ return [ :start_element, tag, attributes ]
414
510
  end
415
511
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
512
+ text = @source.read_until("<")
513
+ if text.chomp!("<")
514
+ @source.position -= "<".bytesize
515
+ end
516
+ if @tags.empty?
517
+ unless /\A\s*\z/.match?(text)
518
+ if @have_root
519
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
520
+ else
521
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
522
+ end
523
+ end
524
+ return pull_event if @have_root
419
525
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
526
+ return [ :text, text ]
425
527
  end
426
528
  rescue REXML::UndefinedNamespaceException
427
529
  raise
@@ -436,13 +538,13 @@ module REXML
436
538
  private :pull_event
437
539
 
438
540
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
541
+ return unless entities
542
+
543
+ value = entities[ reference ]
544
+ return if value.nil?
545
+
546
+ record_entity_expansion
547
+ unnormalize( value, entities )
446
548
  end
447
549
 
448
550
  # Escapes all possible entities
@@ -463,35 +565,87 @@ module REXML
463
565
 
464
566
  # Unescapes all possible entities
465
567
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
568
+ if string.include?("\r")
569
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
570
+ else
571
+ rv = string.dup
572
+ end
468
573
  matches = rv.scan( REFERENCE_RE )
469
574
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
575
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
576
  m=$1
472
- m = "0#{m}" if m[0] == ?x
473
- [Integer(m)].pack('U*')
577
+ if m.start_with?("x")
578
+ code_point = Integer(m[1..-1], 16)
579
+ else
580
+ code_point = Integer(m, 10)
581
+ end
582
+ [code_point].pack('U*')
474
583
  }
475
584
  matches.collect!{|x|x[0]}.compact!
585
+ if filter
586
+ matches.reject! do |entity_reference|
587
+ filter.include?(entity_reference)
588
+ end
589
+ end
476
590
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
591
+ matches.tally.each do |entity_reference, n|
592
+ entity_expansion_count_before = @entity_expansion_count
593
+ entity_value = entity( entity_reference, entities )
594
+ if entity_value
595
+ if n > 1
596
+ entity_expansion_count_delta =
597
+ @entity_expansion_count - entity_expansion_count_before
598
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
486
599
  end
600
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
601
+ rv.gsub!( re, entity_value )
602
+ if rv.bytesize > @entity_expansion_text_limit
603
+ raise "entity expansion has grown too large"
604
+ end
605
+ else
606
+ er = DEFAULT_ENTITIES[entity_reference]
607
+ rv.gsub!( er[0], er[2] ) if er
487
608
  end
488
609
  end
489
- rv.gsub!( /&amp;/, '&' )
610
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
611
  end
491
612
  rv
492
613
  end
493
614
 
494
615
  private
616
+ def add_namespace(prefix, uri)
617
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
618
+ if uri.nil?
619
+ @namespaces.delete(prefix)
620
+ else
621
+ @namespaces[prefix] = uri
622
+ end
623
+ end
624
+
625
+ def push_namespaces_restore
626
+ namespaces_restore = {}
627
+ @namespaces_restore_stack.push(namespaces_restore)
628
+ namespaces_restore
629
+ end
630
+
631
+ def pop_namespaces_restore
632
+ namespaces_restore = @namespaces_restore_stack.pop
633
+ namespaces_restore.each do |prefix, uri|
634
+ if uri.nil?
635
+ @namespaces.delete(prefix)
636
+ else
637
+ @namespaces[prefix] = uri
638
+ end
639
+ end
640
+ end
641
+
642
+ def record_entity_expansion(delta=1)
643
+ @entity_expansion_count += delta
644
+ if @entity_expansion_count > @entity_expansion_limit
645
+ raise "number of entity expansions exceeded, processing aborted."
646
+ end
647
+ end
648
+
495
649
  def need_source_encoding_update?(xml_declaration_encoding)
496
650
  return false if xml_declaration_encoding.nil?
497
651
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +653,16 @@ module REXML
499
653
  end
500
654
 
501
655
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
656
+ md = @source.match(Private::NAME_PATTERN, true)
503
657
  unless md
504
- if @source.match(/\A\s*\S/um)
658
+ if @source.match?(/\S/um)
505
659
  message = "#{base_error_message}: invalid name"
506
660
  else
507
661
  message = "#{base_error_message}: name is missing"
508
662
  end
509
663
  raise REXML::ParseException.new(message, @source)
510
664
  end
511
- md[1]
665
+ md[0]
512
666
  end
513
667
 
514
668
  def parse_id(base_error_message,
@@ -543,34 +697,34 @@ module REXML
543
697
  accept_public_id:)
544
698
  public = /\A\s*PUBLIC/um
545
699
  system = /\A\s*SYSTEM/um
546
- if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
547
- if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
700
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
701
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
548
702
  return "public ID literal is missing"
549
703
  end
550
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
704
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
551
705
  return "invalid public ID literal"
552
706
  end
553
707
  if accept_public_id
554
- if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
708
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
555
709
  return "system ID literal is missing"
556
710
  end
557
- unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
711
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
558
712
  return "invalid system literal"
559
713
  end
560
714
  "garbage after system literal"
561
715
  else
562
716
  "garbage after public ID literal"
563
717
  end
564
- elsif accept_external_id and @source.match(/#{system}/um)
565
- if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
718
+ elsif accept_external_id and @source.match?(/#{system}/um)
719
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
566
720
  return "system literal is missing"
567
721
  end
568
- unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
722
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
569
723
  return "invalid system literal"
570
724
  end
571
725
  "garbage after system literal"
572
726
  else
573
- unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
727
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
574
728
  return "invalid ID type"
575
729
  end
576
730
  "ID type is missing"
@@ -578,96 +732,114 @@ module REXML
578
732
  end
579
733
 
580
734
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
735
+ name = parse_name("Malformed XML: Invalid processing instruction node")
736
+ if @source.match?(/\s+/um, true)
737
+ match_data = @source.match(/(.*?)\?>/um, true)
738
+ unless match_data
739
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
740
+ end
741
+ content = match_data[1]
742
+ else
743
+ content = nil
744
+ unless @source.match?("?>", true)
745
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
746
+ end
585
747
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
748
+ if name == "xml"
749
+ if @document_status
750
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
751
+ end
752
+ version = VERSION.match(content)
753
+ version = version[1] unless version.nil?
754
+ encoding = ENCODING.match(content)
755
+ encoding = encoding[1] unless encoding.nil?
756
+ if need_source_encoding_update?(encoding)
757
+ @source.encoding = encoding
758
+ end
759
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
760
+ encoding = "UTF-16"
761
+ end
762
+ standalone = STANDALONE.match(content)
763
+ standalone = standalone[1] unless standalone.nil?
764
+ return [ :xmldecl, version, encoding, standalone ]
765
+ end
766
+ [:processing_instruction, name, content]
587
767
  end
588
768
 
589
- def parse_attributes(prefixes, curr_ns)
769
+ def parse_attributes(prefixes)
590
770
  attributes = {}
771
+ expanded_names = {}
591
772
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
773
+ while true
774
+ if @source.match?(">", true)
775
+ return attributes, closed
776
+ elsif @source.match?("/>", true)
777
+ closed = true
778
+ return attributes, closed
779
+ elsif match = @source.match(QNAME, true)
780
+ name = match[1]
781
+ prefix = match[2]
782
+ local_part = match[3]
783
+
784
+ unless @source.match?(/\s*=\s*/um, true)
618
785
  message = "Missing attribute equal: <#{name}>"
619
786
  raise REXML::ParseException.new(message, @source)
620
787
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
788
+ unless match = @source.match(/(['"])/, true)
623
789
  message = "Missing attribute value start quote: <#{name}>"
624
790
  raise REXML::ParseException.new(message, @source)
625
791
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
792
+ quote = match[1]
793
+ start_position = @source.position
794
+ value = @source.read_until(quote)
795
+ unless value.chomp!(quote)
796
+ @source.position = start_position
797
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
798
  raise REXML::ParseException.new(message, @source)
639
799
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
800
+ @source.match?(/\s*/um, true)
801
+ if prefix == "xmlns"
802
+ if local_part == "xml"
803
+ if value != Private::XML_PREFIXED_NAMESPACE
804
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
805
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
806
+ raise REXML::ParseException.new( msg, @source, self )
807
+ end
808
+ elsif local_part == "xmlns"
809
+ msg = "The 'xmlns' prefix must not be declared "+
650
810
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
811
+ raise REXML::ParseException.new( msg, @source, self)
652
812
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
813
+ add_namespace(local_part, value)
814
+ elsif prefix
815
+ prefixes << prefix unless prefix == "xml"
657
816
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
817
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
818
+ if attributes[name]
819
+ msg = "Duplicate attribute #{name.inspect}"
820
+ raise REXML::ParseException.new(msg, @source, self)
821
+ end
667
822
 
668
- attributes[name] = value
823
+ unless prefix == "xmlns"
824
+ uri = @namespaces[prefix]
825
+ expanded_name = [uri, local_part]
826
+ existing_prefix = expanded_names[expanded_name]
827
+ if existing_prefix
828
+ message = "Namespace conflict in adding attribute " +
829
+ "\"#{local_part}\": " +
830
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
831
+ "prefix \"#{prefix}\" = \"#{uri}\""
832
+ raise REXML::ParseException.new(message, @source, self)
833
+ end
834
+ expanded_names[expanded_name] = prefix
835
+ end
836
+
837
+ attributes[name] = value
838
+ else
839
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
840
+ raise REXML::ParseException.new(message, @source)
841
+ end
669
842
  end
670
- return attributes, closed
671
843
  end
672
844
  end
673
845
  end