rexml 3.2.6 → 3.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +124,7 @@ module REXML
96
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
128
 
101
129
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
130
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +140,32 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ end
160
+ private_constant :Private
161
+
115
162
  def initialize( source )
116
163
  self.stream = source
117
164
  @listeners = []
165
+ @prefixes = Set.new
166
+ @entity_expansion_count = 0
167
+ @entity_expansion_limit = Security.entity_expansion_limit
168
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
118
169
  end
119
170
 
120
171
  def add_listener( listener )
@@ -122,15 +173,20 @@ module REXML
122
173
  end
123
174
 
124
175
  attr_reader :source
176
+ attr_reader :entity_expansion_count
177
+ attr_writer :entity_expansion_limit
178
+ attr_writer :entity_expansion_text_limit
125
179
 
126
180
  def stream=( source )
127
181
  @source = SourceFactory.create_from( source )
128
182
  @closed = nil
183
+ @have_root = false
129
184
  @document_status = nil
130
185
  @tags = []
131
186
  @stack = []
132
187
  @entities = []
133
- @nsstack = []
188
+ @namespaces = {}
189
+ @namespaces_restore_stack = []
134
190
  end
135
191
 
136
192
  def position
@@ -180,6 +236,8 @@ module REXML
180
236
 
181
237
  # Returns the next event. This is a +PullEvent+ object.
182
238
  def pull
239
+ @source.drop_parsed_content
240
+
183
241
  pull_event.tap do |event|
184
242
  @listeners.each do |listener|
185
243
  listener.receive event
@@ -192,236 +250,274 @@ module REXML
192
250
  x, @closed = @closed, nil
193
251
  return [ :end_element, x ]
194
252
  end
195
- return [ :end_document ] if empty?
253
+ if empty?
254
+ if @document_status == :in_doctype
255
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
256
+ end
257
+ unless @tags.empty?
258
+ path = "/" + @tags.join("/")
259
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
260
+ end
261
+ return [ :end_document ]
262
+ end
196
263
  return @stack.shift if @stack.size > 0
197
264
  #STDERR.puts @source.encoding
198
265
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
266
+
267
+ @source.ensure_buffer
199
268
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
269
+ start_position = @source.position
270
+ if @source.match("<?", true)
223
271
  return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
272
+ elsif @source.match("<!", true)
273
+ if @source.match("--", true)
274
+ md = @source.match(/(.*?)-->/um, true)
275
+ if md.nil?
276
+ raise REXML::ParseException.new("Unclosed comment", @source)
242
277
  end
243
- if @source.match(/\A\s*\[/um, true)
278
+ if /--|-\z/.match?(md[1])
279
+ raise REXML::ParseException.new("Malformed comment", @source)
280
+ end
281
+ return [ :comment, md[1] ]
282
+ elsif @source.match("DOCTYPE", true)
283
+ base_error_message = "Malformed DOCTYPE"
284
+ unless @source.match(/\s+/um, true)
285
+ if @source.match(">")
286
+ message = "#{base_error_message}: name is missing"
287
+ else
288
+ message = "#{base_error_message}: invalid name"
289
+ end
290
+ @source.position = start_position
291
+ raise REXML::ParseException.new(message, @source)
292
+ end
293
+ name = parse_name(base_error_message)
294
+ if @source.match(/\s*\[/um, true)
295
+ id = [nil, nil, nil]
244
296
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
297
+ elsif @source.match(/\s*>/um, true)
298
+ id = [nil, nil, nil]
246
299
  @document_status = :after_doctype
300
+ @source.ensure_buffer
247
301
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
302
+ id = parse_id(base_error_message,
303
+ accept_external_id: true,
304
+ accept_public_id: false)
305
+ if id[0] == "SYSTEM"
306
+ # For backward compatibility
307
+ id[1], id[2] = id[2], nil
308
+ end
309
+ if @source.match(/\s*\[/um, true)
310
+ @document_status = :in_doctype
311
+ elsif @source.match(/\s*>/um, true)
312
+ @document_status = :after_doctype
313
+ @source.ensure_buffer
314
+ else
315
+ message = "#{base_error_message}: garbage after external ID"
316
+ raise REXML::ParseException.new(message, @source)
317
+ end
250
318
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
319
+ args = [:start_doctype, name, *id]
320
+ if @document_status == :after_doctype
321
+ @source.match(/\s*/um, true)
322
+ @stack << [ :end_doctype ]
323
+ end
324
+ return args
325
+ else
326
+ message = "Invalid XML"
327
+ raise REXML::ParseException.new(message, @source)
263
328
  end
264
329
  end
265
330
  end
266
331
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
332
+ @source.match(/\s*/um, true) # skip spaces
333
+ start_position = @source.position
334
+ if @source.match("<!", true)
335
+ if @source.match("ELEMENT", true)
336
+ md = @source.match(/(.*?)>/um, true)
337
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
338
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
339
+ elsif @source.match("ENTITY", true)
340
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
341
+ unless match_data
342
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
320
343
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
344
+ match = [:entitydecl, *match_data.captures.compact]
345
+ ref = false
346
+ if match[1] == '%'
347
+ ref = true
348
+ match.delete_at 1
349
+ end
350
+ # Now we have to sort out what kind of entity reference this is
351
+ if match[2] == 'SYSTEM'
352
+ # External reference
353
+ match[3] = match[3][1..-2] # PUBID
354
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
355
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
356
+ elsif match[2] == 'PUBLIC'
357
+ # External reference
358
+ match[3] = match[3][1..-2] # PUBID
359
+ match[4] = match[4][1..-2] # HREF
360
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
361
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
362
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
363
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
328
364
  else
329
- message = "#{base_error_message}: invalid declaration name"
365
+ match[2] = match[2][1..-2]
366
+ match.pop if match.size == 4
367
+ # match is [ :entity, name, value ]
330
368
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
369
+ match << '%' if ref
370
+ return match
371
+ elsif @source.match("ATTLIST", true)
372
+ md = @source.match(Private::ATTLISTDECL_END, true)
373
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
374
+ element = md[1]
375
+ contents = md[0]
376
+
377
+ pairs = {}
378
+ values = md[0].strip.scan( ATTDEF_RE )
379
+ values.each do |attdef|
380
+ unless attdef[3] == "#IMPLIED"
381
+ attdef.compact!
382
+ val = attdef[3]
383
+ val = attdef[4] if val == "#FIXED "
384
+ pairs[attdef[0]] = val
385
+ if attdef[0] =~ /^xmlns:(.*)/
386
+ @namespaces[$1] = val
387
+ end
388
+ end
389
+ end
390
+ return [ :attlistdecl, element, pairs, contents ]
391
+ elsif @source.match("NOTATION", true)
392
+ base_error_message = "Malformed notation declaration"
393
+ unless @source.match(/\s+/um, true)
394
+ if @source.match(">")
395
+ message = "#{base_error_message}: name is missing"
396
+ else
397
+ message = "#{base_error_message}: invalid name"
398
+ end
399
+ @source.position = start_position
400
+ raise REXML::ParseException.new(message, @source)
401
+ end
402
+ name = parse_name(base_error_message)
403
+ id = parse_id(base_error_message,
404
+ accept_external_id: true,
405
+ accept_public_id: true)
406
+ unless @source.match(/\s*>/um, true)
407
+ message = "#{base_error_message}: garbage before end >"
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ return [:notationdecl, name, *id]
411
+ elsif md = @source.match(/--(.*?)-->/um, true)
412
+ case md[1]
413
+ when /--/, /-\z/
414
+ raise REXML::ParseException.new("Malformed comment", @source)
415
+ end
416
+ return [ :comment, md[1] ] if md
340
417
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
418
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
419
+ return [ :externalentity, match[1] ]
420
+ elsif @source.match(/\]\s*>/um, true)
343
421
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
422
  return [ :end_doctype ]
346
423
  end
424
+ if @document_status == :in_doctype
425
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
426
+ end
347
427
  end
348
428
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
429
+ @source.match(/\s*/um, true)
350
430
  end
351
431
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
355
- @nsstack.shift
432
+ start_position = @source.position
433
+ if @source.match("<", true)
434
+ # :text's read_until may remain only "<" in buffer. In the
435
+ # case, buffer is empty here. So we need to fill buffer
436
+ # here explicitly.
437
+ @source.ensure_buffer
438
+ if @source.match("/", true)
439
+ @namespaces_restore_stack.pop
356
440
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
441
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
442
  if md and !last_tag
359
443
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
444
  raise REXML::ParseException.new(message, @source)
361
445
  end
362
446
  if md.nil? or last_tag != md[1]
363
447
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
448
+ message += " (got '#{md[1]}')" if md
449
+ @source.position = start_position if md.nil?
365
450
  raise REXML::ParseException.new(message, @source)
366
451
  end
367
452
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
453
+ elsif @source.match("!", true)
454
+ md = @source.match(/([^>]*>)/um)
370
455
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
456
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
457
+ if md[0][0] == ?-
458
+ md = @source.match(/--(.*?)-->/um, true)
374
459
 
375
- case md[1]
376
- when /--/, /-\z/
460
+ if md.nil? || /--|-\z/.match?(md[1])
377
461
  raise REXML::ParseException.new("Malformed comment", @source)
378
462
  end
379
463
 
380
- return [ :comment, md[1] ] if md
464
+ return [ :comment, md[1] ]
381
465
  else
382
- md = @source.match( CDATA_PATTERN, true )
466
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
467
  return [ :cdata, md[1] ] if md
384
468
  end
385
469
  raise REXML::ParseException.new( "Declarations can only occur "+
386
470
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
471
+ elsif @source.match("?", true)
388
472
  return process_instruction
389
473
  else
390
474
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
475
+ md = @source.match(Private::TAG_PATTERN, true)
392
476
  unless md
477
+ @source.position = start_position
393
478
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
479
  end
480
+ tag = md[1]
395
481
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
398
- @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
482
+ @prefixes.clear
483
+ @prefixes << md[2] if md[2]
484
+ push_namespaces_restore
485
+ attributes, closed = parse_attributes(@prefixes)
400
486
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
402
- unless @nsstack.find{|k| k.member?(prefix)}
487
+ for prefix in @prefixes
488
+ unless @namespaces.key?(prefix)
403
489
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
490
  end
405
491
  end
406
492
 
407
493
  if closed
408
- @closed = md[1]
409
- @nsstack.shift
494
+ @closed = tag
495
+ pop_namespaces_restore
410
496
  else
411
- @tags.push( md[1] )
497
+ if @tags.empty? and @have_root
498
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
499
+ end
500
+ @tags.push( tag )
412
501
  end
413
- return [ :start_element, md[1], attributes ]
502
+ @have_root = true
503
+ return [ :start_element, tag, attributes ]
414
504
  end
415
505
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
506
+ text = @source.read_until("<")
507
+ if text.chomp!("<")
508
+ @source.position -= "<".bytesize
419
509
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
510
+ if @tags.empty?
511
+ unless /\A\s*\z/.match?(text)
512
+ if @have_root
513
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
514
+ else
515
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
516
+ end
517
+ end
518
+ return pull_event if @have_root
519
+ end
520
+ return [ :text, text ]
425
521
  end
426
522
  rescue REXML::UndefinedNamespaceException
427
523
  raise
@@ -436,13 +532,13 @@ module REXML
436
532
  private :pull_event
437
533
 
438
534
  def entity( reference, entities )
439
- value = nil
440
- value = entities[ reference ] if entities
441
- if not value
442
- value = DEFAULT_ENTITIES[ reference ]
443
- value = value[2] if value
444
- end
445
- unnormalize( value, entities ) if value
535
+ return unless entities
536
+
537
+ value = entities[ reference ]
538
+ return if value.nil?
539
+
540
+ record_entity_expansion
541
+ unnormalize( value, entities )
446
542
  end
447
543
 
448
544
  # Escapes all possible entities
@@ -463,35 +559,83 @@ module REXML
463
559
 
464
560
  # Unescapes all possible entities
465
561
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
562
+ if string.include?("\r")
563
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
564
+ else
565
+ rv = string.dup
566
+ end
468
567
  matches = rv.scan( REFERENCE_RE )
469
568
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
569
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
570
  m=$1
472
571
  m = "0#{m}" if m[0] == ?x
473
572
  [Integer(m)].pack('U*')
474
573
  }
475
574
  matches.collect!{|x|x[0]}.compact!
575
+ if filter
576
+ matches.reject! do |entity_reference|
577
+ filter.include?(entity_reference)
578
+ end
579
+ end
476
580
  if matches.size > 0
477
- matches.each do |entity_reference|
478
- unless filter and filter.include?(entity_reference)
479
- entity_value = entity( entity_reference, entities )
480
- if entity_value
481
- re = /&#{entity_reference};/
482
- rv.gsub!( re, entity_value )
483
- else
484
- er = DEFAULT_ENTITIES[entity_reference]
485
- rv.gsub!( er[0], er[2] ) if er
581
+ matches.tally.each do |entity_reference, n|
582
+ entity_expansion_count_before = @entity_expansion_count
583
+ entity_value = entity( entity_reference, entities )
584
+ if entity_value
585
+ if n > 1
586
+ entity_expansion_count_delta =
587
+ @entity_expansion_count - entity_expansion_count_before
588
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
589
+ end
590
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
591
+ rv.gsub!( re, entity_value )
592
+ if rv.bytesize > @entity_expansion_text_limit
593
+ raise "entity expansion has grown too large"
486
594
  end
595
+ else
596
+ er = DEFAULT_ENTITIES[entity_reference]
597
+ rv.gsub!( er[0], er[2] ) if er
487
598
  end
488
599
  end
489
- rv.gsub!( /&amp;/, '&' )
600
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
601
  end
491
602
  rv
492
603
  end
493
604
 
494
605
  private
606
+ def add_namespace(prefix, uri)
607
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
608
+ if uri.nil?
609
+ @namespaces.delete(prefix)
610
+ else
611
+ @namespaces[prefix] = uri
612
+ end
613
+ end
614
+
615
+ def push_namespaces_restore
616
+ namespaces_restore = {}
617
+ @namespaces_restore_stack.push(namespaces_restore)
618
+ namespaces_restore
619
+ end
620
+
621
+ def pop_namespaces_restore
622
+ namespaces_restore = @namespaces_restore_stack.pop
623
+ namespaces_restore.each do |prefix, uri|
624
+ if uri.nil?
625
+ @namespaces.delete(prefix)
626
+ else
627
+ @namespaces[prefix] = uri
628
+ end
629
+ end
630
+ end
631
+
632
+ def record_entity_expansion(delta=1)
633
+ @entity_expansion_count += delta
634
+ if @entity_expansion_count > @entity_expansion_limit
635
+ raise "number of entity expansions exceeded, processing aborted."
636
+ end
637
+ end
638
+
495
639
  def need_source_encoding_update?(xml_declaration_encoding)
496
640
  return false if xml_declaration_encoding.nil?
497
641
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
@@ -499,16 +643,16 @@ module REXML
499
643
  end
500
644
 
501
645
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
646
+ md = @source.match(Private::NAME_PATTERN, true)
503
647
  unless md
504
- if @source.match(/\A\s*\S/um)
648
+ if @source.match(/\S/um)
505
649
  message = "#{base_error_message}: invalid name"
506
650
  else
507
651
  message = "#{base_error_message}: name is missing"
508
652
  end
509
653
  raise REXML::ParseException.new(message, @source)
510
654
  end
511
- md[1]
655
+ md[0]
512
656
  end
513
657
 
514
658
  def parse_id(base_error_message,
@@ -578,96 +722,114 @@ module REXML
578
722
  end
579
723
 
580
724
  def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
582
- unless match_data
583
- message = "Invalid processing instruction node"
584
- raise REXML::ParseException.new(message, @source)
725
+ name = parse_name("Malformed XML: Invalid processing instruction node")
726
+ if @source.match(/\s+/um, true)
727
+ match_data = @source.match(/(.*?)\?>/um, true)
728
+ unless match_data
729
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
730
+ end
731
+ content = match_data[1]
732
+ else
733
+ content = nil
734
+ unless @source.match("?>", true)
735
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736
+ end
737
+ end
738
+ if name == "xml"
739
+ if @document_status
740
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
741
+ end
742
+ version = VERSION.match(content)
743
+ version = version[1] unless version.nil?
744
+ encoding = ENCODING.match(content)
745
+ encoding = encoding[1] unless encoding.nil?
746
+ if need_source_encoding_update?(encoding)
747
+ @source.encoding = encoding
748
+ end
749
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
750
+ encoding = "UTF-16"
751
+ end
752
+ standalone = STANDALONE.match(content)
753
+ standalone = standalone[1] unless standalone.nil?
754
+ return [ :xmldecl, version, encoding, standalone ]
585
755
  end
586
- [:processing_instruction, match_data[1], match_data[2]]
756
+ [:processing_instruction, name, content]
587
757
  end
588
758
 
589
- def parse_attributes(prefixes, curr_ns)
759
+ def parse_attributes(prefixes)
590
760
  attributes = {}
761
+ expanded_names = {}
591
762
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
763
+ while true
764
+ if @source.match(">", true)
765
+ return attributes, closed
766
+ elsif @source.match("/>", true)
767
+ closed = true
768
+ return attributes, closed
769
+ elsif match = @source.match(QNAME, true)
770
+ name = match[1]
771
+ prefix = match[2]
772
+ local_part = match[3]
773
+
774
+ unless @source.match(/\s*=\s*/um, true)
618
775
  message = "Missing attribute equal: <#{name}>"
619
776
  raise REXML::ParseException.new(message, @source)
620
777
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
778
+ unless match = @source.match(/(['"])/, true)
623
779
  message = "Missing attribute value start quote: <#{name}>"
624
780
  raise REXML::ParseException.new(message, @source)
625
781
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
782
+ quote = match[1]
783
+ start_position = @source.position
784
+ value = @source.read_until(quote)
785
+ unless value.chomp!(quote)
786
+ @source.position = start_position
787
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
788
  raise REXML::ParseException.new(message, @source)
639
789
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
790
+ @source.match(/\s*/um, true)
791
+ if prefix == "xmlns"
792
+ if local_part == "xml"
793
+ if value != "http://www.w3.org/XML/1998/namespace"
794
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
795
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
796
+ raise REXML::ParseException.new( msg, @source, self )
797
+ end
798
+ elsif local_part == "xmlns"
799
+ msg = "The 'xmlns' prefix must not be declared "+
650
800
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
801
+ raise REXML::ParseException.new( msg, @source, self)
652
802
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
803
+ add_namespace(local_part, value)
804
+ elsif prefix
805
+ prefixes << prefix unless prefix == "xml"
657
806
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
807
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
808
+ if attributes[name]
809
+ msg = "Duplicate attribute #{name.inspect}"
810
+ raise REXML::ParseException.new(msg, @source, self)
811
+ end
667
812
 
668
- attributes[name] = value
813
+ unless prefix == "xmlns"
814
+ uri = @namespaces[prefix]
815
+ expanded_name = [uri, local_part]
816
+ existing_prefix = expanded_names[expanded_name]
817
+ if existing_prefix
818
+ message = "Namespace conflict in adding attribute " +
819
+ "\"#{local_part}\": " +
820
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
821
+ "prefix \"#{prefix}\" = \"#{uri}\""
822
+ raise REXML::ParseException.new(message, @source, self)
823
+ end
824
+ expanded_names[expanded_name] = prefix
825
+ end
826
+
827
+ attributes[name] = value
828
+ else
829
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
830
+ raise REXML::ParseException.new(message, @source)
831
+ end
669
832
  end
670
- return attributes, closed
671
833
  end
672
834
  end
673
835
  end