rexml 3.2.5 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
4
  require_relative '../source'
@@ -7,6 +7,17 @@ require "strscan"
7
7
 
8
8
  module REXML
9
9
  module Parsers
10
+ if StringScanner::Version < "3.0.8"
11
+ module StringScannerCaptures
12
+ refine StringScanner do
13
+ def captures
14
+ values_at(*(1...size))
15
+ end
16
+ end
17
+ end
18
+ using StringScannerCaptures
19
+ end
20
+
10
21
  # = Using the Pull Parser
11
22
  # <em>This API is experimental, and subject to change.</em>
12
23
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +107,7 @@ module REXML
96
107
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
108
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
109
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
110
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
111
 
101
112
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
113
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +123,29 @@ module REXML
112
123
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
124
  }
114
125
 
126
+ module Private
127
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
128
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
129
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
130
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
131
+ NAME_PATTERN = /\s*#{NAME}/um
132
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
133
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
134
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
135
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
136
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
137
+ DEFAULT_ENTITIES_PATTERNS = {}
138
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
139
+ default_entities.each do |term|
140
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
141
+ end
142
+ end
143
+ private_constant :Private
144
+
115
145
  def initialize( source )
116
146
  self.stream = source
117
147
  @listeners = []
148
+ @prefixes = Set.new
118
149
  end
119
150
 
120
151
  def add_listener( listener )
@@ -180,6 +211,8 @@ module REXML
180
211
 
181
212
  # Returns the next event. This is a +PullEvent+ object.
182
213
  def pull
214
+ @source.drop_parsed_content
215
+
183
216
  pull_event.tap do |event|
184
217
  @listeners.each do |listener|
185
218
  listener.receive event
@@ -192,236 +225,251 @@ module REXML
192
225
  x, @closed = @closed, nil
193
226
  return [ :end_element, x ]
194
227
  end
195
- return [ :end_document ] if empty?
228
+ if empty?
229
+ if @document_status == :in_doctype
230
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
231
+ end
232
+ return [ :end_document ]
233
+ end
196
234
  return @stack.shift if @stack.size > 0
197
235
  #STDERR.puts @source.encoding
198
236
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
237
+
238
+ @source.ensure_buffer
199
239
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
223
- return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
240
+ start_position = @source.position
241
+ if @source.match("<?", true)
242
+ return process_instruction(start_position)
243
+ elsif @source.match("<!", true)
244
+ if @source.match("--", true)
245
+ md = @source.match(/(.*?)-->/um, true)
246
+ if md.nil?
247
+ raise REXML::ParseException.new("Unclosed comment", @source)
248
+ end
249
+ if /--|-\z/.match?(md[1])
250
+ raise REXML::ParseException.new("Malformed comment", @source)
242
251
  end
243
- if @source.match(/\A\s*\[/um, true)
252
+ return [ :comment, md[1] ]
253
+ elsif @source.match("DOCTYPE", true)
254
+ base_error_message = "Malformed DOCTYPE"
255
+ unless @source.match(/\s+/um, true)
256
+ if @source.match(">")
257
+ message = "#{base_error_message}: name is missing"
258
+ else
259
+ message = "#{base_error_message}: invalid name"
260
+ end
261
+ @source.position = start_position
262
+ raise REXML::ParseException.new(message, @source)
263
+ end
264
+ @nsstack.unshift(Set.new)
265
+ name = parse_name(base_error_message)
266
+ if @source.match(/\s*\[/um, true)
267
+ id = [nil, nil, nil]
244
268
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
269
+ elsif @source.match(/\s*>/um, true)
270
+ id = [nil, nil, nil]
246
271
  @document_status = :after_doctype
272
+ @source.ensure_buffer
247
273
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
274
+ id = parse_id(base_error_message,
275
+ accept_external_id: true,
276
+ accept_public_id: false)
277
+ if id[0] == "SYSTEM"
278
+ # For backward compatibility
279
+ id[1], id[2] = id[2], nil
280
+ end
281
+ if @source.match(/\s*\[/um, true)
282
+ @document_status = :in_doctype
283
+ elsif @source.match(/\s*>/um, true)
284
+ @document_status = :after_doctype
285
+ @source.ensure_buffer
286
+ else
287
+ message = "#{base_error_message}: garbage after external ID"
288
+ raise REXML::ParseException.new(message, @source)
289
+ end
250
290
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
291
+ args = [:start_doctype, name, *id]
292
+ if @document_status == :after_doctype
293
+ @source.match(/\s*/um, true)
294
+ @stack << [ :end_doctype ]
295
+ end
296
+ return args
297
+ else
298
+ message = "Invalid XML"
299
+ raise REXML::ParseException.new(message, @source)
263
300
  end
264
301
  end
265
302
  end
266
303
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
304
+ @source.match(/\s*/um, true) # skip spaces
305
+ start_position = @source.position
306
+ if @source.match("<!", true)
307
+ if @source.match("ELEMENT", true)
308
+ md = @source.match(/(.*?)>/um, true)
309
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
310
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
311
+ elsif @source.match("ENTITY", true)
312
+ match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
313
+ ref = false
314
+ if match[1] == '%'
315
+ ref = true
316
+ match.delete_at 1
320
317
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
318
+ # Now we have to sort out what kind of entity reference this is
319
+ if match[2] == 'SYSTEM'
320
+ # External reference
321
+ match[3] = match[3][1..-2] # PUBID
322
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
323
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
324
+ elsif match[2] == 'PUBLIC'
325
+ # External reference
326
+ match[3] = match[3][1..-2] # PUBID
327
+ match[4] = match[4][1..-2] # HREF
328
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
329
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
328
330
  else
329
- message = "#{base_error_message}: invalid declaration name"
331
+ match[2] = match[2][1..-2]
332
+ match.pop if match.size == 4
333
+ # match is [ :entity, name, value ]
330
334
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
335
+ match << '%' if ref
336
+ return match
337
+ elsif @source.match("ATTLIST", true)
338
+ md = @source.match(Private::ATTLISTDECL_END, true)
339
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
340
+ element = md[1]
341
+ contents = md[0]
342
+
343
+ pairs = {}
344
+ values = md[0].scan( ATTDEF_RE )
345
+ values.each do |attdef|
346
+ unless attdef[3] == "#IMPLIED"
347
+ attdef.compact!
348
+ val = attdef[3]
349
+ val = attdef[4] if val == "#FIXED "
350
+ pairs[attdef[0]] = val
351
+ if attdef[0] =~ /^xmlns:(.*)/
352
+ @nsstack[0] << $1
353
+ end
354
+ end
355
+ end
356
+ return [ :attlistdecl, element, pairs, contents ]
357
+ elsif @source.match("NOTATION", true)
358
+ base_error_message = "Malformed notation declaration"
359
+ unless @source.match(/\s+/um, true)
360
+ if @source.match(">")
361
+ message = "#{base_error_message}: name is missing"
362
+ else
363
+ message = "#{base_error_message}: invalid name"
364
+ end
365
+ @source.position = start_position
366
+ raise REXML::ParseException.new(message, @source)
367
+ end
368
+ name = parse_name(base_error_message)
369
+ id = parse_id(base_error_message,
370
+ accept_external_id: true,
371
+ accept_public_id: true)
372
+ unless @source.match(/\s*>/um, true)
373
+ message = "#{base_error_message}: garbage before end >"
374
+ raise REXML::ParseException.new(message, @source)
375
+ end
376
+ return [:notationdecl, name, *id]
377
+ elsif md = @source.match(/--(.*?)-->/um, true)
378
+ case md[1]
379
+ when /--/, /-\z/
380
+ raise REXML::ParseException.new("Malformed comment", @source)
381
+ end
382
+ return [ :comment, md[1] ] if md
340
383
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
384
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
385
+ return [ :externalentity, match[1] ]
386
+ elsif @source.match(/\]\s*>/um, true)
343
387
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
388
  return [ :end_doctype ]
346
389
  end
390
+ if @document_status == :in_doctype
391
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
392
+ end
347
393
  end
348
394
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
395
+ @source.match(/\s*/um, true)
350
396
  end
351
397
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
398
+ start_position = @source.position
399
+ if @source.match("<", true)
400
+ # :text's read_until may remain only "<" in buffer. In the
401
+ # case, buffer is empty here. So we need to fill buffer
402
+ # here explicitly.
403
+ @source.ensure_buffer
404
+ if @source.match("/", true)
355
405
  @nsstack.shift
356
406
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
407
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
408
  if md and !last_tag
359
409
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
410
  raise REXML::ParseException.new(message, @source)
361
411
  end
362
412
  if md.nil? or last_tag != md[1]
363
413
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
414
+ message += " (got '#{md[1]}')" if md
415
+ @source.position = start_position if md.nil?
365
416
  raise REXML::ParseException.new(message, @source)
366
417
  end
367
418
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
419
+ elsif @source.match("!", true)
420
+ md = @source.match(/([^>]*>)/um)
370
421
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
422
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
423
+ if md[0][0] == ?-
424
+ md = @source.match(/--(.*?)-->/um, true)
374
425
 
375
- case md[1]
376
- when /--/, /-\z/
426
+ if md.nil? || /--|-\z/.match?(md[1])
377
427
  raise REXML::ParseException.new("Malformed comment", @source)
378
428
  end
379
429
 
380
- return [ :comment, md[1] ] if md
430
+ return [ :comment, md[1] ]
381
431
  else
382
- md = @source.match( CDATA_PATTERN, true )
432
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
433
  return [ :cdata, md[1] ] if md
384
434
  end
385
435
  raise REXML::ParseException.new( "Declarations can only occur "+
386
436
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
388
- return process_instruction
437
+ elsif @source.match("?", true)
438
+ return process_instruction(start_position)
389
439
  else
390
440
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
441
+ md = @source.match(Private::TAG_PATTERN, true)
392
442
  unless md
443
+ @source.position = start_position
393
444
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
445
  end
446
+ tag = md[1]
395
447
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
448
+ @prefixes.clear
449
+ @prefixes << md[2] if md[2]
398
450
  @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
451
+ attributes, closed = parse_attributes(@prefixes, curr_ns)
400
452
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
453
+ for prefix in @prefixes
402
454
  unless @nsstack.find{|k| k.member?(prefix)}
403
455
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
456
  end
405
457
  end
406
458
 
407
459
  if closed
408
- @closed = md[1]
460
+ @closed = tag
409
461
  @nsstack.shift
410
462
  else
411
- @tags.push( md[1] )
463
+ @tags.push( tag )
412
464
  end
413
- return [ :start_element, md[1], attributes ]
465
+ return [ :start_element, tag, attributes ]
414
466
  end
415
467
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
468
+ text = @source.read_until("<")
469
+ if text.chomp!("<")
470
+ @source.position -= "<".bytesize
419
471
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
472
+ return [ :text, text ]
425
473
  end
426
474
  rescue REXML::UndefinedNamespaceException
427
475
  raise
@@ -463,11 +511,10 @@ module REXML
463
511
 
464
512
  # Unescapes all possible entities
465
513
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
514
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
468
515
  matches = rv.scan( REFERENCE_RE )
469
516
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
517
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
518
  m=$1
472
519
  m = "0#{m}" if m[0] == ?x
473
520
  [Integer(m)].pack('U*')
@@ -478,7 +525,7 @@ module REXML
478
525
  unless filter and filter.include?(entity_reference)
479
526
  entity_value = entity( entity_reference, entities )
480
527
  if entity_value
481
- re = /&#{entity_reference};/
528
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
482
529
  rv.gsub!( re, entity_value )
483
530
  else
484
531
  er = DEFAULT_ENTITIES[entity_reference]
@@ -486,7 +533,7 @@ module REXML
486
533
  end
487
534
  end
488
535
  end
489
- rv.gsub!( /&amp;/, '&' )
536
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
537
  end
491
538
  rv
492
539
  end
@@ -499,9 +546,9 @@ module REXML
499
546
  end
500
547
 
501
548
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
549
+ md = @source.match(Private::NAME_PATTERN, true)
503
550
  unless md
504
- if @source.match(/\A\s*\S/um)
551
+ if @source.match(/\s*\S/um)
505
552
  message = "#{base_error_message}: invalid name"
506
553
  else
507
554
  message = "#{base_error_message}: name is missing"
@@ -577,97 +624,91 @@ module REXML
577
624
  end
578
625
  end
579
626
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
627
+ def process_instruction(start_position)
628
+ match_data = @source.match(Private::INSTRUCTION_END, true)
582
629
  unless match_data
583
630
  message = "Invalid processing instruction node"
631
+ @source.position = start_position
584
632
  raise REXML::ParseException.new(message, @source)
585
633
  end
634
+ if @document_status.nil? and match_data[1] == "xml"
635
+ content = match_data[2]
636
+ version = VERSION.match(content)
637
+ version = version[1] unless version.nil?
638
+ encoding = ENCODING.match(content)
639
+ encoding = encoding[1] unless encoding.nil?
640
+ if need_source_encoding_update?(encoding)
641
+ @source.encoding = encoding
642
+ end
643
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
644
+ encoding = "UTF-16"
645
+ end
646
+ standalone = STANDALONE.match(content)
647
+ standalone = standalone[1] unless standalone.nil?
648
+ return [ :xmldecl, version, encoding, standalone ]
649
+ end
586
650
  [:processing_instruction, match_data[1], match_data[2]]
587
651
  end
588
652
 
589
653
  def parse_attributes(prefixes, curr_ns)
590
654
  attributes = {}
591
655
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
656
+ while true
657
+ if @source.match(">", true)
658
+ return attributes, closed
659
+ elsif @source.match("/>", true)
660
+ closed = true
661
+ return attributes, closed
662
+ elsif match = @source.match(QNAME, true)
663
+ name = match[1]
664
+ prefix = match[2]
665
+ local_part = match[3]
666
+
667
+ unless @source.match(/\s*=\s*/um, true)
618
668
  message = "Missing attribute equal: <#{name}>"
619
669
  raise REXML::ParseException.new(message, @source)
620
670
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
671
+ unless match = @source.match(/(['"])/, true)
623
672
  message = "Missing attribute value start quote: <#{name}>"
624
673
  raise REXML::ParseException.new(message, @source)
625
674
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
675
+ quote = match[1]
676
+ start_position = @source.position
677
+ value = @source.read_until(quote)
678
+ unless value.chomp!(quote)
679
+ @source.position = start_position
680
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
681
  raise REXML::ParseException.new(message, @source)
639
682
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
683
+ @source.match(/\s*/um, true)
684
+ if prefix == "xmlns"
685
+ if local_part == "xml"
686
+ if value != "http://www.w3.org/XML/1998/namespace"
687
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
688
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
689
+ raise REXML::ParseException.new( msg, @source, self )
690
+ end
691
+ elsif local_part == "xmlns"
692
+ msg = "The 'xmlns' prefix must not be declared "+
650
693
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
694
+ raise REXML::ParseException.new( msg, @source, self)
652
695
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
696
+ curr_ns << local_part
697
+ elsif prefix
698
+ prefixes << prefix unless prefix == "xml"
657
699
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
700
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
701
+ if attributes[name]
702
+ msg = "Duplicate attribute #{name.inspect}"
703
+ raise REXML::ParseException.new(msg, @source, self)
704
+ end
667
705
 
668
- attributes[name] = value
706
+ attributes[name] = value
707
+ else
708
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
709
+ raise REXML::ParseException.new(message, @source)
710
+ end
669
711
  end
670
- return attributes, closed
671
712
  end
672
713
  end
673
714
  end