rexml 3.2.5 → 3.3.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
4
  require_relative '../source'
@@ -7,6 +7,17 @@ require "strscan"
7
7
 
8
8
  module REXML
9
9
  module Parsers
10
+ if StringScanner::Version < "3.0.8"
11
+ module StringScannerCaptures
12
+ refine StringScanner do
13
+ def captures
14
+ values_at(*(1...size))
15
+ end
16
+ end
17
+ end
18
+ using StringScannerCaptures
19
+ end
20
+
10
21
  # = Using the Pull Parser
11
22
  # <em>This API is experimental, and subject to change.</em>
12
23
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +107,7 @@ module REXML
96
107
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
108
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
109
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
110
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
111
 
101
112
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
113
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,9 +123,37 @@ module REXML
112
123
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
124
  }
114
125
 
126
+ module Private
127
+ # Terminal requires two or more letters.
128
+ INSTRUCTION_TERM = "?>"
129
+ COMMENT_TERM = "-->"
130
+ CDATA_TERM = "]]>"
131
+ DOCTYPE_TERM = "]>"
132
+ # Read to the end of DOCTYPE because there is no proper ENTITY termination
133
+ ENTITY_TERM = DOCTYPE_TERM
134
+
135
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
136
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
137
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
138
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
139
+ NAME_PATTERN = /\s*#{NAME}/um
140
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
141
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
142
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
143
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
144
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
145
+ DEFAULT_ENTITIES_PATTERNS = {}
146
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
147
+ default_entities.each do |term|
148
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
149
+ end
150
+ end
151
+ private_constant :Private
152
+
115
153
  def initialize( source )
116
154
  self.stream = source
117
155
  @listeners = []
156
+ @prefixes = Set.new
118
157
  end
119
158
 
120
159
  def add_listener( listener )
@@ -126,6 +165,7 @@ module REXML
126
165
  def stream=( source )
127
166
  @source = SourceFactory.create_from( source )
128
167
  @closed = nil
168
+ @have_root = false
129
169
  @document_status = nil
130
170
  @tags = []
131
171
  @stack = []
@@ -180,6 +220,8 @@ module REXML
180
220
 
181
221
  # Returns the next event. This is a +PullEvent+ object.
182
222
  def pull
223
+ @source.drop_parsed_content
224
+
183
225
  pull_event.tap do |event|
184
226
  @listeners.each do |listener|
185
227
  listener.receive event
@@ -192,236 +234,261 @@ module REXML
192
234
  x, @closed = @closed, nil
193
235
  return [ :end_element, x ]
194
236
  end
195
- return [ :end_document ] if empty?
237
+ if empty?
238
+ if @document_status == :in_doctype
239
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
240
+ end
241
+ return [ :end_document ]
242
+ end
196
243
  return @stack.shift if @stack.size > 0
197
244
  #STDERR.puts @source.encoding
198
245
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
246
+
247
+ @source.ensure_buffer
199
248
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
223
- return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
249
+ start_position = @source.position
250
+ if @source.match("<?", true)
251
+ return process_instruction(start_position)
252
+ elsif @source.match("<!", true)
253
+ if @source.match("--", true)
254
+ md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
255
+ if md.nil?
256
+ raise REXML::ParseException.new("Unclosed comment", @source)
257
+ end
258
+ if /--|-\z/.match?(md[1])
259
+ raise REXML::ParseException.new("Malformed comment", @source)
260
+ end
261
+ return [ :comment, md[1] ]
262
+ elsif @source.match("DOCTYPE", true)
263
+ base_error_message = "Malformed DOCTYPE"
264
+ unless @source.match(/\s+/um, true)
265
+ if @source.match(">")
266
+ message = "#{base_error_message}: name is missing"
267
+ else
268
+ message = "#{base_error_message}: invalid name"
269
+ end
270
+ @source.position = start_position
271
+ raise REXML::ParseException.new(message, @source)
242
272
  end
243
- if @source.match(/\A\s*\[/um, true)
273
+ @nsstack.unshift(Set.new)
274
+ name = parse_name(base_error_message)
275
+ if @source.match(/\s*\[/um, true)
276
+ id = [nil, nil, nil]
244
277
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
278
+ elsif @source.match(/\s*>/um, true)
279
+ id = [nil, nil, nil]
246
280
  @document_status = :after_doctype
281
+ @source.ensure_buffer
247
282
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
283
+ id = parse_id(base_error_message,
284
+ accept_external_id: true,
285
+ accept_public_id: false)
286
+ if id[0] == "SYSTEM"
287
+ # For backward compatibility
288
+ id[1], id[2] = id[2], nil
289
+ end
290
+ if @source.match(/\s*\[/um, true)
291
+ @document_status = :in_doctype
292
+ elsif @source.match(/\s*>/um, true)
293
+ @document_status = :after_doctype
294
+ @source.ensure_buffer
295
+ else
296
+ message = "#{base_error_message}: garbage after external ID"
297
+ raise REXML::ParseException.new(message, @source)
298
+ end
250
299
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
300
+ args = [:start_doctype, name, *id]
301
+ if @document_status == :after_doctype
302
+ @source.match(/\s*/um, true)
303
+ @stack << [ :end_doctype ]
304
+ end
305
+ return args
306
+ else
307
+ message = "Invalid XML"
308
+ raise REXML::ParseException.new(message, @source)
263
309
  end
264
310
  end
265
311
  end
266
312
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
313
+ @source.match(/\s*/um, true) # skip spaces
314
+ start_position = @source.position
315
+ if @source.match("<!", true)
316
+ if @source.match("ELEMENT", true)
317
+ md = @source.match(/(.*?)>/um, true)
318
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
319
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
320
+ elsif @source.match("ENTITY", true)
321
+ match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM).captures.compact]
322
+ ref = false
323
+ if match[1] == '%'
324
+ ref = true
325
+ match.delete_at 1
320
326
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
327
+ # Now we have to sort out what kind of entity reference this is
328
+ if match[2] == 'SYSTEM'
329
+ # External reference
330
+ match[3] = match[3][1..-2] # PUBID
331
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
332
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
333
+ elsif match[2] == 'PUBLIC'
334
+ # External reference
335
+ match[3] = match[3][1..-2] # PUBID
336
+ match[4] = match[4][1..-2] # HREF
337
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
338
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
328
339
  else
329
- message = "#{base_error_message}: invalid declaration name"
340
+ match[2] = match[2][1..-2]
341
+ match.pop if match.size == 4
342
+ # match is [ :entity, name, value ]
330
343
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
344
+ match << '%' if ref
345
+ return match
346
+ elsif @source.match("ATTLIST", true)
347
+ md = @source.match(Private::ATTLISTDECL_END, true)
348
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
349
+ element = md[1]
350
+ contents = md[0]
351
+
352
+ pairs = {}
353
+ values = md[0].strip.scan( ATTDEF_RE )
354
+ values.each do |attdef|
355
+ unless attdef[3] == "#IMPLIED"
356
+ attdef.compact!
357
+ val = attdef[3]
358
+ val = attdef[4] if val == "#FIXED "
359
+ pairs[attdef[0]] = val
360
+ if attdef[0] =~ /^xmlns:(.*)/
361
+ @nsstack[0] << $1
362
+ end
363
+ end
364
+ end
365
+ return [ :attlistdecl, element, pairs, contents ]
366
+ elsif @source.match("NOTATION", true)
367
+ base_error_message = "Malformed notation declaration"
368
+ unless @source.match(/\s+/um, true)
369
+ if @source.match(">")
370
+ message = "#{base_error_message}: name is missing"
371
+ else
372
+ message = "#{base_error_message}: invalid name"
373
+ end
374
+ @source.position = start_position
375
+ raise REXML::ParseException.new(message, @source)
376
+ end
377
+ name = parse_name(base_error_message)
378
+ id = parse_id(base_error_message,
379
+ accept_external_id: true,
380
+ accept_public_id: true)
381
+ unless @source.match(/\s*>/um, true)
382
+ message = "#{base_error_message}: garbage before end >"
383
+ raise REXML::ParseException.new(message, @source)
384
+ end
385
+ return [:notationdecl, name, *id]
386
+ elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
387
+ case md[1]
388
+ when /--/, /-\z/
389
+ raise REXML::ParseException.new("Malformed comment", @source)
390
+ end
391
+ return [ :comment, md[1] ] if md
340
392
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
393
+ elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
394
+ return [ :externalentity, match[1] ]
395
+ elsif @source.match(/\]\s*>/um, true)
343
396
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
397
  return [ :end_doctype ]
346
398
  end
399
+ if @document_status == :in_doctype
400
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
401
+ end
347
402
  end
348
403
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
404
+ @source.match(/\s*/um, true)
350
405
  end
351
406
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
407
+ start_position = @source.position
408
+ if @source.match("<", true)
409
+ # :text's read_until may remain only "<" in buffer. In the
410
+ # case, buffer is empty here. So we need to fill buffer
411
+ # here explicitly.
412
+ @source.ensure_buffer
413
+ if @source.match("/", true)
355
414
  @nsstack.shift
356
415
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
416
+ md = @source.match(Private::CLOSE_PATTERN, true)
358
417
  if md and !last_tag
359
418
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
419
  raise REXML::ParseException.new(message, @source)
361
420
  end
362
421
  if md.nil? or last_tag != md[1]
363
422
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
423
+ message += " (got '#{md[1]}')" if md
424
+ @source.position = start_position if md.nil?
365
425
  raise REXML::ParseException.new(message, @source)
366
426
  end
367
427
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
428
+ elsif @source.match("!", true)
429
+ md = @source.match(/([^>]*>)/um)
370
430
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
431
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
432
+ if md[0][0] == ?-
433
+ md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
374
434
 
375
- case md[1]
376
- when /--/, /-\z/
435
+ if md.nil? || /--|-\z/.match?(md[1])
377
436
  raise REXML::ParseException.new("Malformed comment", @source)
378
437
  end
379
438
 
380
- return [ :comment, md[1] ] if md
439
+ return [ :comment, md[1] ]
381
440
  else
382
- md = @source.match( CDATA_PATTERN, true )
441
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
383
442
  return [ :cdata, md[1] ] if md
384
443
  end
385
444
  raise REXML::ParseException.new( "Declarations can only occur "+
386
445
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
388
- return process_instruction
446
+ elsif @source.match("?", true)
447
+ return process_instruction(start_position)
389
448
  else
390
449
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
450
+ md = @source.match(Private::TAG_PATTERN, true)
392
451
  unless md
452
+ @source.position = start_position
393
453
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
454
  end
455
+ tag = md[1]
395
456
  @document_status = :in_element
396
- prefixes = Set.new
397
- prefixes << md[2] if md[2]
457
+ @prefixes.clear
458
+ @prefixes << md[2] if md[2]
398
459
  @nsstack.unshift(curr_ns=Set.new)
399
- attributes, closed = parse_attributes(prefixes, curr_ns)
460
+ attributes, closed = parse_attributes(@prefixes, curr_ns)
400
461
  # Verify that all of the prefixes have been defined
401
- for prefix in prefixes
462
+ for prefix in @prefixes
402
463
  unless @nsstack.find{|k| k.member?(prefix)}
403
464
  raise UndefinedNamespaceException.new(prefix,@source,self)
404
465
  end
405
466
  end
406
467
 
407
468
  if closed
408
- @closed = md[1]
469
+ @closed = tag
409
470
  @nsstack.shift
410
471
  else
411
- @tags.push( md[1] )
472
+ if @tags.empty? and @have_root
473
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
474
+ end
475
+ @tags.push( tag )
412
476
  end
413
- return [ :start_element, md[1], attributes ]
477
+ @have_root = true
478
+ return [ :start_element, tag, attributes ]
414
479
  end
415
480
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
481
+ text = @source.read_until("<")
482
+ if text.chomp!("<")
483
+ @source.position -= "<".bytesize
484
+ end
485
+ if @tags.empty? and @have_root
486
+ unless /\A\s*\z/.match?(text)
487
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
488
+ end
489
+ return pull_event
419
490
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
491
+ return [ :text, text ]
425
492
  end
426
493
  rescue REXML::UndefinedNamespaceException
427
494
  raise
@@ -463,11 +530,14 @@ module REXML
463
530
 
464
531
  # Unescapes all possible entities
465
532
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
533
+ if string.include?("\r")
534
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
535
+ else
536
+ rv = string.dup
537
+ end
468
538
  matches = rv.scan( REFERENCE_RE )
469
539
  return rv if matches.size == 0
470
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
540
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
471
541
  m=$1
472
542
  m = "0#{m}" if m[0] == ?x
473
543
  [Integer(m)].pack('U*')
@@ -478,7 +548,7 @@ module REXML
478
548
  unless filter and filter.include?(entity_reference)
479
549
  entity_value = entity( entity_reference, entities )
480
550
  if entity_value
481
- re = /&#{entity_reference};/
551
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
482
552
  rv.gsub!( re, entity_value )
483
553
  else
484
554
  er = DEFAULT_ENTITIES[entity_reference]
@@ -486,7 +556,7 @@ module REXML
486
556
  end
487
557
  end
488
558
  end
489
- rv.gsub!( /&amp;/, '&' )
559
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
490
560
  end
491
561
  rv
492
562
  end
@@ -499,9 +569,9 @@ module REXML
499
569
  end
500
570
 
501
571
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
572
+ md = @source.match(Private::NAME_PATTERN, true)
503
573
  unless md
504
- if @source.match(/\A\s*\S/um)
574
+ if @source.match(/\s*\S/um)
505
575
  message = "#{base_error_message}: invalid name"
506
576
  else
507
577
  message = "#{base_error_message}: name is missing"
@@ -577,97 +647,94 @@ module REXML
577
647
  end
578
648
  end
579
649
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
650
+ def process_instruction(start_position)
651
+ match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
582
652
  unless match_data
583
653
  message = "Invalid processing instruction node"
654
+ @source.position = start_position
584
655
  raise REXML::ParseException.new(message, @source)
585
656
  end
657
+ if match_data[1] == "xml"
658
+ if @document_status
659
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
660
+ end
661
+ content = match_data[2]
662
+ version = VERSION.match(content)
663
+ version = version[1] unless version.nil?
664
+ encoding = ENCODING.match(content)
665
+ encoding = encoding[1] unless encoding.nil?
666
+ if need_source_encoding_update?(encoding)
667
+ @source.encoding = encoding
668
+ end
669
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
670
+ encoding = "UTF-16"
671
+ end
672
+ standalone = STANDALONE.match(content)
673
+ standalone = standalone[1] unless standalone.nil?
674
+ return [ :xmldecl, version, encoding, standalone ]
675
+ end
586
676
  [:processing_instruction, match_data[1], match_data[2]]
587
677
  end
588
678
 
589
679
  def parse_attributes(prefixes, curr_ns)
590
680
  attributes = {}
591
681
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
682
+ while true
683
+ if @source.match(">", true)
684
+ return attributes, closed
685
+ elsif @source.match("/>", true)
686
+ closed = true
687
+ return attributes, closed
688
+ elsif match = @source.match(QNAME, true)
689
+ name = match[1]
690
+ prefix = match[2]
691
+ local_part = match[3]
692
+
693
+ unless @source.match(/\s*=\s*/um, true)
618
694
  message = "Missing attribute equal: <#{name}>"
619
695
  raise REXML::ParseException.new(message, @source)
620
696
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
697
+ unless match = @source.match(/(['"])/, true)
623
698
  message = "Missing attribute value start quote: <#{name}>"
624
699
  raise REXML::ParseException.new(message, @source)
625
700
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
701
+ quote = match[1]
702
+ start_position = @source.position
703
+ value = @source.read_until(quote)
704
+ unless value.chomp!(quote)
705
+ @source.position = start_position
706
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
707
  raise REXML::ParseException.new(message, @source)
639
708
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
709
+ @source.match(/\s*/um, true)
710
+ if prefix == "xmlns"
711
+ if local_part == "xml"
712
+ if value != "http://www.w3.org/XML/1998/namespace"
713
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
714
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
715
+ raise REXML::ParseException.new( msg, @source, self )
716
+ end
717
+ elsif local_part == "xmlns"
718
+ msg = "The 'xmlns' prefix must not be declared "+
650
719
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
720
+ raise REXML::ParseException.new( msg, @source, self)
652
721
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
722
+ curr_ns << local_part
723
+ elsif prefix
724
+ prefixes << prefix unless prefix == "xml"
657
725
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
726
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
727
+ if attributes[name]
728
+ msg = "Duplicate attribute #{name.inspect}"
729
+ raise REXML::ParseException.new(msg, @source, self)
730
+ end
667
731
 
668
- attributes[name] = value
732
+ attributes[name] = value
733
+ else
734
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
735
+ raise REXML::ParseException.new(message, @source)
736
+ end
669
737
  end
670
- return attributes, closed
671
738
  end
672
739
  end
673
740
  end