rexml 3.2.3 → 3.2.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
4
  require_relative '../source'
@@ -50,7 +50,6 @@ module REXML
50
50
 
51
51
  DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
52
52
  DOCTYPE_END = /\A\s*\]\s*>/um
53
- DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
54
53
  ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
55
54
  COMMENT_START = /\A<!--/u
56
55
  COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -61,15 +60,14 @@ module REXML
61
60
  XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
62
61
  INSTRUCTION_START = /\A<\?/u
63
62
  INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
64
- TAG_MATCH = /^<((?>#{QNAME_STR}))/um
65
- CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um
63
+ TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
64
+ CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
66
65
 
67
66
  VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
68
67
  ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
69
68
  STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
70
69
 
71
70
  ENTITY_START = /\A\s*<!ENTITY/
72
- IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
73
71
  ELEMENTDECL_START = /\A\s*<!ELEMENT/um
74
72
  ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
75
73
  SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
@@ -83,9 +81,6 @@ module REXML
83
81
  ATTDEF_RE = /#{ATTDEF}/
84
82
  ATTLISTDECL_START = /\A\s*<!ATTLIST/um
85
83
  ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
86
- NOTATIONDECL_START = /\A\s*<!NOTATION/um
87
- PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
88
- SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
89
84
 
90
85
  TEXT_PATTERN = /\A([^<]*)/um
91
86
 
@@ -101,7 +96,12 @@ module REXML
101
96
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
102
97
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
103
98
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
104
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
99
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
+
101
+ NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
+ EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
103
+ EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
104
+ PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
105
105
 
106
106
  EREFERENCE = /&(?!#{NAME};)/
107
107
 
@@ -112,6 +112,19 @@ module REXML
112
112
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
113
  }
114
114
 
115
+ module Private
116
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
117
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
118
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
119
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
120
+ NAME_PATTERN = /\s*#{NAME}/um
121
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
122
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
123
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
124
+ end
125
+ private_constant :Private
126
+ include Private
127
+
115
128
  def initialize( source )
116
129
  self.stream = source
117
130
  @listeners = []
@@ -195,162 +208,181 @@ module REXML
195
208
  return [ :end_document ] if empty?
196
209
  return @stack.shift if @stack.size > 0
197
210
  #STDERR.puts @source.encoding
198
- @source.read if @source.buffer.size<2
199
211
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
212
+
213
+ @source.ensure_buffer
200
214
  if @document_status == nil
201
- #@source.consume( /^\s*/um )
202
- word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
203
- word = word[1] unless word.nil?
204
- #STDERR.puts "WORD = #{word.inspect}"
205
- case word
206
- when COMMENT_START
207
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
208
- when XMLDECL_START
209
- #STDERR.puts "XMLDECL"
210
- results = @source.match( XMLDECL_PATTERN, true )[1]
211
- version = VERSION.match( results )
212
- version = version[1] unless version.nil?
213
- encoding = ENCODING.match(results)
214
- encoding = encoding[1] unless encoding.nil?
215
- if need_source_encoding_update?(encoding)
216
- @source.encoding = encoding
217
- end
218
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
219
- encoding = "UTF-16"
220
- end
221
- standalone = STANDALONE.match(results)
222
- standalone = standalone[1] unless standalone.nil?
223
- return [ :xmldecl, version, encoding, standalone ]
224
- when INSTRUCTION_START
225
- return process_instruction
226
- when DOCTYPE_START
227
- md = @source.match( DOCTYPE_PATTERN, true )
228
- @nsstack.unshift(curr_ns=Set.new)
229
- identity = md[1]
230
- close = md[2]
231
- identity =~ IDENTITY
232
- name = $1
233
- raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
234
- pub_sys = $2.nil? ? nil : $2.strip
235
- long_name = $4.nil? ? nil : $4.strip
236
- uri = $6.nil? ? nil : $6.strip
237
- args = [ :start_doctype, name, pub_sys, long_name, uri ]
238
- if close == ">"
239
- @document_status = :after_doctype
240
- @source.read if @source.buffer.size<2
241
- md = @source.match(/^\s*/um, true)
242
- @stack << [ :end_doctype ]
215
+ start_position = @source.position
216
+ if @source.match("<?", true)
217
+ return process_instruction(start_position)
218
+ elsif @source.match("<!", true)
219
+ if @source.match("--", true)
220
+ return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
221
+ elsif @source.match("DOCTYPE", true)
222
+ base_error_message = "Malformed DOCTYPE"
223
+ unless @source.match(/\s+/um, true)
224
+ if @source.match(">")
225
+ message = "#{base_error_message}: name is missing"
226
+ else
227
+ message = "#{base_error_message}: invalid name"
228
+ end
229
+ @source.position = start_position
230
+ raise REXML::ParseException.new(message, @source)
231
+ end
232
+ @nsstack.unshift(curr_ns=Set.new)
233
+ name = parse_name(base_error_message)
234
+ if @source.match(/\s*\[/um, true)
235
+ id = [nil, nil, nil]
236
+ @document_status = :in_doctype
237
+ elsif @source.match(/\s*>/um, true)
238
+ id = [nil, nil, nil]
239
+ @document_status = :after_doctype
240
+ @source.ensure_buffer
241
+ else
242
+ id = parse_id(base_error_message,
243
+ accept_external_id: true,
244
+ accept_public_id: false)
245
+ if id[0] == "SYSTEM"
246
+ # For backward compatibility
247
+ id[1], id[2] = id[2], nil
248
+ end
249
+ if @source.match(/\s*\[/um, true)
250
+ @document_status = :in_doctype
251
+ elsif @source.match(/\s*>/um, true)
252
+ @document_status = :after_doctype
253
+ @source.ensure_buffer
254
+ else
255
+ message = "#{base_error_message}: garbage after external ID"
256
+ raise REXML::ParseException.new(message, @source)
257
+ end
258
+ end
259
+ args = [:start_doctype, name, *id]
260
+ if @document_status == :after_doctype
261
+ @source.match(/\s*/um, true)
262
+ @stack << [ :end_doctype ]
263
+ end
264
+ return args
243
265
  else
244
- @document_status = :in_doctype
245
- end
246
- return args
247
- when /^\s+/
248
- else
249
- @document_status = :after_doctype
250
- @source.read if @source.buffer.size<2
251
- md = @source.match(/\s*/um, true)
252
- if @source.encoding == "UTF-8"
253
- @source.buffer.force_encoding(::Encoding::UTF_8)
266
+ message = "Invalid XML"
267
+ raise REXML::ParseException.new(message, @source)
254
268
  end
255
269
  end
256
270
  end
257
271
  if @document_status == :in_doctype
258
- md = @source.match(/\s*(.*?>)/um)
259
- case md[1]
260
- when SYSTEMENTITY
261
- match = @source.match( SYSTEMENTITY, true )[1]
262
- return [ :externalentity, match ]
263
-
264
- when ELEMENTDECL_START
265
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
266
-
267
- when ENTITY_START
268
- match = @source.match( ENTITYDECL, true ).to_a.compact
269
- match[0] = :entitydecl
270
- ref = false
271
- if match[1] == '%'
272
- ref = true
273
- match.delete_at 1
274
- end
275
- # Now we have to sort out what kind of entity reference this is
276
- if match[2] == 'SYSTEM'
277
- # External reference
278
- match[3] = match[3][1..-2] # PUBID
279
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
280
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
281
- elsif match[2] == 'PUBLIC'
282
- # External reference
283
- match[3] = match[3][1..-2] # PUBID
284
- match[4] = match[4][1..-2] # HREF
285
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
286
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
287
- else
288
- match[2] = match[2][1..-2]
289
- match.pop if match.size == 4
290
- # match is [ :entity, name, value ]
291
- end
292
- match << '%' if ref
293
- return match
294
- when ATTLISTDECL_START
295
- md = @source.match( ATTLISTDECL_PATTERN, true )
296
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
297
- element = md[1]
298
- contents = md[0]
299
-
300
- pairs = {}
301
- values = md[0].scan( ATTDEF_RE )
302
- values.each do |attdef|
303
- unless attdef[3] == "#IMPLIED"
304
- attdef.compact!
305
- val = attdef[3]
306
- val = attdef[4] if val == "#FIXED "
307
- pairs[attdef[0]] = val
308
- if attdef[0] =~ /^xmlns:(.*)/
309
- @nsstack[0] << $1
272
+ @source.match(/\s*/um, true) # skip spaces
273
+ start_position = @source.position
274
+ if @source.match("<!", true)
275
+ if @source.match("ELEMENT", true)
276
+ md = @source.match(/(.*?)>/um, true)
277
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
278
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
279
+ elsif @source.match("ENTITY", true)
280
+ match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
281
+ ref = false
282
+ if match[1] == '%'
283
+ ref = true
284
+ match.delete_at 1
285
+ end
286
+ # Now we have to sort out what kind of entity reference this is
287
+ if match[2] == 'SYSTEM'
288
+ # External reference
289
+ match[3] = match[3][1..-2] # PUBID
290
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
291
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
292
+ elsif match[2] == 'PUBLIC'
293
+ # External reference
294
+ match[3] = match[3][1..-2] # PUBID
295
+ match[4] = match[4][1..-2] # HREF
296
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
297
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
298
+ else
299
+ match[2] = match[2][1..-2]
300
+ match.pop if match.size == 4
301
+ # match is [ :entity, name, value ]
302
+ end
303
+ match << '%' if ref
304
+ return match
305
+ elsif @source.match("ATTLIST", true)
306
+ md = @source.match(ATTLISTDECL_END, true)
307
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
308
+ element = md[1]
309
+ contents = md[0]
310
+
311
+ pairs = {}
312
+ values = md[0].scan( ATTDEF_RE )
313
+ values.each do |attdef|
314
+ unless attdef[3] == "#IMPLIED"
315
+ attdef.compact!
316
+ val = attdef[3]
317
+ val = attdef[4] if val == "#FIXED "
318
+ pairs[attdef[0]] = val
319
+ if attdef[0] =~ /^xmlns:(.*)/
320
+ @nsstack[0] << $1
321
+ end
310
322
  end
311
323
  end
324
+ return [ :attlistdecl, element, pairs, contents ]
325
+ elsif @source.match("NOTATION", true)
326
+ base_error_message = "Malformed notation declaration"
327
+ unless @source.match(/\s+/um, true)
328
+ if @source.match(">")
329
+ message = "#{base_error_message}: name is missing"
330
+ else
331
+ message = "#{base_error_message}: invalid name"
332
+ end
333
+ @source.position = start_position
334
+ raise REXML::ParseException.new(message, @source)
335
+ end
336
+ name = parse_name(base_error_message)
337
+ id = parse_id(base_error_message,
338
+ accept_external_id: true,
339
+ accept_public_id: true)
340
+ unless @source.match(/\s*>/um, true)
341
+ message = "#{base_error_message}: garbage before end >"
342
+ raise REXML::ParseException.new(message, @source)
343
+ end
344
+ return [:notationdecl, name, *id]
345
+ elsif md = @source.match(/--(.*?)-->/um, true)
346
+ case md[1]
347
+ when /--/, /-\z/
348
+ raise REXML::ParseException.new("Malformed comment", @source)
349
+ end
350
+ return [ :comment, md[1] ] if md
312
351
  end
313
- return [ :attlistdecl, element, pairs, contents ]
314
- when NOTATIONDECL_START
315
- md = nil
316
- if @source.match( PUBLIC )
317
- md = @source.match( PUBLIC, true )
318
- vals = [md[1],md[2],md[4],md[6]]
319
- elsif @source.match( SYSTEM )
320
- md = @source.match( SYSTEM, true )
321
- vals = [md[1],md[2],nil,md[4]]
322
- else
323
- raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
324
- end
325
- return [ :notationdecl, *vals ]
326
- when DOCTYPE_END
352
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
353
+ return [ :externalentity, match[1] ]
354
+ elsif @source.match(/\]\s*>/um, true)
327
355
  @document_status = :after_doctype
328
- @source.match( DOCTYPE_END, true )
329
356
  return [ :end_doctype ]
330
357
  end
331
358
  end
359
+ if @document_status == :after_doctype
360
+ @source.match(/\s*/um, true)
361
+ end
332
362
  begin
333
- if @source.buffer[0] == ?<
334
- if @source.buffer[1] == ?/
363
+ start_position = @source.position
364
+ if @source.match("<", true)
365
+ if @source.match("/", true)
335
366
  @nsstack.shift
336
367
  last_tag = @tags.pop
337
- md = @source.match( CLOSE_MATCH, true )
368
+ md = @source.match(CLOSE_PATTERN, true)
338
369
  if md and !last_tag
339
370
  message = "Unexpected top-level end tag (got '#{md[1]}')"
340
371
  raise REXML::ParseException.new(message, @source)
341
372
  end
342
373
  if md.nil? or last_tag != md[1]
343
374
  message = "Missing end tag for '#{last_tag}'"
344
- message << " (got '#{md[1]}')" if md
375
+ message += " (got '#{md[1]}')" if md
376
+ @source.position = start_position if md.nil?
345
377
  raise REXML::ParseException.new(message, @source)
346
378
  end
347
379
  return [ :end_element, last_tag ]
348
- elsif @source.buffer[1] == ?!
349
- md = @source.match(/\A(\s*[^>]*>)/um)
380
+ elsif @source.match("!", true)
381
+ md = @source.match(/([^>]*>)/um)
350
382
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
351
383
  raise REXML::ParseException.new("Malformed node", @source) unless md
352
- if md[0][2] == ?-
353
- md = @source.match( COMMENT_PATTERN, true )
384
+ if md[0][0] == ?-
385
+ md = @source.match(/--(.*?)-->/um, true)
354
386
 
355
387
  case md[1]
356
388
  when /--/, /-\z/
@@ -359,19 +391,22 @@ module REXML
359
391
 
360
392
  return [ :comment, md[1] ] if md
361
393
  else
362
- md = @source.match( CDATA_PATTERN, true )
394
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
363
395
  return [ :cdata, md[1] ] if md
364
396
  end
365
397
  raise REXML::ParseException.new( "Declarations can only occur "+
366
398
  "in the doctype declaration.", @source)
367
- elsif @source.buffer[1] == ??
368
- return process_instruction
399
+ elsif @source.match("?", true)
400
+ return process_instruction(start_position)
369
401
  else
370
402
  # Get the next tag
371
- md = @source.match(TAG_MATCH, true)
403
+ md = @source.match(TAG_PATTERN, true)
372
404
  unless md
405
+ @source.position = start_position
373
406
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
374
407
  end
408
+ tag = md[1]
409
+ @document_status = :in_element
375
410
  prefixes = Set.new
376
411
  prefixes << md[2] if md[2]
377
412
  @nsstack.unshift(curr_ns=Set.new)
@@ -384,23 +419,17 @@ module REXML
384
419
  end
385
420
 
386
421
  if closed
387
- @closed = md[1]
422
+ @closed = tag
388
423
  @nsstack.shift
389
424
  else
390
- @tags.push( md[1] )
425
+ @tags.push( tag )
391
426
  end
392
- return [ :start_element, md[1], attributes ]
427
+ return [ :start_element, tag, attributes ]
393
428
  end
394
429
  else
395
- md = @source.match( TEXT_PATTERN, true )
396
- if md[0].length == 0
397
- @source.match( /(\s+)/, true )
398
- end
399
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
400
- #return [ :text, "" ] if md[0].length == 0
401
- # unnormalized = Text::unnormalize( md[1], self )
402
- # return PullEvent.new( :text, md[1], unnormalized )
403
- return [ :text, md[1] ]
430
+ md = @source.match(/([^<]*)/um, true)
431
+ text = md[1]
432
+ return [ :text, text ]
404
433
  end
405
434
  rescue REXML::UndefinedNamespaceException
406
435
  raise
@@ -442,8 +471,7 @@ module REXML
442
471
 
443
472
  # Unescapes all possible entities
444
473
  def unnormalize( string, entities=nil, filter=nil )
445
- rv = string.clone
446
- rv.gsub!( /\r\n?/, "\n" )
474
+ rv = string.gsub( /\r\n?/, "\n" )
447
475
  matches = rv.scan( REFERENCE_RE )
448
476
  return rv if matches.size == 0
449
477
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -477,97 +505,168 @@ module REXML
477
505
  true
478
506
  end
479
507
 
480
- def process_instruction
481
- match_data = @source.match(INSTRUCTION_PATTERN, true)
482
- unless match_data
483
- message = "Invalid processing instruction node"
508
+ def parse_name(base_error_message)
509
+ md = @source.match(NAME_PATTERN, true)
510
+ unless md
511
+ if @source.match(/\s*\S/um)
512
+ message = "#{base_error_message}: invalid name"
513
+ else
514
+ message = "#{base_error_message}: name is missing"
515
+ end
484
516
  raise REXML::ParseException.new(message, @source)
485
517
  end
486
- [:processing_instruction, match_data[1], match_data[2]]
518
+ md[1]
487
519
  end
488
520
 
489
- def parse_attributes(prefixes, curr_ns)
490
- attributes = {}
491
- closed = false
492
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
493
- if match_data.nil?
494
- message = "Start tag isn't ended"
521
+ def parse_id(base_error_message,
522
+ accept_external_id:,
523
+ accept_public_id:)
524
+ if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
525
+ pubid = system = nil
526
+ pubid_literal = md[1]
527
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
528
+ system_literal = md[2]
529
+ system = system_literal[1..-2] if system_literal # Remove quote
530
+ ["PUBLIC", pubid, system]
531
+ elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
532
+ pubid = system = nil
533
+ pubid_literal = md[1]
534
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
535
+ ["PUBLIC", pubid, nil]
536
+ elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
537
+ system = nil
538
+ system_literal = md[1]
539
+ system = system_literal[1..-2] if system_literal # Remove quote
540
+ ["SYSTEM", nil, system]
541
+ else
542
+ details = parse_id_invalid_details(accept_external_id: accept_external_id,
543
+ accept_public_id: accept_public_id)
544
+ message = "#{base_error_message}: #{details}"
495
545
  raise REXML::ParseException.new(message, @source)
496
546
  end
547
+ end
497
548
 
498
- raw_attributes = match_data[1]
499
- closed = !match_data[2].nil?
500
- return attributes, closed if raw_attributes.nil?
501
- return attributes, closed if raw_attributes.empty?
549
+ def parse_id_invalid_details(accept_external_id:,
550
+ accept_public_id:)
551
+ public = /\A\s*PUBLIC/um
552
+ system = /\A\s*SYSTEM/um
553
+ if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
554
+ if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
555
+ return "public ID literal is missing"
556
+ end
557
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
558
+ return "invalid public ID literal"
559
+ end
560
+ if accept_public_id
561
+ if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
562
+ return "system ID literal is missing"
563
+ end
564
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
565
+ return "invalid system literal"
566
+ end
567
+ "garbage after system literal"
568
+ else
569
+ "garbage after public ID literal"
570
+ end
571
+ elsif accept_external_id and @source.match(/#{system}/um)
572
+ if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
573
+ return "system literal is missing"
574
+ end
575
+ unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
576
+ return "invalid system literal"
577
+ end
578
+ "garbage after system literal"
579
+ else
580
+ unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
581
+ return "invalid ID type"
582
+ end
583
+ "ID type is missing"
584
+ end
585
+ end
502
586
 
503
- scanner = StringScanner.new(raw_attributes)
504
- until scanner.eos?
505
- if scanner.scan(/\s+/)
506
- break if scanner.eos?
587
+ def process_instruction(start_position)
588
+ match_data = @source.match(INSTRUCTION_END, true)
589
+ unless match_data
590
+ message = "Invalid processing instruction node"
591
+ @source.position = start_position
592
+ raise REXML::ParseException.new(message, @source)
593
+ end
594
+ if @document_status.nil? and match_data[1] == "xml"
595
+ content = match_data[2]
596
+ version = VERSION.match(content)
597
+ version = version[1] unless version.nil?
598
+ encoding = ENCODING.match(content)
599
+ encoding = encoding[1] unless encoding.nil?
600
+ if need_source_encoding_update?(encoding)
601
+ @source.encoding = encoding
507
602
  end
603
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
604
+ encoding = "UTF-16"
605
+ end
606
+ standalone = STANDALONE.match(content)
607
+ standalone = standalone[1] unless standalone.nil?
608
+ return [ :xmldecl, version, encoding, standalone ]
609
+ end
610
+ [:processing_instruction, match_data[1], match_data[2]]
611
+ end
508
612
 
509
- pos = scanner.pos
510
- loop do
511
- break if scanner.scan(ATTRIBUTE_PATTERN)
512
- unless scanner.scan(QNAME)
513
- message = "Invalid attribute name: <#{scanner.rest}>"
514
- raise REXML::ParseException.new(message, @source)
515
- end
516
- name = scanner[0]
517
- unless scanner.scan(/\s*=\s*/um)
613
+ def parse_attributes(prefixes, curr_ns)
614
+ attributes = {}
615
+ closed = false
616
+ while true
617
+ if @source.match(">", true)
618
+ return attributes, closed
619
+ elsif @source.match("/>", true)
620
+ closed = true
621
+ return attributes, closed
622
+ elsif match = @source.match(QNAME, true)
623
+ name = match[1]
624
+ prefix = match[2]
625
+ local_part = match[3]
626
+
627
+ unless @source.match(/\s*=\s*/um, true)
518
628
  message = "Missing attribute equal: <#{name}>"
519
629
  raise REXML::ParseException.new(message, @source)
520
630
  end
521
- quote = scanner.scan(/['"]/)
522
- unless quote
631
+ unless match = @source.match(/(['"])/, true)
523
632
  message = "Missing attribute value start quote: <#{name}>"
524
633
  raise REXML::ParseException.new(message, @source)
525
634
  end
526
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
527
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
528
- if match_data
529
- scanner << "/" if closed
530
- scanner << ">"
531
- scanner << match_data[1]
532
- scanner.pos = pos
533
- closed = !match_data[2].nil?
534
- next
535
- end
536
- message =
537
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
635
+ quote = match[1]
636
+ value = @source.read_until(quote)
637
+ unless value.chomp!(quote)
638
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
538
639
  raise REXML::ParseException.new(message, @source)
539
640
  end
540
- end
541
- name = scanner[1]
542
- prefix = scanner[2]
543
- local_part = scanner[3]
544
- # quote = scanner[4]
545
- value = scanner[5]
546
- if prefix == "xmlns"
547
- if local_part == "xml"
548
- if value != "http://www.w3.org/XML/1998/namespace"
549
- msg = "The 'xml' prefix must not be bound to any other namespace "+
641
+ @source.match(/\s*/um, true)
642
+ if prefix == "xmlns"
643
+ if local_part == "xml"
644
+ if value != "http://www.w3.org/XML/1998/namespace"
645
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
646
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
647
+ raise REXML::ParseException.new( msg, @source, self )
648
+ end
649
+ elsif local_part == "xmlns"
650
+ msg = "The 'xmlns' prefix must not be declared "+
550
651
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
551
- raise REXML::ParseException.new( msg, @source, self )
652
+ raise REXML::ParseException.new( msg, @source, self)
552
653
  end
553
- elsif local_part == "xmlns"
554
- msg = "The 'xmlns' prefix must not be declared "+
555
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
556
- raise REXML::ParseException.new( msg, @source, self)
654
+ curr_ns << local_part
655
+ elsif prefix
656
+ prefixes << prefix unless prefix == "xml"
557
657
  end
558
- curr_ns << local_part
559
- elsif prefix
560
- prefixes << prefix unless prefix == "xml"
561
- end
562
658
 
563
- if attributes.has_key?(name)
564
- msg = "Duplicate attribute #{name.inspect}"
565
- raise REXML::ParseException.new(msg, @source, self)
566
- end
659
+ if attributes[name]
660
+ msg = "Duplicate attribute #{name.inspect}"
661
+ raise REXML::ParseException.new(msg, @source, self)
662
+ end
567
663
 
568
- attributes[name] = value
664
+ attributes[name] = value
665
+ else
666
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
667
+ raise REXML::ParseException.new(message, @source)
668
+ end
569
669
  end
570
- return attributes, closed
571
670
  end
572
671
  end
573
672
  end