rexml 3.2.3 → 3.3.8

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS.md +502 -0
  3. data/README.md +11 -14
  4. data/doc/rexml/context.rdoc +143 -0
  5. data/doc/rexml/tasks/rdoc/child.rdoc +87 -0
  6. data/doc/rexml/tasks/rdoc/document.rdoc +276 -0
  7. data/doc/rexml/tasks/rdoc/element.rdoc +602 -0
  8. data/doc/rexml/tasks/rdoc/node.rdoc +97 -0
  9. data/doc/rexml/tasks/rdoc/parent.rdoc +267 -0
  10. data/doc/rexml/tasks/tocs/child_toc.rdoc +12 -0
  11. data/doc/rexml/tasks/tocs/document_toc.rdoc +30 -0
  12. data/doc/rexml/tasks/tocs/element_toc.rdoc +55 -0
  13. data/doc/rexml/tasks/tocs/master_toc.rdoc +135 -0
  14. data/doc/rexml/tasks/tocs/node_toc.rdoc +16 -0
  15. data/doc/rexml/tasks/tocs/parent_toc.rdoc +25 -0
  16. data/doc/rexml/tutorial.rdoc +1358 -0
  17. data/lib/rexml/attribute.rb +17 -11
  18. data/lib/rexml/doctype.rb +55 -31
  19. data/lib/rexml/document.rb +199 -35
  20. data/lib/rexml/element.rb +1802 -487
  21. data/lib/rexml/entity.rb +10 -39
  22. data/lib/rexml/formatters/pretty.rb +3 -3
  23. data/lib/rexml/functions.rb +1 -2
  24. data/lib/rexml/light/node.rb +0 -8
  25. data/lib/rexml/namespace.rb +8 -4
  26. data/lib/rexml/node.rb +8 -4
  27. data/lib/rexml/parseexception.rb +1 -0
  28. data/lib/rexml/parsers/baseparser.rb +513 -250
  29. data/lib/rexml/parsers/pullparser.rb +12 -0
  30. data/lib/rexml/parsers/sax2parser.rb +16 -19
  31. data/lib/rexml/parsers/streamparser.rb +16 -10
  32. data/lib/rexml/parsers/treeparser.rb +9 -21
  33. data/lib/rexml/parsers/xpathparser.rb +161 -97
  34. data/lib/rexml/rexml.rb +29 -22
  35. data/lib/rexml/source.rb +128 -98
  36. data/lib/rexml/text.rb +46 -22
  37. data/lib/rexml/xpath_parser.rb +43 -33
  38. data/lib/rexml.rb +3 -0
  39. metadata +42 -46
  40. data/.gitignore +0 -9
  41. data/.travis.yml +0 -24
  42. data/Gemfile +0 -6
  43. data/Rakefile +0 -8
  44. data/rexml.gemspec +0 -84
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -50,7 +78,6 @@ module REXML
50
78
 
51
79
  DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
52
80
  DOCTYPE_END = /\A\s*\]\s*>/um
53
- DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
54
81
  ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
55
82
  COMMENT_START = /\A<!--/u
56
83
  COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -61,15 +88,14 @@ module REXML
61
88
  XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
62
89
  INSTRUCTION_START = /\A<\?/u
63
90
  INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
64
- TAG_MATCH = /^<((?>#{QNAME_STR}))/um
65
- CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um
91
+ TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
92
+ CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
66
93
 
67
94
  VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
68
95
  ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
69
96
  STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
70
97
 
71
98
  ENTITY_START = /\A\s*<!ENTITY/
72
- IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
73
99
  ELEMENTDECL_START = /\A\s*<!ELEMENT/um
74
100
  ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
75
101
  SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
@@ -83,9 +109,6 @@ module REXML
83
109
  ATTDEF_RE = /#{ATTDEF}/
84
110
  ATTLISTDECL_START = /\A\s*<!ATTLIST/um
85
111
  ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
86
- NOTATIONDECL_START = /\A\s*<!NOTATION/um
87
- PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
88
- SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
89
112
 
90
113
  TEXT_PATTERN = /\A([^<]*)/um
91
114
 
@@ -101,7 +124,12 @@ module REXML
101
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
102
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
103
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
104
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
128
+
129
+ NOTATIONDECL_START = /\A\s*<!NOTATION/um
130
+ EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
131
+ EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
132
+ PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
105
133
 
106
134
  EREFERENCE = /&(?!#{NAME};)/
107
135
 
@@ -112,9 +140,33 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
118
170
  end
119
171
 
120
172
  def add_listener( listener )
@@ -122,15 +174,20 @@ module REXML
122
174
  end
123
175
 
124
176
  attr_reader :source
177
+ attr_reader :entity_expansion_count
178
+ attr_writer :entity_expansion_limit
179
+ attr_writer :entity_expansion_text_limit
125
180
 
126
181
  def stream=( source )
127
182
  @source = SourceFactory.create_from( source )
128
183
  @closed = nil
184
+ @have_root = false
129
185
  @document_status = nil
130
186
  @tags = []
131
187
  @stack = []
132
188
  @entities = []
133
- @nsstack = []
189
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
190
+ @namespaces_restore_stack = []
134
191
  end
135
192
 
136
193
  def position
@@ -180,6 +237,8 @@ module REXML
180
237
 
181
238
  # Returns the next event. This is a +PullEvent+ object.
182
239
  def pull
240
+ @source.drop_parsed_content
241
+
183
242
  pull_event.tap do |event|
184
243
  @listeners.each do |listener|
185
244
  listener.receive event
@@ -192,215 +251,274 @@ module REXML
192
251
  x, @closed = @closed, nil
193
252
  return [ :end_element, x ]
194
253
  end
195
- return [ :end_document ] if empty?
254
+ if empty?
255
+ if @document_status == :in_doctype
256
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
257
+ end
258
+ unless @tags.empty?
259
+ path = "/" + @tags.join("/")
260
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
261
+ end
262
+ return [ :end_document ]
263
+ end
196
264
  return @stack.shift if @stack.size > 0
197
265
  #STDERR.puts @source.encoding
198
- @source.read if @source.buffer.size<2
199
266
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
267
+
268
+ @source.ensure_buffer
200
269
  if @document_status == nil
201
- #@source.consume( /^\s*/um )
202
- word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
203
- word = word[1] unless word.nil?
204
- #STDERR.puts "WORD = #{word.inspect}"
205
- case word
206
- when COMMENT_START
207
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
208
- when XMLDECL_START
209
- #STDERR.puts "XMLDECL"
210
- results = @source.match( XMLDECL_PATTERN, true )[1]
211
- version = VERSION.match( results )
212
- version = version[1] unless version.nil?
213
- encoding = ENCODING.match(results)
214
- encoding = encoding[1] unless encoding.nil?
215
- if need_source_encoding_update?(encoding)
216
- @source.encoding = encoding
217
- end
218
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
219
- encoding = "UTF-16"
220
- end
221
- standalone = STANDALONE.match(results)
222
- standalone = standalone[1] unless standalone.nil?
223
- return [ :xmldecl, version, encoding, standalone ]
224
- when INSTRUCTION_START
270
+ start_position = @source.position
271
+ if @source.match("<?", true)
225
272
  return process_instruction
226
- when DOCTYPE_START
227
- md = @source.match( DOCTYPE_PATTERN, true )
228
- @nsstack.unshift(curr_ns=Set.new)
229
- identity = md[1]
230
- close = md[2]
231
- identity =~ IDENTITY
232
- name = $1
233
- raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
234
- pub_sys = $2.nil? ? nil : $2.strip
235
- long_name = $4.nil? ? nil : $4.strip
236
- uri = $6.nil? ? nil : $6.strip
237
- args = [ :start_doctype, name, pub_sys, long_name, uri ]
238
- if close == ">"
239
- @document_status = :after_doctype
240
- @source.read if @source.buffer.size<2
241
- md = @source.match(/^\s*/um, true)
242
- @stack << [ :end_doctype ]
273
+ elsif @source.match("<!", true)
274
+ if @source.match("--", true)
275
+ md = @source.match(/(.*?)-->/um, true)
276
+ if md.nil?
277
+ raise REXML::ParseException.new("Unclosed comment", @source)
278
+ end
279
+ if /--|-\z/.match?(md[1])
280
+ raise REXML::ParseException.new("Malformed comment", @source)
281
+ end
282
+ return [ :comment, md[1] ]
283
+ elsif @source.match("DOCTYPE", true)
284
+ base_error_message = "Malformed DOCTYPE"
285
+ unless @source.match(/\s+/um, true)
286
+ if @source.match(">")
287
+ message = "#{base_error_message}: name is missing"
288
+ else
289
+ message = "#{base_error_message}: invalid name"
290
+ end
291
+ @source.position = start_position
292
+ raise REXML::ParseException.new(message, @source)
293
+ end
294
+ name = parse_name(base_error_message)
295
+ if @source.match(/\s*\[/um, true)
296
+ id = [nil, nil, nil]
297
+ @document_status = :in_doctype
298
+ elsif @source.match(/\s*>/um, true)
299
+ id = [nil, nil, nil]
300
+ @document_status = :after_doctype
301
+ @source.ensure_buffer
302
+ else
303
+ id = parse_id(base_error_message,
304
+ accept_external_id: true,
305
+ accept_public_id: false)
306
+ if id[0] == "SYSTEM"
307
+ # For backward compatibility
308
+ id[1], id[2] = id[2], nil
309
+ end
310
+ if @source.match(/\s*\[/um, true)
311
+ @document_status = :in_doctype
312
+ elsif @source.match(/\s*>/um, true)
313
+ @document_status = :after_doctype
314
+ @source.ensure_buffer
315
+ else
316
+ message = "#{base_error_message}: garbage after external ID"
317
+ raise REXML::ParseException.new(message, @source)
318
+ end
319
+ end
320
+ args = [:start_doctype, name, *id]
321
+ if @document_status == :after_doctype
322
+ @source.match(/\s*/um, true)
323
+ @stack << [ :end_doctype ]
324
+ end
325
+ return args
243
326
  else
244
- @document_status = :in_doctype
245
- end
246
- return args
247
- when /^\s+/
248
- else
249
- @document_status = :after_doctype
250
- @source.read if @source.buffer.size<2
251
- md = @source.match(/\s*/um, true)
252
- if @source.encoding == "UTF-8"
253
- @source.buffer.force_encoding(::Encoding::UTF_8)
327
+ message = "Invalid XML"
328
+ raise REXML::ParseException.new(message, @source)
254
329
  end
255
330
  end
256
331
  end
257
332
  if @document_status == :in_doctype
258
- md = @source.match(/\s*(.*?>)/um)
259
- case md[1]
260
- when SYSTEMENTITY
261
- match = @source.match( SYSTEMENTITY, true )[1]
262
- return [ :externalentity, match ]
263
-
264
- when ELEMENTDECL_START
265
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
266
-
267
- when ENTITY_START
268
- match = @source.match( ENTITYDECL, true ).to_a.compact
269
- match[0] = :entitydecl
270
- ref = false
271
- if match[1] == '%'
272
- ref = true
273
- match.delete_at 1
274
- end
275
- # Now we have to sort out what kind of entity reference this is
276
- if match[2] == 'SYSTEM'
277
- # External reference
278
- match[3] = match[3][1..-2] # PUBID
279
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
280
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
281
- elsif match[2] == 'PUBLIC'
282
- # External reference
283
- match[3] = match[3][1..-2] # PUBID
284
- match[4] = match[4][1..-2] # HREF
285
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
286
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
287
- else
288
- match[2] = match[2][1..-2]
289
- match.pop if match.size == 4
290
- # match is [ :entity, name, value ]
291
- end
292
- match << '%' if ref
293
- return match
294
- when ATTLISTDECL_START
295
- md = @source.match( ATTLISTDECL_PATTERN, true )
296
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
297
- element = md[1]
298
- contents = md[0]
299
-
300
- pairs = {}
301
- values = md[0].scan( ATTDEF_RE )
302
- values.each do |attdef|
303
- unless attdef[3] == "#IMPLIED"
304
- attdef.compact!
305
- val = attdef[3]
306
- val = attdef[4] if val == "#FIXED "
307
- pairs[attdef[0]] = val
308
- if attdef[0] =~ /^xmlns:(.*)/
309
- @nsstack[0] << $1
333
+ @source.match(/\s*/um, true) # skip spaces
334
+ start_position = @source.position
335
+ if @source.match("<!", true)
336
+ if @source.match("ELEMENT", true)
337
+ md = @source.match(/(.*?)>/um, true)
338
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
339
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
340
+ elsif @source.match("ENTITY", true)
341
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
342
+ unless match_data
343
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
344
+ end
345
+ match = [:entitydecl, *match_data.captures.compact]
346
+ ref = false
347
+ if match[1] == '%'
348
+ ref = true
349
+ match.delete_at 1
350
+ end
351
+ # Now we have to sort out what kind of entity reference this is
352
+ if match[2] == 'SYSTEM'
353
+ # External reference
354
+ match[3] = match[3][1..-2] # PUBID
355
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
356
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
357
+ elsif match[2] == 'PUBLIC'
358
+ # External reference
359
+ match[3] = match[3][1..-2] # PUBID
360
+ match[4] = match[4][1..-2] # HREF
361
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
362
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
363
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
364
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
365
+ else
366
+ match[2] = match[2][1..-2]
367
+ match.pop if match.size == 4
368
+ # match is [ :entity, name, value ]
369
+ end
370
+ match << '%' if ref
371
+ return match
372
+ elsif @source.match("ATTLIST", true)
373
+ md = @source.match(Private::ATTLISTDECL_END, true)
374
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
375
+ element = md[1]
376
+ contents = md[0]
377
+
378
+ pairs = {}
379
+ values = md[0].strip.scan( ATTDEF_RE )
380
+ values.each do |attdef|
381
+ unless attdef[3] == "#IMPLIED"
382
+ attdef.compact!
383
+ val = attdef[3]
384
+ val = attdef[4] if val == "#FIXED "
385
+ pairs[attdef[0]] = val
386
+ if attdef[0] =~ /^xmlns:(.*)/
387
+ @namespaces[$1] = val
388
+ end
310
389
  end
311
390
  end
391
+ return [ :attlistdecl, element, pairs, contents ]
392
+ elsif @source.match("NOTATION", true)
393
+ base_error_message = "Malformed notation declaration"
394
+ unless @source.match(/\s+/um, true)
395
+ if @source.match(">")
396
+ message = "#{base_error_message}: name is missing"
397
+ else
398
+ message = "#{base_error_message}: invalid name"
399
+ end
400
+ @source.position = start_position
401
+ raise REXML::ParseException.new(message, @source)
402
+ end
403
+ name = parse_name(base_error_message)
404
+ id = parse_id(base_error_message,
405
+ accept_external_id: true,
406
+ accept_public_id: true)
407
+ unless @source.match(/\s*>/um, true)
408
+ message = "#{base_error_message}: garbage before end >"
409
+ raise REXML::ParseException.new(message, @source)
410
+ end
411
+ return [:notationdecl, name, *id]
412
+ elsif md = @source.match(/--(.*?)-->/um, true)
413
+ case md[1]
414
+ when /--/, /-\z/
415
+ raise REXML::ParseException.new("Malformed comment", @source)
416
+ end
417
+ return [ :comment, md[1] ] if md
312
418
  end
313
- return [ :attlistdecl, element, pairs, contents ]
314
- when NOTATIONDECL_START
315
- md = nil
316
- if @source.match( PUBLIC )
317
- md = @source.match( PUBLIC, true )
318
- vals = [md[1],md[2],md[4],md[6]]
319
- elsif @source.match( SYSTEM )
320
- md = @source.match( SYSTEM, true )
321
- vals = [md[1],md[2],nil,md[4]]
322
- else
323
- raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
324
- end
325
- return [ :notationdecl, *vals ]
326
- when DOCTYPE_END
419
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
420
+ return [ :externalentity, match[1] ]
421
+ elsif @source.match(/\]\s*>/um, true)
327
422
  @document_status = :after_doctype
328
- @source.match( DOCTYPE_END, true )
329
423
  return [ :end_doctype ]
330
424
  end
425
+ if @document_status == :in_doctype
426
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
427
+ end
428
+ end
429
+ if @document_status == :after_doctype
430
+ @source.match(/\s*/um, true)
331
431
  end
332
432
  begin
333
- if @source.buffer[0] == ?<
334
- if @source.buffer[1] == ?/
335
- @nsstack.shift
433
+ start_position = @source.position
434
+ if @source.match("<", true)
435
+ # :text's read_until may remain only "<" in buffer. In the
436
+ # case, buffer is empty here. So we need to fill buffer
437
+ # here explicitly.
438
+ @source.ensure_buffer
439
+ if @source.match("/", true)
440
+ @namespaces_restore_stack.pop
336
441
  last_tag = @tags.pop
337
- md = @source.match( CLOSE_MATCH, true )
442
+ md = @source.match(Private::CLOSE_PATTERN, true)
338
443
  if md and !last_tag
339
444
  message = "Unexpected top-level end tag (got '#{md[1]}')"
340
445
  raise REXML::ParseException.new(message, @source)
341
446
  end
342
447
  if md.nil? or last_tag != md[1]
343
448
  message = "Missing end tag for '#{last_tag}'"
344
- message << " (got '#{md[1]}')" if md
449
+ message += " (got '#{md[1]}')" if md
450
+ @source.position = start_position if md.nil?
345
451
  raise REXML::ParseException.new(message, @source)
346
452
  end
347
453
  return [ :end_element, last_tag ]
348
- elsif @source.buffer[1] == ?!
349
- md = @source.match(/\A(\s*[^>]*>)/um)
454
+ elsif @source.match("!", true)
455
+ md = @source.match(/([^>]*>)/um)
350
456
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
351
457
  raise REXML::ParseException.new("Malformed node", @source) unless md
352
- if md[0][2] == ?-
353
- md = @source.match( COMMENT_PATTERN, true )
458
+ if md[0][0] == ?-
459
+ md = @source.match(/--(.*?)-->/um, true)
354
460
 
355
- case md[1]
356
- when /--/, /-\z/
461
+ if md.nil? || /--|-\z/.match?(md[1])
357
462
  raise REXML::ParseException.new("Malformed comment", @source)
358
463
  end
359
464
 
360
- return [ :comment, md[1] ] if md
465
+ return [ :comment, md[1] ]
361
466
  else
362
- md = @source.match( CDATA_PATTERN, true )
467
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
363
468
  return [ :cdata, md[1] ] if md
364
469
  end
365
470
  raise REXML::ParseException.new( "Declarations can only occur "+
366
471
  "in the doctype declaration.", @source)
367
- elsif @source.buffer[1] == ??
472
+ elsif @source.match("?", true)
368
473
  return process_instruction
369
474
  else
370
475
  # Get the next tag
371
- md = @source.match(TAG_MATCH, true)
476
+ md = @source.match(Private::TAG_PATTERN, true)
372
477
  unless md
478
+ @source.position = start_position
373
479
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
374
480
  end
375
- prefixes = Set.new
376
- prefixes << md[2] if md[2]
377
- @nsstack.unshift(curr_ns=Set.new)
378
- attributes, closed = parse_attributes(prefixes, curr_ns)
481
+ tag = md[1]
482
+ @document_status = :in_element
483
+ @prefixes.clear
484
+ @prefixes << md[2] if md[2]
485
+ push_namespaces_restore
486
+ attributes, closed = parse_attributes(@prefixes)
379
487
  # Verify that all of the prefixes have been defined
380
- for prefix in prefixes
381
- unless @nsstack.find{|k| k.member?(prefix)}
488
+ for prefix in @prefixes
489
+ unless @namespaces.key?(prefix)
382
490
  raise UndefinedNamespaceException.new(prefix,@source,self)
383
491
  end
384
492
  end
385
493
 
386
494
  if closed
387
- @closed = md[1]
388
- @nsstack.shift
495
+ @closed = tag
496
+ pop_namespaces_restore
389
497
  else
390
- @tags.push( md[1] )
498
+ if @tags.empty? and @have_root
499
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
500
+ end
501
+ @tags.push( tag )
391
502
  end
392
- return [ :start_element, md[1], attributes ]
503
+ @have_root = true
504
+ return [ :start_element, tag, attributes ]
393
505
  end
394
506
  else
395
- md = @source.match( TEXT_PATTERN, true )
396
- if md[0].length == 0
397
- @source.match( /(\s+)/, true )
507
+ text = @source.read_until("<")
508
+ if text.chomp!("<")
509
+ @source.position -= "<".bytesize
398
510
  end
399
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
400
- #return [ :text, "" ] if md[0].length == 0
401
- # unnormalized = Text::unnormalize( md[1], self )
402
- # return PullEvent.new( :text, md[1], unnormalized )
403
- return [ :text, md[1] ]
511
+ if @tags.empty?
512
+ unless /\A\s*\z/.match?(text)
513
+ if @have_root
514
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
515
+ else
516
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
517
+ end
518
+ end
519
+ return pull_event if @have_root
520
+ end
521
+ return [ :text, text ]
404
522
  end
405
523
  rescue REXML::UndefinedNamespaceException
406
524
  raise
@@ -415,13 +533,13 @@ module REXML
415
533
  private :pull_event
416
534
 
417
535
  def entity( reference, entities )
418
- value = nil
419
- value = entities[ reference ] if entities
420
- if not value
421
- value = DEFAULT_ENTITIES[ reference ]
422
- value = value[2] if value
423
- end
424
- unnormalize( value, entities ) if value
536
+ return unless entities
537
+
538
+ value = entities[ reference ]
539
+ return if value.nil?
540
+
541
+ record_entity_expansion
542
+ unnormalize( value, entities )
425
543
  end
426
544
 
427
545
  # Escapes all possible entities
@@ -442,132 +560,277 @@ module REXML
442
560
 
443
561
  # Unescapes all possible entities
444
562
  def unnormalize( string, entities=nil, filter=nil )
445
- rv = string.clone
446
- rv.gsub!( /\r\n?/, "\n" )
563
+ if string.include?("\r")
564
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
565
+ else
566
+ rv = string.dup
567
+ end
447
568
  matches = rv.scan( REFERENCE_RE )
448
569
  return rv if matches.size == 0
449
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
570
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
450
571
  m=$1
451
572
  m = "0#{m}" if m[0] == ?x
452
573
  [Integer(m)].pack('U*')
453
574
  }
454
575
  matches.collect!{|x|x[0]}.compact!
576
+ if filter
577
+ matches.reject! do |entity_reference|
578
+ filter.include?(entity_reference)
579
+ end
580
+ end
455
581
  if matches.size > 0
456
- matches.each do |entity_reference|
457
- unless filter and filter.include?(entity_reference)
458
- entity_value = entity( entity_reference, entities )
459
- if entity_value
460
- re = /&#{entity_reference};/
461
- rv.gsub!( re, entity_value )
462
- else
463
- er = DEFAULT_ENTITIES[entity_reference]
464
- rv.gsub!( er[0], er[2] ) if er
582
+ matches.tally.each do |entity_reference, n|
583
+ entity_expansion_count_before = @entity_expansion_count
584
+ entity_value = entity( entity_reference, entities )
585
+ if entity_value
586
+ if n > 1
587
+ entity_expansion_count_delta =
588
+ @entity_expansion_count - entity_expansion_count_before
589
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
465
590
  end
591
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
592
+ rv.gsub!( re, entity_value )
593
+ if rv.bytesize > @entity_expansion_text_limit
594
+ raise "entity expansion has grown too large"
595
+ end
596
+ else
597
+ er = DEFAULT_ENTITIES[entity_reference]
598
+ rv.gsub!( er[0], er[2] ) if er
466
599
  end
467
600
  end
468
- rv.gsub!( /&amp;/, '&' )
601
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
469
602
  end
470
603
  rv
471
604
  end
472
605
 
473
606
  private
607
+ def add_namespace(prefix, uri)
608
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
609
+ if uri.nil?
610
+ @namespaces.delete(prefix)
611
+ else
612
+ @namespaces[prefix] = uri
613
+ end
614
+ end
615
+
616
+ def push_namespaces_restore
617
+ namespaces_restore = {}
618
+ @namespaces_restore_stack.push(namespaces_restore)
619
+ namespaces_restore
620
+ end
621
+
622
+ def pop_namespaces_restore
623
+ namespaces_restore = @namespaces_restore_stack.pop
624
+ namespaces_restore.each do |prefix, uri|
625
+ if uri.nil?
626
+ @namespaces.delete(prefix)
627
+ else
628
+ @namespaces[prefix] = uri
629
+ end
630
+ end
631
+ end
632
+
633
+ def record_entity_expansion(delta=1)
634
+ @entity_expansion_count += delta
635
+ if @entity_expansion_count > @entity_expansion_limit
636
+ raise "number of entity expansions exceeded, processing aborted."
637
+ end
638
+ end
639
+
474
640
  def need_source_encoding_update?(xml_declaration_encoding)
475
641
  return false if xml_declaration_encoding.nil?
476
642
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
477
643
  true
478
644
  end
479
645
 
480
- def process_instruction
481
- match_data = @source.match(INSTRUCTION_PATTERN, true)
482
- unless match_data
483
- message = "Invalid processing instruction node"
646
+ def parse_name(base_error_message)
647
+ md = @source.match(Private::NAME_PATTERN, true)
648
+ unless md
649
+ if @source.match(/\S/um)
650
+ message = "#{base_error_message}: invalid name"
651
+ else
652
+ message = "#{base_error_message}: name is missing"
653
+ end
484
654
  raise REXML::ParseException.new(message, @source)
485
655
  end
486
- [:processing_instruction, match_data[1], match_data[2]]
656
+ md[0]
487
657
  end
488
658
 
489
- def parse_attributes(prefixes, curr_ns)
490
- attributes = {}
491
- closed = false
492
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
493
- if match_data.nil?
494
- message = "Start tag isn't ended"
659
+ def parse_id(base_error_message,
660
+ accept_external_id:,
661
+ accept_public_id:)
662
+ if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
663
+ pubid = system = nil
664
+ pubid_literal = md[1]
665
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
666
+ system_literal = md[2]
667
+ system = system_literal[1..-2] if system_literal # Remove quote
668
+ ["PUBLIC", pubid, system]
669
+ elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
670
+ pubid = system = nil
671
+ pubid_literal = md[1]
672
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
673
+ ["PUBLIC", pubid, nil]
674
+ elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
675
+ system = nil
676
+ system_literal = md[1]
677
+ system = system_literal[1..-2] if system_literal # Remove quote
678
+ ["SYSTEM", nil, system]
679
+ else
680
+ details = parse_id_invalid_details(accept_external_id: accept_external_id,
681
+ accept_public_id: accept_public_id)
682
+ message = "#{base_error_message}: #{details}"
495
683
  raise REXML::ParseException.new(message, @source)
496
684
  end
685
+ end
497
686
 
498
- raw_attributes = match_data[1]
499
- closed = !match_data[2].nil?
500
- return attributes, closed if raw_attributes.nil?
501
- return attributes, closed if raw_attributes.empty?
687
+ def parse_id_invalid_details(accept_external_id:,
688
+ accept_public_id:)
689
+ public = /\A\s*PUBLIC/um
690
+ system = /\A\s*SYSTEM/um
691
+ if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
692
+ if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
693
+ return "public ID literal is missing"
694
+ end
695
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
696
+ return "invalid public ID literal"
697
+ end
698
+ if accept_public_id
699
+ if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
700
+ return "system ID literal is missing"
701
+ end
702
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
703
+ return "invalid system literal"
704
+ end
705
+ "garbage after system literal"
706
+ else
707
+ "garbage after public ID literal"
708
+ end
709
+ elsif accept_external_id and @source.match(/#{system}/um)
710
+ if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
711
+ return "system literal is missing"
712
+ end
713
+ unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
714
+ return "invalid system literal"
715
+ end
716
+ "garbage after system literal"
717
+ else
718
+ unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
719
+ return "invalid ID type"
720
+ end
721
+ "ID type is missing"
722
+ end
723
+ end
502
724
 
503
- scanner = StringScanner.new(raw_attributes)
504
- until scanner.eos?
505
- if scanner.scan(/\s+/)
506
- break if scanner.eos?
725
+ def process_instruction
726
+ name = parse_name("Malformed XML: Invalid processing instruction node")
727
+ if @source.match(/\s+/um, true)
728
+ match_data = @source.match(/(.*?)\?>/um, true)
729
+ unless match_data
730
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
507
731
  end
732
+ content = match_data[1]
733
+ else
734
+ content = nil
735
+ unless @source.match("?>", true)
736
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
737
+ end
738
+ end
739
+ if name == "xml"
740
+ if @document_status
741
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
742
+ end
743
+ version = VERSION.match(content)
744
+ version = version[1] unless version.nil?
745
+ encoding = ENCODING.match(content)
746
+ encoding = encoding[1] unless encoding.nil?
747
+ if need_source_encoding_update?(encoding)
748
+ @source.encoding = encoding
749
+ end
750
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
751
+ encoding = "UTF-16"
752
+ end
753
+ standalone = STANDALONE.match(content)
754
+ standalone = standalone[1] unless standalone.nil?
755
+ return [ :xmldecl, version, encoding, standalone ]
756
+ end
757
+ [:processing_instruction, name, content]
758
+ end
508
759
 
509
- pos = scanner.pos
510
- loop do
511
- break if scanner.scan(ATTRIBUTE_PATTERN)
512
- unless scanner.scan(QNAME)
513
- message = "Invalid attribute name: <#{scanner.rest}>"
514
- raise REXML::ParseException.new(message, @source)
515
- end
516
- name = scanner[0]
517
- unless scanner.scan(/\s*=\s*/um)
760
+ def parse_attributes(prefixes)
761
+ attributes = {}
762
+ expanded_names = {}
763
+ closed = false
764
+ while true
765
+ if @source.match(">", true)
766
+ return attributes, closed
767
+ elsif @source.match("/>", true)
768
+ closed = true
769
+ return attributes, closed
770
+ elsif match = @source.match(QNAME, true)
771
+ name = match[1]
772
+ prefix = match[2]
773
+ local_part = match[3]
774
+
775
+ unless @source.match(/\s*=\s*/um, true)
518
776
  message = "Missing attribute equal: <#{name}>"
519
777
  raise REXML::ParseException.new(message, @source)
520
778
  end
521
- quote = scanner.scan(/['"]/)
522
- unless quote
779
+ unless match = @source.match(/(['"])/, true)
523
780
  message = "Missing attribute value start quote: <#{name}>"
524
781
  raise REXML::ParseException.new(message, @source)
525
782
  end
526
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
527
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
528
- if match_data
529
- scanner << "/" if closed
530
- scanner << ">"
531
- scanner << match_data[1]
532
- scanner.pos = pos
533
- closed = !match_data[2].nil?
534
- next
535
- end
536
- message =
537
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
783
+ quote = match[1]
784
+ start_position = @source.position
785
+ value = @source.read_until(quote)
786
+ unless value.chomp!(quote)
787
+ @source.position = start_position
788
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
538
789
  raise REXML::ParseException.new(message, @source)
539
790
  end
540
- end
541
- name = scanner[1]
542
- prefix = scanner[2]
543
- local_part = scanner[3]
544
- # quote = scanner[4]
545
- value = scanner[5]
546
- if prefix == "xmlns"
547
- if local_part == "xml"
548
- if value != "http://www.w3.org/XML/1998/namespace"
549
- msg = "The 'xml' prefix must not be bound to any other namespace "+
791
+ @source.match(/\s*/um, true)
792
+ if prefix == "xmlns"
793
+ if local_part == "xml"
794
+ if value != Private::XML_PREFIXED_NAMESPACE
795
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
796
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
797
+ raise REXML::ParseException.new( msg, @source, self )
798
+ end
799
+ elsif local_part == "xmlns"
800
+ msg = "The 'xmlns' prefix must not be declared "+
550
801
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
551
- raise REXML::ParseException.new( msg, @source, self )
802
+ raise REXML::ParseException.new( msg, @source, self)
552
803
  end
553
- elsif local_part == "xmlns"
554
- msg = "The 'xmlns' prefix must not be declared "+
555
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
556
- raise REXML::ParseException.new( msg, @source, self)
804
+ add_namespace(local_part, value)
805
+ elsif prefix
806
+ prefixes << prefix unless prefix == "xml"
557
807
  end
558
- curr_ns << local_part
559
- elsif prefix
560
- prefixes << prefix unless prefix == "xml"
561
- end
562
808
 
563
- if attributes.has_key?(name)
564
- msg = "Duplicate attribute #{name.inspect}"
565
- raise REXML::ParseException.new(msg, @source, self)
566
- end
809
+ if attributes[name]
810
+ msg = "Duplicate attribute #{name.inspect}"
811
+ raise REXML::ParseException.new(msg, @source, self)
812
+ end
567
813
 
568
- attributes[name] = value
814
+ unless prefix == "xmlns"
815
+ uri = @namespaces[prefix]
816
+ expanded_name = [uri, local_part]
817
+ existing_prefix = expanded_names[expanded_name]
818
+ if existing_prefix
819
+ message = "Namespace conflict in adding attribute " +
820
+ "\"#{local_part}\": " +
821
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
822
+ "prefix \"#{prefix}\" = \"#{uri}\""
823
+ raise REXML::ParseException.new(message, @source, self)
824
+ end
825
+ expanded_names[expanded_name] = prefix
826
+ end
827
+
828
+ attributes[name] = value
829
+ else
830
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
831
+ raise REXML::ParseException.new(message, @source)
832
+ end
569
833
  end
570
- return attributes, closed
571
834
  end
572
835
  end
573
836
  end