rexml 3.2.4 → 3.4.1

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/NEWS.md +558 -0
  3. data/README.md +11 -14
  4. data/doc/rexml/context.rdoc +143 -0
  5. data/doc/rexml/tasks/rdoc/child.rdoc +87 -0
  6. data/doc/rexml/tasks/rdoc/document.rdoc +276 -0
  7. data/doc/rexml/tasks/rdoc/element.rdoc +602 -0
  8. data/doc/rexml/tasks/rdoc/node.rdoc +97 -0
  9. data/doc/rexml/tasks/rdoc/parent.rdoc +267 -0
  10. data/doc/rexml/tasks/tocs/child_toc.rdoc +12 -0
  11. data/doc/rexml/tasks/tocs/document_toc.rdoc +30 -0
  12. data/doc/rexml/tasks/tocs/element_toc.rdoc +55 -0
  13. data/doc/rexml/tasks/tocs/master_toc.rdoc +135 -0
  14. data/doc/rexml/tasks/tocs/node_toc.rdoc +16 -0
  15. data/doc/rexml/tasks/tocs/parent_toc.rdoc +25 -0
  16. data/doc/rexml/tutorial.rdoc +1358 -0
  17. data/lib/rexml/attribute.rb +17 -11
  18. data/lib/rexml/doctype.rb +55 -31
  19. data/lib/rexml/document.rb +199 -35
  20. data/lib/rexml/element.rb +1802 -487
  21. data/lib/rexml/entity.rb +9 -38
  22. data/lib/rexml/formatters/pretty.rb +3 -3
  23. data/lib/rexml/functions.rb +1 -2
  24. data/lib/rexml/light/node.rb +0 -8
  25. data/lib/rexml/namespace.rb +8 -4
  26. data/lib/rexml/node.rb +8 -4
  27. data/lib/rexml/parseexception.rb +1 -0
  28. data/lib/rexml/parsers/baseparser.rb +545 -252
  29. data/lib/rexml/parsers/pullparser.rb +16 -0
  30. data/lib/rexml/parsers/sax2parser.rb +16 -19
  31. data/lib/rexml/parsers/streamparser.rb +16 -10
  32. data/lib/rexml/parsers/treeparser.rb +9 -21
  33. data/lib/rexml/parsers/xpathparser.rb +161 -97
  34. data/lib/rexml/rexml.rb +29 -22
  35. data/lib/rexml/source.rb +185 -100
  36. data/lib/rexml/text.rb +60 -61
  37. data/lib/rexml/xpath_parser.rb +43 -33
  38. data/lib/rexml.rb +3 -0
  39. metadata +42 -46
  40. data/.gitignore +0 -9
  41. data/.travis.yml +0 -24
  42. data/Gemfile +0 -6
  43. data/Rakefile +0 -8
  44. data/rexml.gemspec +0 -84
@@ -1,12 +1,40 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
+ require_relative '../security'
4
5
  require_relative '../source'
5
6
  require 'set'
6
7
  require "strscan"
7
8
 
8
9
  module REXML
9
10
  module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
10
38
  # = Using the Pull Parser
11
39
  # <em>This API is experimental, and subject to change.</em>
12
40
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -50,7 +78,6 @@ module REXML
50
78
 
51
79
  DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
52
80
  DOCTYPE_END = /\A\s*\]\s*>/um
53
- DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
54
81
  ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
55
82
  COMMENT_START = /\A<!--/u
56
83
  COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -61,15 +88,14 @@ module REXML
61
88
  XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
62
89
  INSTRUCTION_START = /\A<\?/u
63
90
  INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
64
- TAG_MATCH = /^<((?>#{QNAME_STR}))/um
65
- CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um
91
+ TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
92
+ CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
66
93
 
67
94
  VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
68
95
  ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
69
96
  STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
70
97
 
71
98
  ENTITY_START = /\A\s*<!ENTITY/
72
- IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
73
99
  ELEMENTDECL_START = /\A\s*<!ELEMENT/um
74
100
  ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
75
101
  SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
@@ -83,9 +109,6 @@ module REXML
83
109
  ATTDEF_RE = /#{ATTDEF}/
84
110
  ATTLISTDECL_START = /\A\s*<!ATTLIST/um
85
111
  ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
86
- NOTATIONDECL_START = /\A\s*<!NOTATION/um
87
- PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
88
- SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
89
112
 
90
113
  TEXT_PATTERN = /\A([^<]*)/um
91
114
 
@@ -101,7 +124,12 @@ module REXML
101
124
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
102
125
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
103
126
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
104
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
128
+
129
+ NOTATIONDECL_START = /\A\s*<!NOTATION/um
130
+ EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
131
+ EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
132
+ PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
105
133
 
106
134
  EREFERENCE = /&(?!#{NAME};)/
107
135
 
@@ -112,9 +140,34 @@ module REXML
112
140
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
141
  }
114
142
 
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148
+ NAME_PATTERN = /#{NAME}/um
149
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
150
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
151
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
152
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
153
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
154
+ DEFAULT_ENTITIES_PATTERNS = {}
155
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
156
+ default_entities.each do |term|
157
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
158
+ end
159
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
160
+ end
161
+ private_constant :Private
162
+
115
163
  def initialize( source )
116
164
  self.stream = source
117
165
  @listeners = []
166
+ @prefixes = Set.new
167
+ @entity_expansion_count = 0
168
+ @entity_expansion_limit = Security.entity_expansion_limit
169
+ @entity_expansion_text_limit = Security.entity_expansion_text_limit
170
+ @source.ensure_buffer
118
171
  end
119
172
 
120
173
  def add_listener( listener )
@@ -122,15 +175,24 @@ module REXML
122
175
  end
123
176
 
124
177
  attr_reader :source
178
+ attr_reader :entity_expansion_count
179
+ attr_writer :entity_expansion_limit
180
+ attr_writer :entity_expansion_text_limit
125
181
 
126
182
  def stream=( source )
127
183
  @source = SourceFactory.create_from( source )
184
+ reset
185
+ end
186
+
187
+ def reset
128
188
  @closed = nil
189
+ @have_root = false
129
190
  @document_status = nil
130
191
  @tags = []
131
192
  @stack = []
132
193
  @entities = []
133
- @nsstack = []
194
+ @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
195
+ @namespaces_restore_stack = []
134
196
  end
135
197
 
136
198
  def position
@@ -180,6 +242,8 @@ module REXML
180
242
 
181
243
  # Returns the next event. This is a +PullEvent+ object.
182
244
  def pull
245
+ @source.drop_parsed_content
246
+
183
247
  pull_event.tap do |event|
184
248
  @listeners.each do |listener|
185
249
  listener.receive event
@@ -192,215 +256,277 @@ module REXML
192
256
  x, @closed = @closed, nil
193
257
  return [ :end_element, x ]
194
258
  end
195
- return [ :end_document ] if empty?
259
+ if empty?
260
+ if @document_status == :in_doctype
261
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
262
+ end
263
+ unless @tags.empty?
264
+ path = "/" + @tags.join("/")
265
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
266
+ end
267
+ return [ :end_document ]
268
+ end
196
269
  return @stack.shift if @stack.size > 0
197
270
  #STDERR.puts @source.encoding
198
- @source.read if @source.buffer.size<2
199
271
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
272
+
273
+ @source.ensure_buffer
200
274
  if @document_status == nil
201
- #@source.consume( /^\s*/um )
202
- word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
203
- word = word[1] unless word.nil?
204
- #STDERR.puts "WORD = #{word.inspect}"
205
- case word
206
- when COMMENT_START
207
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
208
- when XMLDECL_START
209
- #STDERR.puts "XMLDECL"
210
- results = @source.match( XMLDECL_PATTERN, true )[1]
211
- version = VERSION.match( results )
212
- version = version[1] unless version.nil?
213
- encoding = ENCODING.match(results)
214
- encoding = encoding[1] unless encoding.nil?
215
- if need_source_encoding_update?(encoding)
216
- @source.encoding = encoding
217
- end
218
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
219
- encoding = "UTF-16"
220
- end
221
- standalone = STANDALONE.match(results)
222
- standalone = standalone[1] unless standalone.nil?
223
- return [ :xmldecl, version, encoding, standalone ]
224
- when INSTRUCTION_START
275
+ start_position = @source.position
276
+ if @source.match?("<?", true)
225
277
  return process_instruction
226
- when DOCTYPE_START
227
- md = @source.match( DOCTYPE_PATTERN, true )
228
- @nsstack.unshift(curr_ns=Set.new)
229
- identity = md[1]
230
- close = md[2]
231
- identity =~ IDENTITY
232
- name = $1
233
- raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
234
- pub_sys = $2.nil? ? nil : $2.strip
235
- long_name = $4.nil? ? nil : $4.strip
236
- uri = $6.nil? ? nil : $6.strip
237
- args = [ :start_doctype, name, pub_sys, long_name, uri ]
238
- if close == ">"
239
- @document_status = :after_doctype
240
- @source.read if @source.buffer.size<2
241
- md = @source.match(/^\s*/um, true)
242
- @stack << [ :end_doctype ]
278
+ elsif @source.match?("<!", true)
279
+ if @source.match?("--", true)
280
+ md = @source.match(/(.*?)-->/um, true)
281
+ if md.nil?
282
+ raise REXML::ParseException.new("Unclosed comment", @source)
283
+ end
284
+ if /--|-\z/.match?(md[1])
285
+ raise REXML::ParseException.new("Malformed comment", @source)
286
+ end
287
+ return [ :comment, md[1] ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.match?(/\s+/um, true)
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
298
+ end
299
+ name = parse_name(base_error_message)
300
+ @source.match?(/\s*/um, true) # skip spaces
301
+ if @source.match?("[", true)
302
+ id = [nil, nil, nil]
303
+ @document_status = :in_doctype
304
+ elsif @source.match?(">", true)
305
+ id = [nil, nil, nil]
306
+ @document_status = :after_doctype
307
+ @source.ensure_buffer
308
+ else
309
+ id = parse_id(base_error_message,
310
+ accept_external_id: true,
311
+ accept_public_id: false)
312
+ if id[0] == "SYSTEM"
313
+ # For backward compatibility
314
+ id[1], id[2] = id[2], nil
315
+ end
316
+ @source.match?(/\s*/um, true) # skip spaces
317
+ if @source.match?("[", true)
318
+ @document_status = :in_doctype
319
+ elsif @source.match?(">", true)
320
+ @document_status = :after_doctype
321
+ @source.ensure_buffer
322
+ else
323
+ message = "#{base_error_message}: garbage after external ID"
324
+ raise REXML::ParseException.new(message, @source)
325
+ end
326
+ end
327
+ args = [:start_doctype, name, *id]
328
+ if @document_status == :after_doctype
329
+ @source.match?(/\s*/um, true)
330
+ @stack << [ :end_doctype ]
331
+ end
332
+ return args
243
333
  else
244
- @document_status = :in_doctype
245
- end
246
- return args
247
- when /^\s+/
248
- else
249
- @document_status = :after_doctype
250
- @source.read if @source.buffer.size<2
251
- md = @source.match(/\s*/um, true)
252
- if @source.encoding == "UTF-8"
253
- @source.buffer.force_encoding(::Encoding::UTF_8)
334
+ message = "Invalid XML"
335
+ raise REXML::ParseException.new(message, @source)
254
336
  end
255
337
  end
256
338
  end
257
339
  if @document_status == :in_doctype
258
- md = @source.match(/\s*(.*?>)/um)
259
- case md[1]
260
- when SYSTEMENTITY
261
- match = @source.match( SYSTEMENTITY, true )[1]
262
- return [ :externalentity, match ]
263
-
264
- when ELEMENTDECL_START
265
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
266
-
267
- when ENTITY_START
268
- match = @source.match( ENTITYDECL, true ).to_a.compact
269
- match[0] = :entitydecl
270
- ref = false
271
- if match[1] == '%'
272
- ref = true
273
- match.delete_at 1
274
- end
275
- # Now we have to sort out what kind of entity reference this is
276
- if match[2] == 'SYSTEM'
277
- # External reference
278
- match[3] = match[3][1..-2] # PUBID
279
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
280
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
281
- elsif match[2] == 'PUBLIC'
282
- # External reference
283
- match[3] = match[3][1..-2] # PUBID
284
- match[4] = match[4][1..-2] # HREF
285
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
286
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
287
- else
288
- match[2] = match[2][1..-2]
289
- match.pop if match.size == 4
290
- # match is [ :entity, name, value ]
291
- end
292
- match << '%' if ref
293
- return match
294
- when ATTLISTDECL_START
295
- md = @source.match( ATTLISTDECL_PATTERN, true )
296
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
297
- element = md[1]
298
- contents = md[0]
299
-
300
- pairs = {}
301
- values = md[0].scan( ATTDEF_RE )
302
- values.each do |attdef|
303
- unless attdef[3] == "#IMPLIED"
304
- attdef.compact!
305
- val = attdef[3]
306
- val = attdef[4] if val == "#FIXED "
307
- pairs[attdef[0]] = val
308
- if attdef[0] =~ /^xmlns:(.*)/
309
- @nsstack[0] << $1
340
+ @source.match?(/\s*/um, true) # skip spaces
341
+ start_position = @source.position
342
+ if @source.match?("<!", true)
343
+ if @source.match?("ELEMENT", true)
344
+ md = @source.match(/(.*?)>/um, true)
345
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
346
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
347
+ elsif @source.match?("ENTITY", true)
348
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
349
+ unless match_data
350
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
351
+ end
352
+ match = [:entitydecl, *match_data.captures.compact]
353
+ ref = false
354
+ if match[1] == '%'
355
+ ref = true
356
+ match.delete_at 1
357
+ end
358
+ # Now we have to sort out what kind of entity reference this is
359
+ if match[2] == 'SYSTEM'
360
+ # External reference
361
+ match[3] = match[3][1..-2] # PUBID
362
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
363
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
364
+ elsif match[2] == 'PUBLIC'
365
+ # External reference
366
+ match[3] = match[3][1..-2] # PUBID
367
+ match[4] = match[4][1..-2] # HREF
368
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
369
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
370
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
371
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
372
+ else
373
+ match[2] = match[2][1..-2]
374
+ match.pop if match.size == 4
375
+ # match is [ :entity, name, value ]
376
+ end
377
+ match << '%' if ref
378
+ return match
379
+ elsif @source.match?("ATTLIST", true)
380
+ md = @source.match(Private::ATTLISTDECL_END, true)
381
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
382
+ element = md[1]
383
+ contents = "<!ATTLIST" + md[0]
384
+
385
+ pairs = {}
386
+ values = md[0].strip.scan( ATTDEF_RE )
387
+ values.each do |attdef|
388
+ unless attdef[3] == "#IMPLIED"
389
+ attdef.compact!
390
+ val = attdef[3]
391
+ val = attdef[4] if val == "#FIXED "
392
+ pairs[attdef[0]] = val
393
+ if attdef[0] =~ /^xmlns:(.*)/
394
+ @namespaces[$1] = val
395
+ end
310
396
  end
311
397
  end
398
+ return [ :attlistdecl, element, pairs, contents ]
399
+ elsif @source.match?("NOTATION", true)
400
+ base_error_message = "Malformed notation declaration"
401
+ unless @source.match?(/\s+/um, true)
402
+ if @source.match?(">")
403
+ message = "#{base_error_message}: name is missing"
404
+ else
405
+ message = "#{base_error_message}: invalid name"
406
+ end
407
+ @source.position = start_position
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ name = parse_name(base_error_message)
411
+ id = parse_id(base_error_message,
412
+ accept_external_id: true,
413
+ accept_public_id: true)
414
+ @source.match?(/\s*/um, true) # skip spaces
415
+ unless @source.match?(">", true)
416
+ message = "#{base_error_message}: garbage before end >"
417
+ raise REXML::ParseException.new(message, @source)
418
+ end
419
+ return [:notationdecl, name, *id]
420
+ elsif md = @source.match(/--(.*?)-->/um, true)
421
+ case md[1]
422
+ when /--/, /-\z/
423
+ raise REXML::ParseException.new("Malformed comment", @source)
424
+ end
425
+ return [ :comment, md[1] ] if md
312
426
  end
313
- return [ :attlistdecl, element, pairs, contents ]
314
- when NOTATIONDECL_START
315
- md = nil
316
- if @source.match( PUBLIC )
317
- md = @source.match( PUBLIC, true )
318
- vals = [md[1],md[2],md[4],md[6]]
319
- elsif @source.match( SYSTEM )
320
- md = @source.match( SYSTEM, true )
321
- vals = [md[1],md[2],nil,md[4]]
322
- else
323
- raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
324
- end
325
- return [ :notationdecl, *vals ]
326
- when DOCTYPE_END
427
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
428
+ return [ :externalentity, match[1] ]
429
+ elsif @source.match?(/\]\s*>/um, true)
327
430
  @document_status = :after_doctype
328
- @source.match( DOCTYPE_END, true )
329
431
  return [ :end_doctype ]
330
432
  end
433
+ if @document_status == :in_doctype
434
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
435
+ end
436
+ end
437
+ if @document_status == :after_doctype
438
+ @source.match?(/\s*/um, true)
331
439
  end
332
440
  begin
333
- if @source.buffer[0] == ?<
334
- if @source.buffer[1] == ?/
335
- @nsstack.shift
441
+ start_position = @source.position
442
+ if @source.match?("<", true)
443
+ # :text's read_until may remain only "<" in buffer. In the
444
+ # case, buffer is empty here. So we need to fill buffer
445
+ # here explicitly.
446
+ @source.ensure_buffer
447
+ if @source.match?("/", true)
448
+ @namespaces_restore_stack.pop
336
449
  last_tag = @tags.pop
337
- md = @source.match( CLOSE_MATCH, true )
450
+ md = @source.match(Private::CLOSE_PATTERN, true)
338
451
  if md and !last_tag
339
452
  message = "Unexpected top-level end tag (got '#{md[1]}')"
340
453
  raise REXML::ParseException.new(message, @source)
341
454
  end
342
455
  if md.nil? or last_tag != md[1]
343
456
  message = "Missing end tag for '#{last_tag}'"
344
- message << " (got '#{md[1]}')" if md
457
+ message += " (got '#{md[1]}')" if md
458
+ @source.position = start_position if md.nil?
345
459
  raise REXML::ParseException.new(message, @source)
346
460
  end
347
461
  return [ :end_element, last_tag ]
348
- elsif @source.buffer[1] == ?!
349
- md = @source.match(/\A(\s*[^>]*>)/um)
462
+ elsif @source.match?("!", true)
463
+ md = @source.match(/([^>]*>)/um)
350
464
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
351
465
  raise REXML::ParseException.new("Malformed node", @source) unless md
352
- if md[0][2] == ?-
353
- md = @source.match( COMMENT_PATTERN, true )
466
+ if md[0][0] == ?-
467
+ md = @source.match(/--(.*?)-->/um, true)
354
468
 
355
- case md[1]
356
- when /--/, /-\z/
469
+ if md.nil? || /--|-\z/.match?(md[1])
357
470
  raise REXML::ParseException.new("Malformed comment", @source)
358
471
  end
359
472
 
360
- return [ :comment, md[1] ] if md
473
+ return [ :comment, md[1] ]
361
474
  else
362
- md = @source.match( CDATA_PATTERN, true )
475
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
363
476
  return [ :cdata, md[1] ] if md
364
477
  end
365
478
  raise REXML::ParseException.new( "Declarations can only occur "+
366
479
  "in the doctype declaration.", @source)
367
- elsif @source.buffer[1] == ??
480
+ elsif @source.match?("?", true)
368
481
  return process_instruction
369
482
  else
370
483
  # Get the next tag
371
- md = @source.match(TAG_MATCH, true)
484
+ md = @source.match(Private::TAG_PATTERN, true)
372
485
  unless md
486
+ @source.position = start_position
373
487
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
374
488
  end
375
- prefixes = Set.new
376
- prefixes << md[2] if md[2]
377
- @nsstack.unshift(curr_ns=Set.new)
378
- attributes, closed = parse_attributes(prefixes, curr_ns)
489
+ tag = md[1]
490
+ @document_status = :in_element
491
+ @prefixes.clear
492
+ @prefixes << md[2] if md[2]
493
+ push_namespaces_restore
494
+ attributes, closed = parse_attributes(@prefixes)
379
495
  # Verify that all of the prefixes have been defined
380
- for prefix in prefixes
381
- unless @nsstack.find{|k| k.member?(prefix)}
496
+ for prefix in @prefixes
497
+ unless @namespaces.key?(prefix)
382
498
  raise UndefinedNamespaceException.new(prefix,@source,self)
383
499
  end
384
500
  end
385
501
 
386
502
  if closed
387
- @closed = md[1]
388
- @nsstack.shift
503
+ @closed = tag
504
+ pop_namespaces_restore
389
505
  else
390
- @tags.push( md[1] )
506
+ if @tags.empty? and @have_root
507
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
508
+ end
509
+ @tags.push( tag )
391
510
  end
392
- return [ :start_element, md[1], attributes ]
511
+ @have_root = true
512
+ return [ :start_element, tag, attributes ]
393
513
  end
394
514
  else
395
- md = @source.match( TEXT_PATTERN, true )
396
- if md[0].length == 0
397
- @source.match( /(\s+)/, true )
515
+ text = @source.read_until("<")
516
+ if text.chomp!("<")
517
+ @source.position -= "<".bytesize
398
518
  end
399
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
400
- #return [ :text, "" ] if md[0].length == 0
401
- # unnormalized = Text::unnormalize( md[1], self )
402
- # return PullEvent.new( :text, md[1], unnormalized )
403
- return [ :text, md[1] ]
519
+ if @tags.empty?
520
+ unless /\A\s*\z/.match?(text)
521
+ if @have_root
522
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
523
+ else
524
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
525
+ end
526
+ end
527
+ return pull_event if @have_root
528
+ end
529
+ return [ :text, text ]
404
530
  end
405
531
  rescue REXML::UndefinedNamespaceException
406
532
  raise
@@ -415,13 +541,13 @@ module REXML
415
541
  private :pull_event
416
542
 
417
543
  def entity( reference, entities )
418
- value = nil
419
- value = entities[ reference ] if entities
420
- if not value
421
- value = DEFAULT_ENTITIES[ reference ]
422
- value = value[2] if value
423
- end
424
- unnormalize( value, entities ) if value
544
+ return unless entities
545
+
546
+ value = entities[ reference ]
547
+ return if value.nil?
548
+
549
+ record_entity_expansion
550
+ unnormalize( value, entities )
425
551
  end
426
552
 
427
553
  # Escapes all possible entities
@@ -442,132 +568,299 @@ module REXML
442
568
 
443
569
  # Unescapes all possible entities
444
570
  def unnormalize( string, entities=nil, filter=nil )
445
- rv = string.clone
446
- rv.gsub!( /\r\n?/, "\n" )
571
+ if string.include?("\r")
572
+ rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
573
+ else
574
+ rv = string.dup
575
+ end
447
576
  matches = rv.scan( REFERENCE_RE )
448
577
  return rv if matches.size == 0
449
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
578
+ rv.gsub!( Private::CHARACTER_REFERENCES ) {
450
579
  m=$1
451
- m = "0#{m}" if m[0] == ?x
452
- [Integer(m)].pack('U*')
580
+ if m.start_with?("x")
581
+ code_point = Integer(m[1..-1], 16)
582
+ else
583
+ code_point = Integer(m, 10)
584
+ end
585
+ [code_point].pack('U*')
453
586
  }
454
587
  matches.collect!{|x|x[0]}.compact!
588
+ if filter
589
+ matches.reject! do |entity_reference|
590
+ filter.include?(entity_reference)
591
+ end
592
+ end
455
593
  if matches.size > 0
456
- matches.each do |entity_reference|
457
- unless filter and filter.include?(entity_reference)
458
- entity_value = entity( entity_reference, entities )
459
- if entity_value
460
- re = /&#{entity_reference};/
461
- rv.gsub!( re, entity_value )
462
- else
463
- er = DEFAULT_ENTITIES[entity_reference]
464
- rv.gsub!( er[0], er[2] ) if er
594
+ matches.tally.each do |entity_reference, n|
595
+ entity_expansion_count_before = @entity_expansion_count
596
+ entity_value = entity( entity_reference, entities )
597
+ if entity_value
598
+ if n > 1
599
+ entity_expansion_count_delta =
600
+ @entity_expansion_count - entity_expansion_count_before
601
+ record_entity_expansion(entity_expansion_count_delta * (n - 1))
465
602
  end
603
+ re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
604
+ rv.gsub!( re, entity_value )
605
+ if rv.bytesize > @entity_expansion_text_limit
606
+ raise "entity expansion has grown too large"
607
+ end
608
+ else
609
+ er = DEFAULT_ENTITIES[entity_reference]
610
+ rv.gsub!( er[0], er[2] ) if er
466
611
  end
467
612
  end
468
- rv.gsub!( /&amp;/, '&' )
613
+ rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
469
614
  end
470
615
  rv
471
616
  end
472
617
 
473
618
  private
619
+ def add_namespace(prefix, uri)
620
+ @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
621
+ if uri.nil?
622
+ @namespaces.delete(prefix)
623
+ else
624
+ @namespaces[prefix] = uri
625
+ end
626
+ end
627
+
628
+ def push_namespaces_restore
629
+ namespaces_restore = {}
630
+ @namespaces_restore_stack.push(namespaces_restore)
631
+ namespaces_restore
632
+ end
633
+
634
+ def pop_namespaces_restore
635
+ namespaces_restore = @namespaces_restore_stack.pop
636
+ namespaces_restore.each do |prefix, uri|
637
+ if uri.nil?
638
+ @namespaces.delete(prefix)
639
+ else
640
+ @namespaces[prefix] = uri
641
+ end
642
+ end
643
+ end
644
+
645
+ def record_entity_expansion(delta=1)
646
+ @entity_expansion_count += delta
647
+ if @entity_expansion_count > @entity_expansion_limit
648
+ raise "number of entity expansions exceeded, processing aborted."
649
+ end
650
+ end
651
+
474
652
  def need_source_encoding_update?(xml_declaration_encoding)
475
653
  return false if xml_declaration_encoding.nil?
476
654
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding
477
655
  true
478
656
  end
479
657
 
480
- def process_instruction
481
- match_data = @source.match(INSTRUCTION_PATTERN, true)
482
- unless match_data
483
- message = "Invalid processing instruction node"
658
+ def parse_name(base_error_message)
659
+ md = @source.match(Private::NAME_PATTERN, true)
660
+ unless md
661
+ if @source.match?(/\S/um)
662
+ message = "#{base_error_message}: invalid name"
663
+ else
664
+ message = "#{base_error_message}: name is missing"
665
+ end
484
666
  raise REXML::ParseException.new(message, @source)
485
667
  end
486
- [:processing_instruction, match_data[1], match_data[2]]
668
+ md[0]
487
669
  end
488
670
 
489
- def parse_attributes(prefixes, curr_ns)
490
- attributes = {}
491
- closed = false
492
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
493
- if match_data.nil?
494
- message = "Start tag isn't ended"
671
+ def parse_id(base_error_message,
672
+ accept_external_id:,
673
+ accept_public_id:)
674
+ if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
675
+ pubid = system = nil
676
+ pubid_literal = md[1]
677
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
678
+ system_literal = md[2]
679
+ system = system_literal[1..-2] if system_literal # Remove quote
680
+ ["PUBLIC", pubid, system]
681
+ elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
682
+ pubid = system = nil
683
+ pubid_literal = md[1]
684
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
685
+ ["PUBLIC", pubid, nil]
686
+ elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
687
+ system = nil
688
+ system_literal = md[1]
689
+ system = system_literal[1..-2] if system_literal # Remove quote
690
+ ["SYSTEM", nil, system]
691
+ else
692
+ details = parse_id_invalid_details(accept_external_id: accept_external_id,
693
+ accept_public_id: accept_public_id)
694
+ message = "#{base_error_message}: #{details}"
495
695
  raise REXML::ParseException.new(message, @source)
496
696
  end
697
+ end
698
+
699
+ def parse_id_invalid_details(accept_external_id:,
700
+ accept_public_id:)
701
+ public = /\A\s*PUBLIC/um
702
+ system = /\A\s*SYSTEM/um
703
+ if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
704
+ if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
705
+ return "public ID literal is missing"
706
+ end
707
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
708
+ return "invalid public ID literal"
709
+ end
710
+ if accept_public_id
711
+ if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
712
+ return "system ID literal is missing"
713
+ end
714
+ unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
715
+ return "invalid system literal"
716
+ end
717
+ "garbage after system literal"
718
+ else
719
+ "garbage after public ID literal"
720
+ end
721
+ elsif accept_external_id and @source.match?(/#{system}/um)
722
+ if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
723
+ return "system literal is missing"
724
+ end
725
+ unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
726
+ return "invalid system literal"
727
+ end
728
+ "garbage after system literal"
729
+ else
730
+ unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
731
+ return "invalid ID type"
732
+ end
733
+ "ID type is missing"
734
+ end
735
+ end
497
736
 
498
- raw_attributes = match_data[1]
499
- closed = !match_data[2].nil?
500
- return attributes, closed if raw_attributes.nil?
501
- return attributes, closed if raw_attributes.empty?
737
+ def process_instruction
738
+ name = parse_name("Malformed XML: Invalid processing instruction node")
739
+ if @source.match?(/\s+/um, true)
740
+ match_data = @source.match(/(.*?)\?>/um, true)
741
+ unless match_data
742
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
743
+ end
744
+ content = match_data[1]
745
+ else
746
+ content = nil
747
+ unless @source.match?("?>", true)
748
+ raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
749
+ end
750
+ end
751
+ if name == "xml"
752
+ if @document_status
753
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
754
+ end
755
+ version = VERSION.match(content)
756
+ version = version[1] unless version.nil?
757
+ encoding = ENCODING.match(content)
758
+ encoding = encoding[1] unless encoding.nil?
759
+ if need_source_encoding_update?(encoding)
760
+ @source.encoding = encoding
761
+ end
762
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
763
+ encoding = "UTF-16"
764
+ end
765
+ standalone = STANDALONE.match(content)
766
+ standalone = standalone[1] unless standalone.nil?
767
+ return [ :xmldecl, version, encoding, standalone ]
768
+ end
769
+ [:processing_instruction, name, content]
770
+ end
502
771
 
503
- scanner = StringScanner.new(raw_attributes)
504
- until scanner.eos?
505
- if scanner.scan(/\s+/)
506
- break if scanner.eos?
772
+ if StringScanner::Version < "3.1.1"
773
+ def scan_quote
774
+ @source.match(/(['"])/, true)&.[](1)
775
+ end
776
+ else
777
+ def scan_quote
778
+ case @source.peek_byte
779
+ when 34 # '"'.ord
780
+ @source.scan_byte
781
+ '"'
782
+ when 39 # "'".ord
783
+ @source.scan_byte
784
+ "'"
785
+ else
786
+ nil
507
787
  end
788
+ end
789
+ end
508
790
 
509
- pos = scanner.pos
510
- loop do
511
- break if scanner.scan(ATTRIBUTE_PATTERN)
512
- unless scanner.scan(QNAME)
513
- message = "Invalid attribute name: <#{scanner.rest}>"
514
- raise REXML::ParseException.new(message, @source)
515
- end
516
- name = scanner[0]
517
- unless scanner.scan(/\s*=\s*/um)
791
+ def parse_attributes(prefixes)
792
+ attributes = {}
793
+ expanded_names = {}
794
+ closed = false
795
+ while true
796
+ if @source.match?(">", true)
797
+ return attributes, closed
798
+ elsif @source.match?("/>", true)
799
+ closed = true
800
+ return attributes, closed
801
+ elsif match = @source.match(QNAME, true)
802
+ name = match[1]
803
+ prefix = match[2]
804
+ local_part = match[3]
805
+
806
+ unless @source.match?(/\s*=\s*/um, true)
518
807
  message = "Missing attribute equal: <#{name}>"
519
808
  raise REXML::ParseException.new(message, @source)
520
809
  end
521
- quote = scanner.scan(/['"]/)
522
- unless quote
810
+ unless quote = scan_quote
523
811
  message = "Missing attribute value start quote: <#{name}>"
524
812
  raise REXML::ParseException.new(message, @source)
525
813
  end
526
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
527
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
528
- if match_data
529
- scanner << "/" if closed
530
- scanner << ">"
531
- scanner << match_data[1]
532
- scanner.pos = pos
533
- closed = !match_data[2].nil?
534
- next
535
- end
536
- message =
537
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
814
+ start_position = @source.position
815
+ value = @source.read_until(quote)
816
+ unless value.chomp!(quote)
817
+ @source.position = start_position
818
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
538
819
  raise REXML::ParseException.new(message, @source)
539
820
  end
540
- end
541
- name = scanner[1]
542
- prefix = scanner[2]
543
- local_part = scanner[3]
544
- # quote = scanner[4]
545
- value = scanner[5]
546
- if prefix == "xmlns"
547
- if local_part == "xml"
548
- if value != "http://www.w3.org/XML/1998/namespace"
549
- msg = "The 'xml' prefix must not be bound to any other namespace "+
821
+ @source.match?(/\s*/um, true)
822
+ if prefix == "xmlns"
823
+ if local_part == "xml"
824
+ if value != Private::XML_PREFIXED_NAMESPACE
825
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
826
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
827
+ raise REXML::ParseException.new( msg, @source, self )
828
+ end
829
+ elsif local_part == "xmlns"
830
+ msg = "The 'xmlns' prefix must not be declared "+
550
831
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
551
- raise REXML::ParseException.new( msg, @source, self )
832
+ raise REXML::ParseException.new( msg, @source, self)
552
833
  end
553
- elsif local_part == "xmlns"
554
- msg = "The 'xmlns' prefix must not be declared "+
555
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
556
- raise REXML::ParseException.new( msg, @source, self)
834
+ add_namespace(local_part, value)
835
+ elsif prefix
836
+ prefixes << prefix unless prefix == "xml"
557
837
  end
558
- curr_ns << local_part
559
- elsif prefix
560
- prefixes << prefix unless prefix == "xml"
561
- end
562
838
 
563
- if attributes.has_key?(name)
564
- msg = "Duplicate attribute #{name.inspect}"
565
- raise REXML::ParseException.new(msg, @source, self)
566
- end
839
+ if attributes[name]
840
+ msg = "Duplicate attribute #{name.inspect}"
841
+ raise REXML::ParseException.new(msg, @source, self)
842
+ end
843
+
844
+ unless prefix == "xmlns"
845
+ uri = @namespaces[prefix]
846
+ expanded_name = [uri, local_part]
847
+ existing_prefix = expanded_names[expanded_name]
848
+ if existing_prefix
849
+ message = "Namespace conflict in adding attribute " +
850
+ "\"#{local_part}\": " +
851
+ "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
852
+ "prefix \"#{prefix}\" = \"#{uri}\""
853
+ raise REXML::ParseException.new(message, @source, self)
854
+ end
855
+ expanded_names[expanded_name] = prefix
856
+ end
567
857
 
568
- attributes[name] = value
858
+ attributes[name] = value
859
+ else
860
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
861
+ raise REXML::ParseException.new(message, @source)
862
+ end
569
863
  end
570
- return attributes, closed
571
864
  end
572
865
  end
573
866
  end