rexml 3.2.5 → 3.3.6
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/NEWS.md +406 -2
- data/README.md +10 -1
- data/doc/rexml/tasks/rdoc/element.rdoc +2 -2
- data/doc/rexml/tutorial.rdoc +1358 -0
- data/lib/rexml/attribute.rb +14 -9
- data/lib/rexml/document.rb +1 -1
- data/lib/rexml/element.rb +19 -34
- data/lib/rexml/entity.rb +5 -37
- data/lib/rexml/formatters/pretty.rb +3 -3
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/namespace.rb +8 -4
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +421 -263
- data/lib/rexml/parsers/pullparser.rb +4 -0
- data/lib/rexml/parsers/sax2parser.rb +6 -19
- data/lib/rexml/parsers/streamparser.rb +8 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/parsers/xpathparser.rb +136 -86
- data/lib/rexml/rexml.rb +3 -1
- data/lib/rexml/source.rb +128 -98
- data/lib/rexml/text.rb +40 -18
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +11 -39
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal: false
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,30 @@ module REXML
|
|
112
140
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
end
|
160
|
+
private_constant :Private
|
161
|
+
|
115
162
|
def initialize( source )
|
116
163
|
self.stream = source
|
117
164
|
@listeners = []
|
165
|
+
@prefixes = Set.new
|
166
|
+
@entity_expansion_count = 0
|
118
167
|
end
|
119
168
|
|
120
169
|
def add_listener( listener )
|
@@ -122,15 +171,18 @@ module REXML
|
|
122
171
|
end
|
123
172
|
|
124
173
|
attr_reader :source
|
174
|
+
attr_reader :entity_expansion_count
|
125
175
|
|
126
176
|
def stream=( source )
|
127
177
|
@source = SourceFactory.create_from( source )
|
128
178
|
@closed = nil
|
179
|
+
@have_root = false
|
129
180
|
@document_status = nil
|
130
181
|
@tags = []
|
131
182
|
@stack = []
|
132
183
|
@entities = []
|
133
|
-
@nsstack = []
|
184
|
+
@namespaces = {}
|
185
|
+
@namespaces_restore_stack = []
|
134
186
|
end
|
135
187
|
|
136
188
|
def position
|
@@ -180,6 +232,8 @@ module REXML
|
|
180
232
|
|
181
233
|
# Returns the next event. This is a +PullEvent+ object.
|
182
234
|
def pull
|
235
|
+
@source.drop_parsed_content
|
236
|
+
|
183
237
|
pull_event.tap do |event|
|
184
238
|
@listeners.each do |listener|
|
185
239
|
listener.receive event
|
@@ -192,236 +246,274 @@ module REXML
|
|
192
246
|
x, @closed = @closed, nil
|
193
247
|
return [ :end_element, x ]
|
194
248
|
end
|
195
|
-
|
249
|
+
if empty?
|
250
|
+
if @document_status == :in_doctype
|
251
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
252
|
+
end
|
253
|
+
unless @tags.empty?
|
254
|
+
path = "/" + @tags.join("/")
|
255
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
256
|
+
end
|
257
|
+
return [ :end_document ]
|
258
|
+
end
|
196
259
|
return @stack.shift if @stack.size > 0
|
197
260
|
#STDERR.puts @source.encoding
|
198
261
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
262
|
+
|
263
|
+
@source.ensure_buffer
|
199
264
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
265
|
+
start_position = @source.position
|
266
|
+
if @source.match("<?", true)
|
223
267
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
268
|
+
elsif @source.match("<!", true)
|
269
|
+
if @source.match("--", true)
|
270
|
+
md = @source.match(/(.*?)-->/um, true)
|
271
|
+
if md.nil?
|
272
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
242
273
|
end
|
243
|
-
if
|
274
|
+
if /--|-\z/.match?(md[1])
|
275
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
276
|
+
end
|
277
|
+
return [ :comment, md[1] ]
|
278
|
+
elsif @source.match("DOCTYPE", true)
|
279
|
+
base_error_message = "Malformed DOCTYPE"
|
280
|
+
unless @source.match(/\s+/um, true)
|
281
|
+
if @source.match(">")
|
282
|
+
message = "#{base_error_message}: name is missing"
|
283
|
+
else
|
284
|
+
message = "#{base_error_message}: invalid name"
|
285
|
+
end
|
286
|
+
@source.position = start_position
|
287
|
+
raise REXML::ParseException.new(message, @source)
|
288
|
+
end
|
289
|
+
name = parse_name(base_error_message)
|
290
|
+
if @source.match(/\s*\[/um, true)
|
291
|
+
id = [nil, nil, nil]
|
244
292
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
293
|
+
elsif @source.match(/\s*>/um, true)
|
294
|
+
id = [nil, nil, nil]
|
246
295
|
@document_status = :after_doctype
|
296
|
+
@source.ensure_buffer
|
247
297
|
else
|
248
|
-
|
249
|
-
|
298
|
+
id = parse_id(base_error_message,
|
299
|
+
accept_external_id: true,
|
300
|
+
accept_public_id: false)
|
301
|
+
if id[0] == "SYSTEM"
|
302
|
+
# For backward compatibility
|
303
|
+
id[1], id[2] = id[2], nil
|
304
|
+
end
|
305
|
+
if @source.match(/\s*\[/um, true)
|
306
|
+
@document_status = :in_doctype
|
307
|
+
elsif @source.match(/\s*>/um, true)
|
308
|
+
@document_status = :after_doctype
|
309
|
+
@source.ensure_buffer
|
310
|
+
else
|
311
|
+
message = "#{base_error_message}: garbage after external ID"
|
312
|
+
raise REXML::ParseException.new(message, @source)
|
313
|
+
end
|
250
314
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
315
|
+
args = [:start_doctype, name, *id]
|
316
|
+
if @document_status == :after_doctype
|
317
|
+
@source.match(/\s*/um, true)
|
318
|
+
@stack << [ :end_doctype ]
|
319
|
+
end
|
320
|
+
return args
|
321
|
+
else
|
322
|
+
message = "Invalid XML"
|
323
|
+
raise REXML::ParseException.new(message, @source)
|
263
324
|
end
|
264
325
|
end
|
265
326
|
end
|
266
327
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
328
|
+
@source.match(/\s*/um, true) # skip spaces
|
329
|
+
start_position = @source.position
|
330
|
+
if @source.match("<!", true)
|
331
|
+
if @source.match("ELEMENT", true)
|
332
|
+
md = @source.match(/(.*?)>/um, true)
|
333
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
334
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
335
|
+
elsif @source.match("ENTITY", true)
|
336
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
337
|
+
unless match_data
|
338
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
339
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
340
|
+
match = [:entitydecl, *match_data.captures.compact]
|
341
|
+
ref = false
|
342
|
+
if match[1] == '%'
|
343
|
+
ref = true
|
344
|
+
match.delete_at 1
|
345
|
+
end
|
346
|
+
# Now we have to sort out what kind of entity reference this is
|
347
|
+
if match[2] == 'SYSTEM'
|
348
|
+
# External reference
|
349
|
+
match[3] = match[3][1..-2] # PUBID
|
350
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
351
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
352
|
+
elsif match[2] == 'PUBLIC'
|
353
|
+
# External reference
|
354
|
+
match[3] = match[3][1..-2] # PUBID
|
355
|
+
match[4] = match[4][1..-2] # HREF
|
356
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
357
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
358
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
359
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
360
|
else
|
329
|
-
|
361
|
+
match[2] = match[2][1..-2]
|
362
|
+
match.pop if match.size == 4
|
363
|
+
# match is [ :entity, name, value ]
|
330
364
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
365
|
+
match << '%' if ref
|
366
|
+
return match
|
367
|
+
elsif @source.match("ATTLIST", true)
|
368
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
369
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
370
|
+
element = md[1]
|
371
|
+
contents = md[0]
|
372
|
+
|
373
|
+
pairs = {}
|
374
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
375
|
+
values.each do |attdef|
|
376
|
+
unless attdef[3] == "#IMPLIED"
|
377
|
+
attdef.compact!
|
378
|
+
val = attdef[3]
|
379
|
+
val = attdef[4] if val == "#FIXED "
|
380
|
+
pairs[attdef[0]] = val
|
381
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
382
|
+
@namespaces[$1] = val
|
383
|
+
end
|
384
|
+
end
|
385
|
+
end
|
386
|
+
return [ :attlistdecl, element, pairs, contents ]
|
387
|
+
elsif @source.match("NOTATION", true)
|
388
|
+
base_error_message = "Malformed notation declaration"
|
389
|
+
unless @source.match(/\s+/um, true)
|
390
|
+
if @source.match(">")
|
391
|
+
message = "#{base_error_message}: name is missing"
|
392
|
+
else
|
393
|
+
message = "#{base_error_message}: invalid name"
|
394
|
+
end
|
395
|
+
@source.position = start_position
|
396
|
+
raise REXML::ParseException.new(message, @source)
|
397
|
+
end
|
398
|
+
name = parse_name(base_error_message)
|
399
|
+
id = parse_id(base_error_message,
|
400
|
+
accept_external_id: true,
|
401
|
+
accept_public_id: true)
|
402
|
+
unless @source.match(/\s*>/um, true)
|
403
|
+
message = "#{base_error_message}: garbage before end >"
|
404
|
+
raise REXML::ParseException.new(message, @source)
|
405
|
+
end
|
406
|
+
return [:notationdecl, name, *id]
|
407
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
408
|
+
case md[1]
|
409
|
+
when /--/, /-\z/
|
410
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
411
|
+
end
|
412
|
+
return [ :comment, md[1] ] if md
|
340
413
|
end
|
341
|
-
|
342
|
-
|
414
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
415
|
+
return [ :externalentity, match[1] ]
|
416
|
+
elsif @source.match(/\]\s*>/um, true)
|
343
417
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
418
|
return [ :end_doctype ]
|
346
419
|
end
|
420
|
+
if @document_status == :in_doctype
|
421
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
422
|
+
end
|
347
423
|
end
|
348
424
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
425
|
+
@source.match(/\s*/um, true)
|
350
426
|
end
|
351
427
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
428
|
+
start_position = @source.position
|
429
|
+
if @source.match("<", true)
|
430
|
+
# :text's read_until may remain only "<" in buffer. In the
|
431
|
+
# case, buffer is empty here. So we need to fill buffer
|
432
|
+
# here explicitly.
|
433
|
+
@source.ensure_buffer
|
434
|
+
if @source.match("/", true)
|
435
|
+
@namespaces_restore_stack.pop
|
356
436
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
437
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
438
|
if md and !last_tag
|
359
439
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
440
|
raise REXML::ParseException.new(message, @source)
|
361
441
|
end
|
362
442
|
if md.nil? or last_tag != md[1]
|
363
443
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
444
|
+
message += " (got '#{md[1]}')" if md
|
445
|
+
@source.position = start_position if md.nil?
|
365
446
|
raise REXML::ParseException.new(message, @source)
|
366
447
|
end
|
367
448
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
449
|
+
elsif @source.match("!", true)
|
450
|
+
md = @source.match(/([^>]*>)/um)
|
370
451
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
452
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
453
|
+
if md[0][0] == ?-
|
454
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
455
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
456
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
457
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
458
|
end
|
379
459
|
|
380
|
-
return [ :comment, md[1] ]
|
460
|
+
return [ :comment, md[1] ]
|
381
461
|
else
|
382
|
-
md = @source.match(
|
462
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
463
|
return [ :cdata, md[1] ] if md
|
384
464
|
end
|
385
465
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
466
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
467
|
+
elsif @source.match("?", true)
|
388
468
|
return process_instruction
|
389
469
|
else
|
390
470
|
# Get the next tag
|
391
|
-
md = @source.match(
|
471
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
472
|
unless md
|
473
|
+
@source.position = start_position
|
393
474
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
475
|
end
|
476
|
+
tag = md[1]
|
395
477
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
478
|
+
@prefixes.clear
|
479
|
+
@prefixes << md[2] if md[2]
|
480
|
+
push_namespaces_restore
|
481
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
482
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
483
|
+
for prefix in @prefixes
|
484
|
+
unless @namespaces.key?(prefix)
|
403
485
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
486
|
end
|
405
487
|
end
|
406
488
|
|
407
489
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
490
|
+
@closed = tag
|
491
|
+
pop_namespaces_restore
|
410
492
|
else
|
411
|
-
@tags.
|
493
|
+
if @tags.empty? and @have_root
|
494
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
495
|
+
end
|
496
|
+
@tags.push( tag )
|
412
497
|
end
|
413
|
-
|
498
|
+
@have_root = true
|
499
|
+
return [ :start_element, tag, attributes ]
|
414
500
|
end
|
415
501
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
502
|
+
text = @source.read_until("<")
|
503
|
+
if text.chomp!("<")
|
504
|
+
@source.position -= "<".bytesize
|
419
505
|
end
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
506
|
+
if @tags.empty?
|
507
|
+
unless /\A\s*\z/.match?(text)
|
508
|
+
if @have_root
|
509
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
510
|
+
else
|
511
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
512
|
+
end
|
513
|
+
end
|
514
|
+
return pull_event if @have_root
|
515
|
+
end
|
516
|
+
return [ :text, text ]
|
425
517
|
end
|
426
518
|
rescue REXML::UndefinedNamespaceException
|
427
519
|
raise
|
@@ -436,13 +528,13 @@ module REXML
|
|
436
528
|
private :pull_event
|
437
529
|
|
438
530
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
531
|
+
return unless entities
|
532
|
+
|
533
|
+
value = entities[ reference ]
|
534
|
+
return if value.nil?
|
535
|
+
|
536
|
+
record_entity_expansion
|
537
|
+
unnormalize( value, entities )
|
446
538
|
end
|
447
539
|
|
448
540
|
# Escapes all possible entities
|
@@ -463,35 +555,83 @@ module REXML
|
|
463
555
|
|
464
556
|
# Unescapes all possible entities
|
465
557
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
558
|
+
if string.include?("\r")
|
559
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
560
|
+
else
|
561
|
+
rv = string.dup
|
562
|
+
end
|
468
563
|
matches = rv.scan( REFERENCE_RE )
|
469
564
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
565
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
566
|
m=$1
|
472
567
|
m = "0#{m}" if m[0] == ?x
|
473
568
|
[Integer(m)].pack('U*')
|
474
569
|
}
|
475
570
|
matches.collect!{|x|x[0]}.compact!
|
571
|
+
if filter
|
572
|
+
matches.reject! do |entity_reference|
|
573
|
+
filter.include?(entity_reference)
|
574
|
+
end
|
575
|
+
end
|
476
576
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
577
|
+
matches.tally.each do |entity_reference, n|
|
578
|
+
entity_expansion_count_before = @entity_expansion_count
|
579
|
+
entity_value = entity( entity_reference, entities )
|
580
|
+
if entity_value
|
581
|
+
if n > 1
|
582
|
+
entity_expansion_count_delta =
|
583
|
+
@entity_expansion_count - entity_expansion_count_before
|
584
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
585
|
+
end
|
586
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
587
|
+
rv.gsub!( re, entity_value )
|
588
|
+
if rv.bytesize > Security.entity_expansion_text_limit
|
589
|
+
raise "entity expansion has grown too large"
|
486
590
|
end
|
591
|
+
else
|
592
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
593
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
594
|
end
|
488
595
|
end
|
489
|
-
rv.gsub!(
|
596
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
597
|
end
|
491
598
|
rv
|
492
599
|
end
|
493
600
|
|
494
601
|
private
|
602
|
+
def add_namespace(prefix, uri)
|
603
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
604
|
+
if uri.nil?
|
605
|
+
@namespaces.delete(prefix)
|
606
|
+
else
|
607
|
+
@namespaces[prefix] = uri
|
608
|
+
end
|
609
|
+
end
|
610
|
+
|
611
|
+
def push_namespaces_restore
|
612
|
+
namespaces_restore = {}
|
613
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
614
|
+
namespaces_restore
|
615
|
+
end
|
616
|
+
|
617
|
+
def pop_namespaces_restore
|
618
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
619
|
+
namespaces_restore.each do |prefix, uri|
|
620
|
+
if uri.nil?
|
621
|
+
@namespaces.delete(prefix)
|
622
|
+
else
|
623
|
+
@namespaces[prefix] = uri
|
624
|
+
end
|
625
|
+
end
|
626
|
+
end
|
627
|
+
|
628
|
+
def record_entity_expansion(delta=1)
|
629
|
+
@entity_expansion_count += delta
|
630
|
+
if @entity_expansion_count > Security.entity_expansion_limit
|
631
|
+
raise "number of entity expansions exceeded, processing aborted."
|
632
|
+
end
|
633
|
+
end
|
634
|
+
|
495
635
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
636
|
return false if xml_declaration_encoding.nil?
|
497
637
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +639,16 @@ module REXML
|
|
499
639
|
end
|
500
640
|
|
501
641
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
642
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
643
|
unless md
|
504
|
-
if @source.match(/\
|
644
|
+
if @source.match(/\S/um)
|
505
645
|
message = "#{base_error_message}: invalid name"
|
506
646
|
else
|
507
647
|
message = "#{base_error_message}: name is missing"
|
508
648
|
end
|
509
649
|
raise REXML::ParseException.new(message, @source)
|
510
650
|
end
|
511
|
-
md[
|
651
|
+
md[0]
|
512
652
|
end
|
513
653
|
|
514
654
|
def parse_id(base_error_message,
|
@@ -578,96 +718,114 @@ module REXML
|
|
578
718
|
end
|
579
719
|
|
580
720
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
721
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
722
|
+
if @source.match(/\s+/um, true)
|
723
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
724
|
+
unless match_data
|
725
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
726
|
+
end
|
727
|
+
content = match_data[1]
|
728
|
+
else
|
729
|
+
content = nil
|
730
|
+
unless @source.match("?>", true)
|
731
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
732
|
+
end
|
733
|
+
end
|
734
|
+
if name == "xml"
|
735
|
+
if @document_status
|
736
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
737
|
+
end
|
738
|
+
version = VERSION.match(content)
|
739
|
+
version = version[1] unless version.nil?
|
740
|
+
encoding = ENCODING.match(content)
|
741
|
+
encoding = encoding[1] unless encoding.nil?
|
742
|
+
if need_source_encoding_update?(encoding)
|
743
|
+
@source.encoding = encoding
|
744
|
+
end
|
745
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
746
|
+
encoding = "UTF-16"
|
747
|
+
end
|
748
|
+
standalone = STANDALONE.match(content)
|
749
|
+
standalone = standalone[1] unless standalone.nil?
|
750
|
+
return [ :xmldecl, version, encoding, standalone ]
|
585
751
|
end
|
586
|
-
[:processing_instruction,
|
752
|
+
[:processing_instruction, name, content]
|
587
753
|
end
|
588
754
|
|
589
|
-
def parse_attributes(prefixes
|
755
|
+
def parse_attributes(prefixes)
|
590
756
|
attributes = {}
|
757
|
+
expanded_names = {}
|
591
758
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
759
|
+
while true
|
760
|
+
if @source.match(">", true)
|
761
|
+
return attributes, closed
|
762
|
+
elsif @source.match("/>", true)
|
763
|
+
closed = true
|
764
|
+
return attributes, closed
|
765
|
+
elsif match = @source.match(QNAME, true)
|
766
|
+
name = match[1]
|
767
|
+
prefix = match[2]
|
768
|
+
local_part = match[3]
|
769
|
+
|
770
|
+
unless @source.match(/\s*=\s*/um, true)
|
618
771
|
message = "Missing attribute equal: <#{name}>"
|
619
772
|
raise REXML::ParseException.new(message, @source)
|
620
773
|
end
|
621
|
-
|
622
|
-
unless quote
|
774
|
+
unless match = @source.match(/(['"])/, true)
|
623
775
|
message = "Missing attribute value start quote: <#{name}>"
|
624
776
|
raise REXML::ParseException.new(message, @source)
|
625
777
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
778
|
+
quote = match[1]
|
779
|
+
start_position = @source.position
|
780
|
+
value = @source.read_until(quote)
|
781
|
+
unless value.chomp!(quote)
|
782
|
+
@source.position = start_position
|
783
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
784
|
raise REXML::ParseException.new(message, @source)
|
639
785
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
786
|
+
@source.match(/\s*/um, true)
|
787
|
+
if prefix == "xmlns"
|
788
|
+
if local_part == "xml"
|
789
|
+
if value != "http://www.w3.org/XML/1998/namespace"
|
790
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
791
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
792
|
+
raise REXML::ParseException.new( msg, @source, self )
|
793
|
+
end
|
794
|
+
elsif local_part == "xmlns"
|
795
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
796
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
797
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
798
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
799
|
+
add_namespace(local_part, value)
|
800
|
+
elsif prefix
|
801
|
+
prefixes << prefix unless prefix == "xml"
|
657
802
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
803
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
804
|
+
if attributes[name]
|
805
|
+
msg = "Duplicate attribute #{name.inspect}"
|
806
|
+
raise REXML::ParseException.new(msg, @source, self)
|
807
|
+
end
|
667
808
|
|
668
|
-
|
809
|
+
unless prefix == "xmlns"
|
810
|
+
uri = @namespaces[prefix]
|
811
|
+
expanded_name = [uri, local_part]
|
812
|
+
existing_prefix = expanded_names[expanded_name]
|
813
|
+
if existing_prefix
|
814
|
+
message = "Namespace conflict in adding attribute " +
|
815
|
+
"\"#{local_part}\": " +
|
816
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
817
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
818
|
+
raise REXML::ParseException.new(message, @source, self)
|
819
|
+
end
|
820
|
+
expanded_names[expanded_name] = prefix
|
821
|
+
end
|
822
|
+
|
823
|
+
attributes[name] = value
|
824
|
+
else
|
825
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
826
|
+
raise REXML::ParseException.new(message, @source)
|
827
|
+
end
|
669
828
|
end
|
670
|
-
return attributes, closed
|
671
829
|
end
|
672
830
|
end
|
673
831
|
end
|