rexml 3.2.5 → 3.3.8
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/NEWS.md +449 -2
- data/README.md +10 -1
- data/doc/rexml/tasks/rdoc/element.rdoc +2 -2
- data/doc/rexml/tutorial.rdoc +1358 -0
- data/lib/rexml/attribute.rb +17 -11
- data/lib/rexml/document.rb +6 -2
- data/lib/rexml/element.rb +19 -34
- data/lib/rexml/entity.rb +9 -38
- data/lib/rexml/formatters/pretty.rb +3 -3
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/namespace.rb +8 -4
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +426 -263
- data/lib/rexml/parsers/pullparser.rb +12 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/parsers/xpathparser.rb +136 -86
- data/lib/rexml/rexml.rb +3 -1
- data/lib/rexml/source.rb +128 -98
- data/lib/rexml/text.rb +45 -21
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +10 -52
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,33 @@ module REXML
|
|
112
140
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
118
170
|
end
|
119
171
|
|
120
172
|
def add_listener( listener )
|
@@ -122,15 +174,20 @@ module REXML
|
|
122
174
|
end
|
123
175
|
|
124
176
|
attr_reader :source
|
177
|
+
attr_reader :entity_expansion_count
|
178
|
+
attr_writer :entity_expansion_limit
|
179
|
+
attr_writer :entity_expansion_text_limit
|
125
180
|
|
126
181
|
def stream=( source )
|
127
182
|
@source = SourceFactory.create_from( source )
|
128
183
|
@closed = nil
|
184
|
+
@have_root = false
|
129
185
|
@document_status = nil
|
130
186
|
@tags = []
|
131
187
|
@stack = []
|
132
188
|
@entities = []
|
133
|
-
@
|
189
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
190
|
+
@namespaces_restore_stack = []
|
134
191
|
end
|
135
192
|
|
136
193
|
def position
|
@@ -180,6 +237,8 @@ module REXML
|
|
180
237
|
|
181
238
|
# Returns the next event. This is a +PullEvent+ object.
|
182
239
|
def pull
|
240
|
+
@source.drop_parsed_content
|
241
|
+
|
183
242
|
pull_event.tap do |event|
|
184
243
|
@listeners.each do |listener|
|
185
244
|
listener.receive event
|
@@ -192,236 +251,274 @@ module REXML
|
|
192
251
|
x, @closed = @closed, nil
|
193
252
|
return [ :end_element, x ]
|
194
253
|
end
|
195
|
-
|
254
|
+
if empty?
|
255
|
+
if @document_status == :in_doctype
|
256
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
257
|
+
end
|
258
|
+
unless @tags.empty?
|
259
|
+
path = "/" + @tags.join("/")
|
260
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
261
|
+
end
|
262
|
+
return [ :end_document ]
|
263
|
+
end
|
196
264
|
return @stack.shift if @stack.size > 0
|
197
265
|
#STDERR.puts @source.encoding
|
198
266
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
267
|
+
|
268
|
+
@source.ensure_buffer
|
199
269
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
270
|
+
start_position = @source.position
|
271
|
+
if @source.match("<?", true)
|
223
272
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
273
|
+
elsif @source.match("<!", true)
|
274
|
+
if @source.match("--", true)
|
275
|
+
md = @source.match(/(.*?)-->/um, true)
|
276
|
+
if md.nil?
|
277
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
242
278
|
end
|
243
|
-
if
|
279
|
+
if /--|-\z/.match?(md[1])
|
280
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
281
|
+
end
|
282
|
+
return [ :comment, md[1] ]
|
283
|
+
elsif @source.match("DOCTYPE", true)
|
284
|
+
base_error_message = "Malformed DOCTYPE"
|
285
|
+
unless @source.match(/\s+/um, true)
|
286
|
+
if @source.match(">")
|
287
|
+
message = "#{base_error_message}: name is missing"
|
288
|
+
else
|
289
|
+
message = "#{base_error_message}: invalid name"
|
290
|
+
end
|
291
|
+
@source.position = start_position
|
292
|
+
raise REXML::ParseException.new(message, @source)
|
293
|
+
end
|
294
|
+
name = parse_name(base_error_message)
|
295
|
+
if @source.match(/\s*\[/um, true)
|
296
|
+
id = [nil, nil, nil]
|
244
297
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
298
|
+
elsif @source.match(/\s*>/um, true)
|
299
|
+
id = [nil, nil, nil]
|
246
300
|
@document_status = :after_doctype
|
301
|
+
@source.ensure_buffer
|
247
302
|
else
|
248
|
-
|
249
|
-
|
303
|
+
id = parse_id(base_error_message,
|
304
|
+
accept_external_id: true,
|
305
|
+
accept_public_id: false)
|
306
|
+
if id[0] == "SYSTEM"
|
307
|
+
# For backward compatibility
|
308
|
+
id[1], id[2] = id[2], nil
|
309
|
+
end
|
310
|
+
if @source.match(/\s*\[/um, true)
|
311
|
+
@document_status = :in_doctype
|
312
|
+
elsif @source.match(/\s*>/um, true)
|
313
|
+
@document_status = :after_doctype
|
314
|
+
@source.ensure_buffer
|
315
|
+
else
|
316
|
+
message = "#{base_error_message}: garbage after external ID"
|
317
|
+
raise REXML::ParseException.new(message, @source)
|
318
|
+
end
|
250
319
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
320
|
+
args = [:start_doctype, name, *id]
|
321
|
+
if @document_status == :after_doctype
|
322
|
+
@source.match(/\s*/um, true)
|
323
|
+
@stack << [ :end_doctype ]
|
324
|
+
end
|
325
|
+
return args
|
326
|
+
else
|
327
|
+
message = "Invalid XML"
|
328
|
+
raise REXML::ParseException.new(message, @source)
|
263
329
|
end
|
264
330
|
end
|
265
331
|
end
|
266
332
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
333
|
+
@source.match(/\s*/um, true) # skip spaces
|
334
|
+
start_position = @source.position
|
335
|
+
if @source.match("<!", true)
|
336
|
+
if @source.match("ELEMENT", true)
|
337
|
+
md = @source.match(/(.*?)>/um, true)
|
338
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
339
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
340
|
+
elsif @source.match("ENTITY", true)
|
341
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
342
|
+
unless match_data
|
343
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
344
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
345
|
+
match = [:entitydecl, *match_data.captures.compact]
|
346
|
+
ref = false
|
347
|
+
if match[1] == '%'
|
348
|
+
ref = true
|
349
|
+
match.delete_at 1
|
350
|
+
end
|
351
|
+
# Now we have to sort out what kind of entity reference this is
|
352
|
+
if match[2] == 'SYSTEM'
|
353
|
+
# External reference
|
354
|
+
match[3] = match[3][1..-2] # PUBID
|
355
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
356
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
357
|
+
elsif match[2] == 'PUBLIC'
|
358
|
+
# External reference
|
359
|
+
match[3] = match[3][1..-2] # PUBID
|
360
|
+
match[4] = match[4][1..-2] # HREF
|
361
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
362
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
363
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
364
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
365
|
else
|
329
|
-
|
366
|
+
match[2] = match[2][1..-2]
|
367
|
+
match.pop if match.size == 4
|
368
|
+
# match is [ :entity, name, value ]
|
330
369
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
370
|
+
match << '%' if ref
|
371
|
+
return match
|
372
|
+
elsif @source.match("ATTLIST", true)
|
373
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
374
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
375
|
+
element = md[1]
|
376
|
+
contents = md[0]
|
377
|
+
|
378
|
+
pairs = {}
|
379
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
380
|
+
values.each do |attdef|
|
381
|
+
unless attdef[3] == "#IMPLIED"
|
382
|
+
attdef.compact!
|
383
|
+
val = attdef[3]
|
384
|
+
val = attdef[4] if val == "#FIXED "
|
385
|
+
pairs[attdef[0]] = val
|
386
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
387
|
+
@namespaces[$1] = val
|
388
|
+
end
|
389
|
+
end
|
390
|
+
end
|
391
|
+
return [ :attlistdecl, element, pairs, contents ]
|
392
|
+
elsif @source.match("NOTATION", true)
|
393
|
+
base_error_message = "Malformed notation declaration"
|
394
|
+
unless @source.match(/\s+/um, true)
|
395
|
+
if @source.match(">")
|
396
|
+
message = "#{base_error_message}: name is missing"
|
397
|
+
else
|
398
|
+
message = "#{base_error_message}: invalid name"
|
399
|
+
end
|
400
|
+
@source.position = start_position
|
401
|
+
raise REXML::ParseException.new(message, @source)
|
402
|
+
end
|
403
|
+
name = parse_name(base_error_message)
|
404
|
+
id = parse_id(base_error_message,
|
405
|
+
accept_external_id: true,
|
406
|
+
accept_public_id: true)
|
407
|
+
unless @source.match(/\s*>/um, true)
|
408
|
+
message = "#{base_error_message}: garbage before end >"
|
409
|
+
raise REXML::ParseException.new(message, @source)
|
410
|
+
end
|
411
|
+
return [:notationdecl, name, *id]
|
412
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
413
|
+
case md[1]
|
414
|
+
when /--/, /-\z/
|
415
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
416
|
+
end
|
417
|
+
return [ :comment, md[1] ] if md
|
340
418
|
end
|
341
|
-
|
342
|
-
|
419
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
420
|
+
return [ :externalentity, match[1] ]
|
421
|
+
elsif @source.match(/\]\s*>/um, true)
|
343
422
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
423
|
return [ :end_doctype ]
|
346
424
|
end
|
425
|
+
if @document_status == :in_doctype
|
426
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
427
|
+
end
|
347
428
|
end
|
348
429
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
430
|
+
@source.match(/\s*/um, true)
|
350
431
|
end
|
351
432
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
433
|
+
start_position = @source.position
|
434
|
+
if @source.match("<", true)
|
435
|
+
# :text's read_until may remain only "<" in buffer. In the
|
436
|
+
# case, buffer is empty here. So we need to fill buffer
|
437
|
+
# here explicitly.
|
438
|
+
@source.ensure_buffer
|
439
|
+
if @source.match("/", true)
|
440
|
+
@namespaces_restore_stack.pop
|
356
441
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
442
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
443
|
if md and !last_tag
|
359
444
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
445
|
raise REXML::ParseException.new(message, @source)
|
361
446
|
end
|
362
447
|
if md.nil? or last_tag != md[1]
|
363
448
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
449
|
+
message += " (got '#{md[1]}')" if md
|
450
|
+
@source.position = start_position if md.nil?
|
365
451
|
raise REXML::ParseException.new(message, @source)
|
366
452
|
end
|
367
453
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
454
|
+
elsif @source.match("!", true)
|
455
|
+
md = @source.match(/([^>]*>)/um)
|
370
456
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
457
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
458
|
+
if md[0][0] == ?-
|
459
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
460
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
461
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
462
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
463
|
end
|
379
464
|
|
380
|
-
return [ :comment, md[1] ]
|
465
|
+
return [ :comment, md[1] ]
|
381
466
|
else
|
382
|
-
md = @source.match(
|
467
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
468
|
return [ :cdata, md[1] ] if md
|
384
469
|
end
|
385
470
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
471
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
472
|
+
elsif @source.match("?", true)
|
388
473
|
return process_instruction
|
389
474
|
else
|
390
475
|
# Get the next tag
|
391
|
-
md = @source.match(
|
476
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
477
|
unless md
|
478
|
+
@source.position = start_position
|
393
479
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
480
|
end
|
481
|
+
tag = md[1]
|
395
482
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
483
|
+
@prefixes.clear
|
484
|
+
@prefixes << md[2] if md[2]
|
485
|
+
push_namespaces_restore
|
486
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
487
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
488
|
+
for prefix in @prefixes
|
489
|
+
unless @namespaces.key?(prefix)
|
403
490
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
491
|
end
|
405
492
|
end
|
406
493
|
|
407
494
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
495
|
+
@closed = tag
|
496
|
+
pop_namespaces_restore
|
410
497
|
else
|
411
|
-
@tags.
|
498
|
+
if @tags.empty? and @have_root
|
499
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
500
|
+
end
|
501
|
+
@tags.push( tag )
|
412
502
|
end
|
413
|
-
|
503
|
+
@have_root = true
|
504
|
+
return [ :start_element, tag, attributes ]
|
414
505
|
end
|
415
506
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
507
|
+
text = @source.read_until("<")
|
508
|
+
if text.chomp!("<")
|
509
|
+
@source.position -= "<".bytesize
|
419
510
|
end
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
511
|
+
if @tags.empty?
|
512
|
+
unless /\A\s*\z/.match?(text)
|
513
|
+
if @have_root
|
514
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
515
|
+
else
|
516
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
517
|
+
end
|
518
|
+
end
|
519
|
+
return pull_event if @have_root
|
520
|
+
end
|
521
|
+
return [ :text, text ]
|
425
522
|
end
|
426
523
|
rescue REXML::UndefinedNamespaceException
|
427
524
|
raise
|
@@ -436,13 +533,13 @@ module REXML
|
|
436
533
|
private :pull_event
|
437
534
|
|
438
535
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
536
|
+
return unless entities
|
537
|
+
|
538
|
+
value = entities[ reference ]
|
539
|
+
return if value.nil?
|
540
|
+
|
541
|
+
record_entity_expansion
|
542
|
+
unnormalize( value, entities )
|
446
543
|
end
|
447
544
|
|
448
545
|
# Escapes all possible entities
|
@@ -463,35 +560,83 @@ module REXML
|
|
463
560
|
|
464
561
|
# Unescapes all possible entities
|
465
562
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
563
|
+
if string.include?("\r")
|
564
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
565
|
+
else
|
566
|
+
rv = string.dup
|
567
|
+
end
|
468
568
|
matches = rv.scan( REFERENCE_RE )
|
469
569
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
570
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
571
|
m=$1
|
472
572
|
m = "0#{m}" if m[0] == ?x
|
473
573
|
[Integer(m)].pack('U*')
|
474
574
|
}
|
475
575
|
matches.collect!{|x|x[0]}.compact!
|
576
|
+
if filter
|
577
|
+
matches.reject! do |entity_reference|
|
578
|
+
filter.include?(entity_reference)
|
579
|
+
end
|
580
|
+
end
|
476
581
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
582
|
+
matches.tally.each do |entity_reference, n|
|
583
|
+
entity_expansion_count_before = @entity_expansion_count
|
584
|
+
entity_value = entity( entity_reference, entities )
|
585
|
+
if entity_value
|
586
|
+
if n > 1
|
587
|
+
entity_expansion_count_delta =
|
588
|
+
@entity_expansion_count - entity_expansion_count_before
|
589
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
590
|
+
end
|
591
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
592
|
+
rv.gsub!( re, entity_value )
|
593
|
+
if rv.bytesize > @entity_expansion_text_limit
|
594
|
+
raise "entity expansion has grown too large"
|
486
595
|
end
|
596
|
+
else
|
597
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
598
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
599
|
end
|
488
600
|
end
|
489
|
-
rv.gsub!(
|
601
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
602
|
end
|
491
603
|
rv
|
492
604
|
end
|
493
605
|
|
494
606
|
private
|
607
|
+
def add_namespace(prefix, uri)
|
608
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
609
|
+
if uri.nil?
|
610
|
+
@namespaces.delete(prefix)
|
611
|
+
else
|
612
|
+
@namespaces[prefix] = uri
|
613
|
+
end
|
614
|
+
end
|
615
|
+
|
616
|
+
def push_namespaces_restore
|
617
|
+
namespaces_restore = {}
|
618
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
619
|
+
namespaces_restore
|
620
|
+
end
|
621
|
+
|
622
|
+
def pop_namespaces_restore
|
623
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
624
|
+
namespaces_restore.each do |prefix, uri|
|
625
|
+
if uri.nil?
|
626
|
+
@namespaces.delete(prefix)
|
627
|
+
else
|
628
|
+
@namespaces[prefix] = uri
|
629
|
+
end
|
630
|
+
end
|
631
|
+
end
|
632
|
+
|
633
|
+
def record_entity_expansion(delta=1)
|
634
|
+
@entity_expansion_count += delta
|
635
|
+
if @entity_expansion_count > @entity_expansion_limit
|
636
|
+
raise "number of entity expansions exceeded, processing aborted."
|
637
|
+
end
|
638
|
+
end
|
639
|
+
|
495
640
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
641
|
return false if xml_declaration_encoding.nil?
|
497
642
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +644,16 @@ module REXML
|
|
499
644
|
end
|
500
645
|
|
501
646
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
647
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
648
|
unless md
|
504
|
-
if @source.match(/\
|
649
|
+
if @source.match(/\S/um)
|
505
650
|
message = "#{base_error_message}: invalid name"
|
506
651
|
else
|
507
652
|
message = "#{base_error_message}: name is missing"
|
508
653
|
end
|
509
654
|
raise REXML::ParseException.new(message, @source)
|
510
655
|
end
|
511
|
-
md[
|
656
|
+
md[0]
|
512
657
|
end
|
513
658
|
|
514
659
|
def parse_id(base_error_message,
|
@@ -578,96 +723,114 @@ module REXML
|
|
578
723
|
end
|
579
724
|
|
580
725
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
726
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
727
|
+
if @source.match(/\s+/um, true)
|
728
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
729
|
+
unless match_data
|
730
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
731
|
+
end
|
732
|
+
content = match_data[1]
|
733
|
+
else
|
734
|
+
content = nil
|
735
|
+
unless @source.match("?>", true)
|
736
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
737
|
+
end
|
738
|
+
end
|
739
|
+
if name == "xml"
|
740
|
+
if @document_status
|
741
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
742
|
+
end
|
743
|
+
version = VERSION.match(content)
|
744
|
+
version = version[1] unless version.nil?
|
745
|
+
encoding = ENCODING.match(content)
|
746
|
+
encoding = encoding[1] unless encoding.nil?
|
747
|
+
if need_source_encoding_update?(encoding)
|
748
|
+
@source.encoding = encoding
|
749
|
+
end
|
750
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
751
|
+
encoding = "UTF-16"
|
752
|
+
end
|
753
|
+
standalone = STANDALONE.match(content)
|
754
|
+
standalone = standalone[1] unless standalone.nil?
|
755
|
+
return [ :xmldecl, version, encoding, standalone ]
|
585
756
|
end
|
586
|
-
[:processing_instruction,
|
757
|
+
[:processing_instruction, name, content]
|
587
758
|
end
|
588
759
|
|
589
|
-
def parse_attributes(prefixes
|
760
|
+
def parse_attributes(prefixes)
|
590
761
|
attributes = {}
|
762
|
+
expanded_names = {}
|
591
763
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
764
|
+
while true
|
765
|
+
if @source.match(">", true)
|
766
|
+
return attributes, closed
|
767
|
+
elsif @source.match("/>", true)
|
768
|
+
closed = true
|
769
|
+
return attributes, closed
|
770
|
+
elsif match = @source.match(QNAME, true)
|
771
|
+
name = match[1]
|
772
|
+
prefix = match[2]
|
773
|
+
local_part = match[3]
|
774
|
+
|
775
|
+
unless @source.match(/\s*=\s*/um, true)
|
618
776
|
message = "Missing attribute equal: <#{name}>"
|
619
777
|
raise REXML::ParseException.new(message, @source)
|
620
778
|
end
|
621
|
-
|
622
|
-
unless quote
|
779
|
+
unless match = @source.match(/(['"])/, true)
|
623
780
|
message = "Missing attribute value start quote: <#{name}>"
|
624
781
|
raise REXML::ParseException.new(message, @source)
|
625
782
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
783
|
+
quote = match[1]
|
784
|
+
start_position = @source.position
|
785
|
+
value = @source.read_until(quote)
|
786
|
+
unless value.chomp!(quote)
|
787
|
+
@source.position = start_position
|
788
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
789
|
raise REXML::ParseException.new(message, @source)
|
639
790
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
791
|
+
@source.match(/\s*/um, true)
|
792
|
+
if prefix == "xmlns"
|
793
|
+
if local_part == "xml"
|
794
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
795
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
796
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
797
|
+
raise REXML::ParseException.new( msg, @source, self )
|
798
|
+
end
|
799
|
+
elsif local_part == "xmlns"
|
800
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
801
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
802
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
803
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
804
|
+
add_namespace(local_part, value)
|
805
|
+
elsif prefix
|
806
|
+
prefixes << prefix unless prefix == "xml"
|
657
807
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
808
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
809
|
+
if attributes[name]
|
810
|
+
msg = "Duplicate attribute #{name.inspect}"
|
811
|
+
raise REXML::ParseException.new(msg, @source, self)
|
812
|
+
end
|
667
813
|
|
668
|
-
|
814
|
+
unless prefix == "xmlns"
|
815
|
+
uri = @namespaces[prefix]
|
816
|
+
expanded_name = [uri, local_part]
|
817
|
+
existing_prefix = expanded_names[expanded_name]
|
818
|
+
if existing_prefix
|
819
|
+
message = "Namespace conflict in adding attribute " +
|
820
|
+
"\"#{local_part}\": " +
|
821
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
822
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
823
|
+
raise REXML::ParseException.new(message, @source, self)
|
824
|
+
end
|
825
|
+
expanded_names[expanded_name] = prefix
|
826
|
+
end
|
827
|
+
|
828
|
+
attributes[name] = value
|
829
|
+
else
|
830
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
831
|
+
raise REXML::ParseException.new(message, @source)
|
832
|
+
end
|
669
833
|
end
|
670
|
-
return attributes, closed
|
671
834
|
end
|
672
835
|
end
|
673
836
|
end
|