rexml 3.2.6 → 3.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS.md +399 -0
- data/lib/rexml/attribute.rb +3 -2
- data/lib/rexml/document.rb +5 -1
- data/lib/rexml/element.rb +16 -31
- data/lib/rexml/entity.rb +9 -48
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +446 -274
- data/lib/rexml/parsers/pullparser.rb +16 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +171 -100
- data/lib/rexml/text.rb +54 -57
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +6 -47
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,34 @@ module REXML
|
|
112
140
|
"apos" => [/'/, "'", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
170
|
+
@source.ensure_buffer
|
118
171
|
end
|
119
172
|
|
120
173
|
def add_listener( listener )
|
@@ -122,15 +175,24 @@ module REXML
|
|
122
175
|
end
|
123
176
|
|
124
177
|
attr_reader :source
|
178
|
+
attr_reader :entity_expansion_count
|
179
|
+
attr_writer :entity_expansion_limit
|
180
|
+
attr_writer :entity_expansion_text_limit
|
125
181
|
|
126
182
|
def stream=( source )
|
127
183
|
@source = SourceFactory.create_from( source )
|
184
|
+
reset
|
185
|
+
end
|
186
|
+
|
187
|
+
def reset
|
128
188
|
@closed = nil
|
189
|
+
@have_root = false
|
129
190
|
@document_status = nil
|
130
191
|
@tags = []
|
131
192
|
@stack = []
|
132
193
|
@entities = []
|
133
|
-
@
|
194
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
195
|
+
@namespaces_restore_stack = []
|
134
196
|
end
|
135
197
|
|
136
198
|
def position
|
@@ -180,6 +242,8 @@ module REXML
|
|
180
242
|
|
181
243
|
# Returns the next event. This is a +PullEvent+ object.
|
182
244
|
def pull
|
245
|
+
@source.drop_parsed_content
|
246
|
+
|
183
247
|
pull_event.tap do |event|
|
184
248
|
@listeners.each do |listener|
|
185
249
|
listener.receive event
|
@@ -192,236 +256,274 @@ module REXML
|
|
192
256
|
x, @closed = @closed, nil
|
193
257
|
return [ :end_element, x ]
|
194
258
|
end
|
195
|
-
|
259
|
+
if empty?
|
260
|
+
if @document_status == :in_doctype
|
261
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
262
|
+
end
|
263
|
+
unless @tags.empty?
|
264
|
+
path = "/" + @tags.join("/")
|
265
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
266
|
+
end
|
267
|
+
return [ :end_document ]
|
268
|
+
end
|
196
269
|
return @stack.shift if @stack.size > 0
|
197
270
|
#STDERR.puts @source.encoding
|
198
271
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
272
|
+
|
273
|
+
@source.ensure_buffer
|
199
274
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
275
|
+
start_position = @source.position
|
276
|
+
if @source.match?("<?", true)
|
223
277
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
278
|
+
elsif @source.match?("<!", true)
|
279
|
+
if @source.match?("--", true)
|
280
|
+
md = @source.match(/(.*?)-->/um, true)
|
281
|
+
if md.nil?
|
282
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
283
|
+
end
|
284
|
+
if /--|-\z/.match?(md[1])
|
285
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
286
|
+
end
|
287
|
+
return [ :comment, md[1] ]
|
288
|
+
elsif @source.match?("DOCTYPE", true)
|
289
|
+
base_error_message = "Malformed DOCTYPE"
|
290
|
+
unless @source.match?(/\s+/um, true)
|
291
|
+
if @source.match?(">")
|
292
|
+
message = "#{base_error_message}: name is missing"
|
293
|
+
else
|
294
|
+
message = "#{base_error_message}: invalid name"
|
295
|
+
end
|
296
|
+
@source.position = start_position
|
297
|
+
raise REXML::ParseException.new(message, @source)
|
242
298
|
end
|
243
|
-
|
299
|
+
name = parse_name(base_error_message)
|
300
|
+
if @source.match?(/\s*\[/um, true)
|
301
|
+
id = [nil, nil, nil]
|
244
302
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
303
|
+
elsif @source.match?(/\s*>/um, true)
|
304
|
+
id = [nil, nil, nil]
|
246
305
|
@document_status = :after_doctype
|
306
|
+
@source.ensure_buffer
|
247
307
|
else
|
248
|
-
|
249
|
-
|
308
|
+
id = parse_id(base_error_message,
|
309
|
+
accept_external_id: true,
|
310
|
+
accept_public_id: false)
|
311
|
+
if id[0] == "SYSTEM"
|
312
|
+
# For backward compatibility
|
313
|
+
id[1], id[2] = id[2], nil
|
314
|
+
end
|
315
|
+
if @source.match?(/\s*\[/um, true)
|
316
|
+
@document_status = :in_doctype
|
317
|
+
elsif @source.match?(/\s*>/um, true)
|
318
|
+
@document_status = :after_doctype
|
319
|
+
@source.ensure_buffer
|
320
|
+
else
|
321
|
+
message = "#{base_error_message}: garbage after external ID"
|
322
|
+
raise REXML::ParseException.new(message, @source)
|
323
|
+
end
|
250
324
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
325
|
+
args = [:start_doctype, name, *id]
|
326
|
+
if @document_status == :after_doctype
|
327
|
+
@source.match?(/\s*/um, true)
|
328
|
+
@stack << [ :end_doctype ]
|
329
|
+
end
|
330
|
+
return args
|
331
|
+
else
|
332
|
+
message = "Invalid XML"
|
333
|
+
raise REXML::ParseException.new(message, @source)
|
263
334
|
end
|
264
335
|
end
|
265
336
|
end
|
266
337
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
338
|
+
@source.match?(/\s*/um, true) # skip spaces
|
339
|
+
start_position = @source.position
|
340
|
+
if @source.match?("<!", true)
|
341
|
+
if @source.match?("ELEMENT", true)
|
342
|
+
md = @source.match(/(.*?)>/um, true)
|
343
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
344
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
345
|
+
elsif @source.match?("ENTITY", true)
|
346
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
347
|
+
unless match_data
|
348
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
349
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
350
|
+
match = [:entitydecl, *match_data.captures.compact]
|
351
|
+
ref = false
|
352
|
+
if match[1] == '%'
|
353
|
+
ref = true
|
354
|
+
match.delete_at 1
|
355
|
+
end
|
356
|
+
# Now we have to sort out what kind of entity reference this is
|
357
|
+
if match[2] == 'SYSTEM'
|
358
|
+
# External reference
|
359
|
+
match[3] = match[3][1..-2] # PUBID
|
360
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
361
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
362
|
+
elsif match[2] == 'PUBLIC'
|
363
|
+
# External reference
|
364
|
+
match[3] = match[3][1..-2] # PUBID
|
365
|
+
match[4] = match[4][1..-2] # HREF
|
366
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
367
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
368
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
369
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
370
|
else
|
329
|
-
|
371
|
+
match[2] = match[2][1..-2]
|
372
|
+
match.pop if match.size == 4
|
373
|
+
# match is [ :entity, name, value ]
|
330
374
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
375
|
+
match << '%' if ref
|
376
|
+
return match
|
377
|
+
elsif @source.match?("ATTLIST", true)
|
378
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
379
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
380
|
+
element = md[1]
|
381
|
+
contents = md[0]
|
382
|
+
|
383
|
+
pairs = {}
|
384
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
385
|
+
values.each do |attdef|
|
386
|
+
unless attdef[3] == "#IMPLIED"
|
387
|
+
attdef.compact!
|
388
|
+
val = attdef[3]
|
389
|
+
val = attdef[4] if val == "#FIXED "
|
390
|
+
pairs[attdef[0]] = val
|
391
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
392
|
+
@namespaces[$1] = val
|
393
|
+
end
|
394
|
+
end
|
395
|
+
end
|
396
|
+
return [ :attlistdecl, element, pairs, contents ]
|
397
|
+
elsif @source.match?("NOTATION", true)
|
398
|
+
base_error_message = "Malformed notation declaration"
|
399
|
+
unless @source.match?(/\s+/um, true)
|
400
|
+
if @source.match?(">")
|
401
|
+
message = "#{base_error_message}: name is missing"
|
402
|
+
else
|
403
|
+
message = "#{base_error_message}: invalid name"
|
404
|
+
end
|
405
|
+
@source.position = start_position
|
406
|
+
raise REXML::ParseException.new(message, @source)
|
407
|
+
end
|
408
|
+
name = parse_name(base_error_message)
|
409
|
+
id = parse_id(base_error_message,
|
410
|
+
accept_external_id: true,
|
411
|
+
accept_public_id: true)
|
412
|
+
unless @source.match?(/\s*>/um, true)
|
413
|
+
message = "#{base_error_message}: garbage before end >"
|
414
|
+
raise REXML::ParseException.new(message, @source)
|
415
|
+
end
|
416
|
+
return [:notationdecl, name, *id]
|
417
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
418
|
+
case md[1]
|
419
|
+
when /--/, /-\z/
|
420
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
421
|
+
end
|
422
|
+
return [ :comment, md[1] ] if md
|
340
423
|
end
|
341
|
-
|
342
|
-
|
424
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
425
|
+
return [ :externalentity, match[1] ]
|
426
|
+
elsif @source.match?(/\]\s*>/um, true)
|
343
427
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
428
|
return [ :end_doctype ]
|
346
429
|
end
|
430
|
+
if @document_status == :in_doctype
|
431
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
432
|
+
end
|
347
433
|
end
|
348
434
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
435
|
+
@source.match?(/\s*/um, true)
|
350
436
|
end
|
351
437
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
438
|
+
start_position = @source.position
|
439
|
+
if @source.match?("<", true)
|
440
|
+
# :text's read_until may remain only "<" in buffer. In the
|
441
|
+
# case, buffer is empty here. So we need to fill buffer
|
442
|
+
# here explicitly.
|
443
|
+
@source.ensure_buffer
|
444
|
+
if @source.match?("/", true)
|
445
|
+
@namespaces_restore_stack.pop
|
356
446
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
447
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
448
|
if md and !last_tag
|
359
449
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
450
|
raise REXML::ParseException.new(message, @source)
|
361
451
|
end
|
362
452
|
if md.nil? or last_tag != md[1]
|
363
453
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
454
|
+
message += " (got '#{md[1]}')" if md
|
455
|
+
@source.position = start_position if md.nil?
|
365
456
|
raise REXML::ParseException.new(message, @source)
|
366
457
|
end
|
367
458
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
459
|
+
elsif @source.match?("!", true)
|
460
|
+
md = @source.match(/([^>]*>)/um)
|
370
461
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
462
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
463
|
+
if md[0][0] == ?-
|
464
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
465
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
466
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
467
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
468
|
end
|
379
469
|
|
380
|
-
return [ :comment, md[1] ]
|
470
|
+
return [ :comment, md[1] ]
|
381
471
|
else
|
382
|
-
md = @source.match(
|
472
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
473
|
return [ :cdata, md[1] ] if md
|
384
474
|
end
|
385
475
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
476
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
477
|
+
elsif @source.match?("?", true)
|
388
478
|
return process_instruction
|
389
479
|
else
|
390
480
|
# Get the next tag
|
391
|
-
md = @source.match(
|
481
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
482
|
unless md
|
483
|
+
@source.position = start_position
|
393
484
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
485
|
end
|
486
|
+
tag = md[1]
|
395
487
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
488
|
+
@prefixes.clear
|
489
|
+
@prefixes << md[2] if md[2]
|
490
|
+
push_namespaces_restore
|
491
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
492
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
493
|
+
for prefix in @prefixes
|
494
|
+
unless @namespaces.key?(prefix)
|
403
495
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
496
|
end
|
405
497
|
end
|
406
498
|
|
407
499
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
500
|
+
@closed = tag
|
501
|
+
pop_namespaces_restore
|
410
502
|
else
|
411
|
-
@tags.
|
503
|
+
if @tags.empty? and @have_root
|
504
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
505
|
+
end
|
506
|
+
@tags.push( tag )
|
412
507
|
end
|
413
|
-
|
508
|
+
@have_root = true
|
509
|
+
return [ :start_element, tag, attributes ]
|
414
510
|
end
|
415
511
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
512
|
+
text = @source.read_until("<")
|
513
|
+
if text.chomp!("<")
|
514
|
+
@source.position -= "<".bytesize
|
515
|
+
end
|
516
|
+
if @tags.empty?
|
517
|
+
unless /\A\s*\z/.match?(text)
|
518
|
+
if @have_root
|
519
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
520
|
+
else
|
521
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
522
|
+
end
|
523
|
+
end
|
524
|
+
return pull_event if @have_root
|
419
525
|
end
|
420
|
-
|
421
|
-
#return [ :text, "" ] if md[0].length == 0
|
422
|
-
# unnormalized = Text::unnormalize( md[1], self )
|
423
|
-
# return PullEvent.new( :text, md[1], unnormalized )
|
424
|
-
return [ :text, md[1] ]
|
526
|
+
return [ :text, text ]
|
425
527
|
end
|
426
528
|
rescue REXML::UndefinedNamespaceException
|
427
529
|
raise
|
@@ -436,13 +538,13 @@ module REXML
|
|
436
538
|
private :pull_event
|
437
539
|
|
438
540
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
541
|
+
return unless entities
|
542
|
+
|
543
|
+
value = entities[ reference ]
|
544
|
+
return if value.nil?
|
545
|
+
|
546
|
+
record_entity_expansion
|
547
|
+
unnormalize( value, entities )
|
446
548
|
end
|
447
549
|
|
448
550
|
# Escapes all possible entities
|
@@ -463,35 +565,87 @@ module REXML
|
|
463
565
|
|
464
566
|
# Unescapes all possible entities
|
465
567
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
568
|
+
if string.include?("\r")
|
569
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
570
|
+
else
|
571
|
+
rv = string.dup
|
572
|
+
end
|
468
573
|
matches = rv.scan( REFERENCE_RE )
|
469
574
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
575
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
576
|
m=$1
|
472
|
-
|
473
|
-
|
577
|
+
if m.start_with?("x")
|
578
|
+
code_point = Integer(m[1..-1], 16)
|
579
|
+
else
|
580
|
+
code_point = Integer(m, 10)
|
581
|
+
end
|
582
|
+
[code_point].pack('U*')
|
474
583
|
}
|
475
584
|
matches.collect!{|x|x[0]}.compact!
|
585
|
+
if filter
|
586
|
+
matches.reject! do |entity_reference|
|
587
|
+
filter.include?(entity_reference)
|
588
|
+
end
|
589
|
+
end
|
476
590
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
rv.gsub!( er[0], er[2] ) if er
|
591
|
+
matches.tally.each do |entity_reference, n|
|
592
|
+
entity_expansion_count_before = @entity_expansion_count
|
593
|
+
entity_value = entity( entity_reference, entities )
|
594
|
+
if entity_value
|
595
|
+
if n > 1
|
596
|
+
entity_expansion_count_delta =
|
597
|
+
@entity_expansion_count - entity_expansion_count_before
|
598
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
486
599
|
end
|
600
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
601
|
+
rv.gsub!( re, entity_value )
|
602
|
+
if rv.bytesize > @entity_expansion_text_limit
|
603
|
+
raise "entity expansion has grown too large"
|
604
|
+
end
|
605
|
+
else
|
606
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
607
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
608
|
end
|
488
609
|
end
|
489
|
-
rv.gsub!(
|
610
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
611
|
end
|
491
612
|
rv
|
492
613
|
end
|
493
614
|
|
494
615
|
private
|
616
|
+
def add_namespace(prefix, uri)
|
617
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
618
|
+
if uri.nil?
|
619
|
+
@namespaces.delete(prefix)
|
620
|
+
else
|
621
|
+
@namespaces[prefix] = uri
|
622
|
+
end
|
623
|
+
end
|
624
|
+
|
625
|
+
def push_namespaces_restore
|
626
|
+
namespaces_restore = {}
|
627
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
628
|
+
namespaces_restore
|
629
|
+
end
|
630
|
+
|
631
|
+
def pop_namespaces_restore
|
632
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
633
|
+
namespaces_restore.each do |prefix, uri|
|
634
|
+
if uri.nil?
|
635
|
+
@namespaces.delete(prefix)
|
636
|
+
else
|
637
|
+
@namespaces[prefix] = uri
|
638
|
+
end
|
639
|
+
end
|
640
|
+
end
|
641
|
+
|
642
|
+
def record_entity_expansion(delta=1)
|
643
|
+
@entity_expansion_count += delta
|
644
|
+
if @entity_expansion_count > @entity_expansion_limit
|
645
|
+
raise "number of entity expansions exceeded, processing aborted."
|
646
|
+
end
|
647
|
+
end
|
648
|
+
|
495
649
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
650
|
return false if xml_declaration_encoding.nil?
|
497
651
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +653,16 @@ module REXML
|
|
499
653
|
end
|
500
654
|
|
501
655
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
656
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
657
|
unless md
|
504
|
-
if @source.match(/\
|
658
|
+
if @source.match?(/\S/um)
|
505
659
|
message = "#{base_error_message}: invalid name"
|
506
660
|
else
|
507
661
|
message = "#{base_error_message}: name is missing"
|
508
662
|
end
|
509
663
|
raise REXML::ParseException.new(message, @source)
|
510
664
|
end
|
511
|
-
md[
|
665
|
+
md[0]
|
512
666
|
end
|
513
667
|
|
514
668
|
def parse_id(base_error_message,
|
@@ -543,34 +697,34 @@ module REXML
|
|
543
697
|
accept_public_id:)
|
544
698
|
public = /\A\s*PUBLIC/um
|
545
699
|
system = /\A\s*SYSTEM/um
|
546
|
-
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
|
547
|
-
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
700
|
+
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
|
701
|
+
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
548
702
|
return "public ID literal is missing"
|
549
703
|
end
|
550
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
|
704
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
|
551
705
|
return "invalid public ID literal"
|
552
706
|
end
|
553
707
|
if accept_public_id
|
554
|
-
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
708
|
+
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
555
709
|
return "system ID literal is missing"
|
556
710
|
end
|
557
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
711
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
558
712
|
return "invalid system literal"
|
559
713
|
end
|
560
714
|
"garbage after system literal"
|
561
715
|
else
|
562
716
|
"garbage after public ID literal"
|
563
717
|
end
|
564
|
-
elsif accept_external_id and @source.match(/#{system}/um)
|
565
|
-
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
718
|
+
elsif accept_external_id and @source.match?(/#{system}/um)
|
719
|
+
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
566
720
|
return "system literal is missing"
|
567
721
|
end
|
568
|
-
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
|
722
|
+
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
|
569
723
|
return "invalid system literal"
|
570
724
|
end
|
571
725
|
"garbage after system literal"
|
572
726
|
else
|
573
|
-
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
727
|
+
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
574
728
|
return "invalid ID type"
|
575
729
|
end
|
576
730
|
"ID type is missing"
|
@@ -578,96 +732,114 @@ module REXML
|
|
578
732
|
end
|
579
733
|
|
580
734
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
735
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
736
|
+
if @source.match?(/\s+/um, true)
|
737
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
738
|
+
unless match_data
|
739
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
740
|
+
end
|
741
|
+
content = match_data[1]
|
742
|
+
else
|
743
|
+
content = nil
|
744
|
+
unless @source.match?("?>", true)
|
745
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
746
|
+
end
|
585
747
|
end
|
586
|
-
|
748
|
+
if name == "xml"
|
749
|
+
if @document_status
|
750
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
751
|
+
end
|
752
|
+
version = VERSION.match(content)
|
753
|
+
version = version[1] unless version.nil?
|
754
|
+
encoding = ENCODING.match(content)
|
755
|
+
encoding = encoding[1] unless encoding.nil?
|
756
|
+
if need_source_encoding_update?(encoding)
|
757
|
+
@source.encoding = encoding
|
758
|
+
end
|
759
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
760
|
+
encoding = "UTF-16"
|
761
|
+
end
|
762
|
+
standalone = STANDALONE.match(content)
|
763
|
+
standalone = standalone[1] unless standalone.nil?
|
764
|
+
return [ :xmldecl, version, encoding, standalone ]
|
765
|
+
end
|
766
|
+
[:processing_instruction, name, content]
|
587
767
|
end
|
588
768
|
|
589
|
-
def parse_attributes(prefixes
|
769
|
+
def parse_attributes(prefixes)
|
590
770
|
attributes = {}
|
771
|
+
expanded_names = {}
|
591
772
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
773
|
+
while true
|
774
|
+
if @source.match?(">", true)
|
775
|
+
return attributes, closed
|
776
|
+
elsif @source.match?("/>", true)
|
777
|
+
closed = true
|
778
|
+
return attributes, closed
|
779
|
+
elsif match = @source.match(QNAME, true)
|
780
|
+
name = match[1]
|
781
|
+
prefix = match[2]
|
782
|
+
local_part = match[3]
|
783
|
+
|
784
|
+
unless @source.match?(/\s*=\s*/um, true)
|
618
785
|
message = "Missing attribute equal: <#{name}>"
|
619
786
|
raise REXML::ParseException.new(message, @source)
|
620
787
|
end
|
621
|
-
|
622
|
-
unless quote
|
788
|
+
unless match = @source.match(/(['"])/, true)
|
623
789
|
message = "Missing attribute value start quote: <#{name}>"
|
624
790
|
raise REXML::ParseException.new(message, @source)
|
625
791
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
792
|
+
quote = match[1]
|
793
|
+
start_position = @source.position
|
794
|
+
value = @source.read_until(quote)
|
795
|
+
unless value.chomp!(quote)
|
796
|
+
@source.position = start_position
|
797
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
798
|
raise REXML::ParseException.new(message, @source)
|
639
799
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
800
|
+
@source.match?(/\s*/um, true)
|
801
|
+
if prefix == "xmlns"
|
802
|
+
if local_part == "xml"
|
803
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
804
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
805
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
806
|
+
raise REXML::ParseException.new( msg, @source, self )
|
807
|
+
end
|
808
|
+
elsif local_part == "xmlns"
|
809
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
810
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
811
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
812
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
813
|
+
add_namespace(local_part, value)
|
814
|
+
elsif prefix
|
815
|
+
prefixes << prefix unless prefix == "xml"
|
657
816
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
817
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
818
|
+
if attributes[name]
|
819
|
+
msg = "Duplicate attribute #{name.inspect}"
|
820
|
+
raise REXML::ParseException.new(msg, @source, self)
|
821
|
+
end
|
667
822
|
|
668
|
-
|
823
|
+
unless prefix == "xmlns"
|
824
|
+
uri = @namespaces[prefix]
|
825
|
+
expanded_name = [uri, local_part]
|
826
|
+
existing_prefix = expanded_names[expanded_name]
|
827
|
+
if existing_prefix
|
828
|
+
message = "Namespace conflict in adding attribute " +
|
829
|
+
"\"#{local_part}\": " +
|
830
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
831
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
832
|
+
raise REXML::ParseException.new(message, @source, self)
|
833
|
+
end
|
834
|
+
expanded_names[expanded_name] = prefix
|
835
|
+
end
|
836
|
+
|
837
|
+
attributes[name] = value
|
838
|
+
else
|
839
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
840
|
+
raise REXML::ParseException.new(message, @source)
|
841
|
+
end
|
669
842
|
end
|
670
|
-
return attributes, closed
|
671
843
|
end
|
672
844
|
end
|
673
845
|
end
|