rexml 3.2.6 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +423 -0
- data/lib/rexml/attribute.rb +3 -2
- data/lib/rexml/document.rb +5 -1
- data/lib/rexml/element.rb +16 -31
- data/lib/rexml/entity.rb +9 -48
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +466 -273
- data/lib/rexml/parsers/pullparser.rb +16 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +185 -100
- data/lib/rexml/text.rb +54 -57
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +6 -50
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,34 @@ module REXML
|
|
112
140
|
"apos" => [/'/, "'", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
170
|
+
@source.ensure_buffer
|
118
171
|
end
|
119
172
|
|
120
173
|
def add_listener( listener )
|
@@ -122,15 +175,24 @@ module REXML
|
|
122
175
|
end
|
123
176
|
|
124
177
|
attr_reader :source
|
178
|
+
attr_reader :entity_expansion_count
|
179
|
+
attr_writer :entity_expansion_limit
|
180
|
+
attr_writer :entity_expansion_text_limit
|
125
181
|
|
126
182
|
def stream=( source )
|
127
183
|
@source = SourceFactory.create_from( source )
|
184
|
+
reset
|
185
|
+
end
|
186
|
+
|
187
|
+
def reset
|
128
188
|
@closed = nil
|
189
|
+
@have_root = false
|
129
190
|
@document_status = nil
|
130
191
|
@tags = []
|
131
192
|
@stack = []
|
132
193
|
@entities = []
|
133
|
-
@
|
194
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
195
|
+
@namespaces_restore_stack = []
|
134
196
|
end
|
135
197
|
|
136
198
|
def position
|
@@ -180,6 +242,8 @@ module REXML
|
|
180
242
|
|
181
243
|
# Returns the next event. This is a +PullEvent+ object.
|
182
244
|
def pull
|
245
|
+
@source.drop_parsed_content
|
246
|
+
|
183
247
|
pull_event.tap do |event|
|
184
248
|
@listeners.each do |listener|
|
185
249
|
listener.receive event
|
@@ -192,236 +256,277 @@ module REXML
|
|
192
256
|
x, @closed = @closed, nil
|
193
257
|
return [ :end_element, x ]
|
194
258
|
end
|
195
|
-
|
259
|
+
if empty?
|
260
|
+
if @document_status == :in_doctype
|
261
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
262
|
+
end
|
263
|
+
unless @tags.empty?
|
264
|
+
path = "/" + @tags.join("/")
|
265
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
266
|
+
end
|
267
|
+
return [ :end_document ]
|
268
|
+
end
|
196
269
|
return @stack.shift if @stack.size > 0
|
197
270
|
#STDERR.puts @source.encoding
|
198
271
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
272
|
+
|
273
|
+
@source.ensure_buffer
|
199
274
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
275
|
+
start_position = @source.position
|
276
|
+
if @source.match?("<?", true)
|
223
277
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
278
|
+
elsif @source.match?("<!", true)
|
279
|
+
if @source.match?("--", true)
|
280
|
+
md = @source.match(/(.*?)-->/um, true)
|
281
|
+
if md.nil?
|
282
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
242
283
|
end
|
243
|
-
if
|
284
|
+
if /--|-\z/.match?(md[1])
|
285
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
286
|
+
end
|
287
|
+
return [ :comment, md[1] ]
|
288
|
+
elsif @source.match?("DOCTYPE", true)
|
289
|
+
base_error_message = "Malformed DOCTYPE"
|
290
|
+
unless @source.match?(/\s+/um, true)
|
291
|
+
if @source.match?(">")
|
292
|
+
message = "#{base_error_message}: name is missing"
|
293
|
+
else
|
294
|
+
message = "#{base_error_message}: invalid name"
|
295
|
+
end
|
296
|
+
@source.position = start_position
|
297
|
+
raise REXML::ParseException.new(message, @source)
|
298
|
+
end
|
299
|
+
name = parse_name(base_error_message)
|
300
|
+
@source.match?(/\s*/um, true) # skip spaces
|
301
|
+
if @source.match?("[", true)
|
302
|
+
id = [nil, nil, nil]
|
244
303
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(
|
304
|
+
elsif @source.match?(">", true)
|
305
|
+
id = [nil, nil, nil]
|
246
306
|
@document_status = :after_doctype
|
307
|
+
@source.ensure_buffer
|
247
308
|
else
|
248
|
-
|
249
|
-
|
309
|
+
id = parse_id(base_error_message,
|
310
|
+
accept_external_id: true,
|
311
|
+
accept_public_id: false)
|
312
|
+
if id[0] == "SYSTEM"
|
313
|
+
# For backward compatibility
|
314
|
+
id[1], id[2] = id[2], nil
|
315
|
+
end
|
316
|
+
@source.match?(/\s*/um, true) # skip spaces
|
317
|
+
if @source.match?("[", true)
|
318
|
+
@document_status = :in_doctype
|
319
|
+
elsif @source.match?(">", true)
|
320
|
+
@document_status = :after_doctype
|
321
|
+
@source.ensure_buffer
|
322
|
+
else
|
323
|
+
message = "#{base_error_message}: garbage after external ID"
|
324
|
+
raise REXML::ParseException.new(message, @source)
|
325
|
+
end
|
250
326
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
327
|
+
args = [:start_doctype, name, *id]
|
328
|
+
if @document_status == :after_doctype
|
329
|
+
@source.match?(/\s*/um, true)
|
330
|
+
@stack << [ :end_doctype ]
|
331
|
+
end
|
332
|
+
return args
|
333
|
+
else
|
334
|
+
message = "Invalid XML"
|
335
|
+
raise REXML::ParseException.new(message, @source)
|
263
336
|
end
|
264
337
|
end
|
265
338
|
end
|
266
339
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
340
|
+
@source.match?(/\s*/um, true) # skip spaces
|
341
|
+
start_position = @source.position
|
342
|
+
if @source.match?("<!", true)
|
343
|
+
if @source.match?("ELEMENT", true)
|
344
|
+
md = @source.match(/(.*?)>/um, true)
|
345
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
346
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
347
|
+
elsif @source.match?("ENTITY", true)
|
348
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
349
|
+
unless match_data
|
350
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
351
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
352
|
+
match = [:entitydecl, *match_data.captures.compact]
|
353
|
+
ref = false
|
354
|
+
if match[1] == '%'
|
355
|
+
ref = true
|
356
|
+
match.delete_at 1
|
357
|
+
end
|
358
|
+
# Now we have to sort out what kind of entity reference this is
|
359
|
+
if match[2] == 'SYSTEM'
|
360
|
+
# External reference
|
361
|
+
match[3] = match[3][1..-2] # PUBID
|
362
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
363
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
364
|
+
elsif match[2] == 'PUBLIC'
|
365
|
+
# External reference
|
366
|
+
match[3] = match[3][1..-2] # PUBID
|
367
|
+
match[4] = match[4][1..-2] # HREF
|
368
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
369
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
370
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
371
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
372
|
else
|
329
|
-
|
373
|
+
match[2] = match[2][1..-2]
|
374
|
+
match.pop if match.size == 4
|
375
|
+
# match is [ :entity, name, value ]
|
330
376
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
377
|
+
match << '%' if ref
|
378
|
+
return match
|
379
|
+
elsif @source.match?("ATTLIST", true)
|
380
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
381
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
382
|
+
element = md[1]
|
383
|
+
contents = "<!ATTLIST" + md[0]
|
384
|
+
|
385
|
+
pairs = {}
|
386
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
387
|
+
values.each do |attdef|
|
388
|
+
unless attdef[3] == "#IMPLIED"
|
389
|
+
attdef.compact!
|
390
|
+
val = attdef[3]
|
391
|
+
val = attdef[4] if val == "#FIXED "
|
392
|
+
pairs[attdef[0]] = val
|
393
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
394
|
+
@namespaces[$1] = val
|
395
|
+
end
|
396
|
+
end
|
397
|
+
end
|
398
|
+
return [ :attlistdecl, element, pairs, contents ]
|
399
|
+
elsif @source.match?("NOTATION", true)
|
400
|
+
base_error_message = "Malformed notation declaration"
|
401
|
+
unless @source.match?(/\s+/um, true)
|
402
|
+
if @source.match?(">")
|
403
|
+
message = "#{base_error_message}: name is missing"
|
404
|
+
else
|
405
|
+
message = "#{base_error_message}: invalid name"
|
406
|
+
end
|
407
|
+
@source.position = start_position
|
408
|
+
raise REXML::ParseException.new(message, @source)
|
409
|
+
end
|
410
|
+
name = parse_name(base_error_message)
|
411
|
+
id = parse_id(base_error_message,
|
412
|
+
accept_external_id: true,
|
413
|
+
accept_public_id: true)
|
414
|
+
@source.match?(/\s*/um, true) # skip spaces
|
415
|
+
unless @source.match?(">", true)
|
416
|
+
message = "#{base_error_message}: garbage before end >"
|
417
|
+
raise REXML::ParseException.new(message, @source)
|
418
|
+
end
|
419
|
+
return [:notationdecl, name, *id]
|
420
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
421
|
+
case md[1]
|
422
|
+
when /--/, /-\z/
|
423
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
424
|
+
end
|
425
|
+
return [ :comment, md[1] ] if md
|
340
426
|
end
|
341
|
-
|
342
|
-
|
427
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
428
|
+
return [ :externalentity, match[1] ]
|
429
|
+
elsif @source.match?(/\]\s*>/um, true)
|
343
430
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
431
|
return [ :end_doctype ]
|
346
432
|
end
|
433
|
+
if @document_status == :in_doctype
|
434
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
435
|
+
end
|
347
436
|
end
|
348
437
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
438
|
+
@source.match?(/\s*/um, true)
|
350
439
|
end
|
351
440
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
441
|
+
start_position = @source.position
|
442
|
+
if @source.match?("<", true)
|
443
|
+
# :text's read_until may remain only "<" in buffer. In the
|
444
|
+
# case, buffer is empty here. So we need to fill buffer
|
445
|
+
# here explicitly.
|
446
|
+
@source.ensure_buffer
|
447
|
+
if @source.match?("/", true)
|
448
|
+
@namespaces_restore_stack.pop
|
356
449
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
450
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
451
|
if md and !last_tag
|
359
452
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
453
|
raise REXML::ParseException.new(message, @source)
|
361
454
|
end
|
362
455
|
if md.nil? or last_tag != md[1]
|
363
456
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
457
|
+
message += " (got '#{md[1]}')" if md
|
458
|
+
@source.position = start_position if md.nil?
|
365
459
|
raise REXML::ParseException.new(message, @source)
|
366
460
|
end
|
367
461
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
462
|
+
elsif @source.match?("!", true)
|
463
|
+
md = @source.match(/([^>]*>)/um)
|
370
464
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
465
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
466
|
+
if md[0][0] == ?-
|
467
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
468
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
469
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
470
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
471
|
end
|
379
472
|
|
380
|
-
return [ :comment, md[1] ]
|
473
|
+
return [ :comment, md[1] ]
|
381
474
|
else
|
382
|
-
md = @source.match(
|
475
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
476
|
return [ :cdata, md[1] ] if md
|
384
477
|
end
|
385
478
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
479
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
480
|
+
elsif @source.match?("?", true)
|
388
481
|
return process_instruction
|
389
482
|
else
|
390
483
|
# Get the next tag
|
391
|
-
md = @source.match(
|
484
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
485
|
unless md
|
486
|
+
@source.position = start_position
|
393
487
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
488
|
end
|
489
|
+
tag = md[1]
|
395
490
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
491
|
+
@prefixes.clear
|
492
|
+
@prefixes << md[2] if md[2]
|
493
|
+
push_namespaces_restore
|
494
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
495
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
496
|
+
for prefix in @prefixes
|
497
|
+
unless @namespaces.key?(prefix)
|
403
498
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
499
|
end
|
405
500
|
end
|
406
501
|
|
407
502
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
503
|
+
@closed = tag
|
504
|
+
pop_namespaces_restore
|
410
505
|
else
|
411
|
-
@tags.
|
506
|
+
if @tags.empty? and @have_root
|
507
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
508
|
+
end
|
509
|
+
@tags.push( tag )
|
412
510
|
end
|
413
|
-
|
511
|
+
@have_root = true
|
512
|
+
return [ :start_element, tag, attributes ]
|
414
513
|
end
|
415
514
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
515
|
+
text = @source.read_until("<")
|
516
|
+
if text.chomp!("<")
|
517
|
+
@source.position -= "<".bytesize
|
419
518
|
end
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
519
|
+
if @tags.empty?
|
520
|
+
unless /\A\s*\z/.match?(text)
|
521
|
+
if @have_root
|
522
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
523
|
+
else
|
524
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
525
|
+
end
|
526
|
+
end
|
527
|
+
return pull_event if @have_root
|
528
|
+
end
|
529
|
+
return [ :text, text ]
|
425
530
|
end
|
426
531
|
rescue REXML::UndefinedNamespaceException
|
427
532
|
raise
|
@@ -436,13 +541,13 @@ module REXML
|
|
436
541
|
private :pull_event
|
437
542
|
|
438
543
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
544
|
+
return unless entities
|
545
|
+
|
546
|
+
value = entities[ reference ]
|
547
|
+
return if value.nil?
|
548
|
+
|
549
|
+
record_entity_expansion
|
550
|
+
unnormalize( value, entities )
|
446
551
|
end
|
447
552
|
|
448
553
|
# Escapes all possible entities
|
@@ -463,35 +568,87 @@ module REXML
|
|
463
568
|
|
464
569
|
# Unescapes all possible entities
|
465
570
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
571
|
+
if string.include?("\r")
|
572
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
573
|
+
else
|
574
|
+
rv = string.dup
|
575
|
+
end
|
468
576
|
matches = rv.scan( REFERENCE_RE )
|
469
577
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
578
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
579
|
m=$1
|
472
|
-
|
473
|
-
|
580
|
+
if m.start_with?("x")
|
581
|
+
code_point = Integer(m[1..-1], 16)
|
582
|
+
else
|
583
|
+
code_point = Integer(m, 10)
|
584
|
+
end
|
585
|
+
[code_point].pack('U*')
|
474
586
|
}
|
475
587
|
matches.collect!{|x|x[0]}.compact!
|
588
|
+
if filter
|
589
|
+
matches.reject! do |entity_reference|
|
590
|
+
filter.include?(entity_reference)
|
591
|
+
end
|
592
|
+
end
|
476
593
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
594
|
+
matches.tally.each do |entity_reference, n|
|
595
|
+
entity_expansion_count_before = @entity_expansion_count
|
596
|
+
entity_value = entity( entity_reference, entities )
|
597
|
+
if entity_value
|
598
|
+
if n > 1
|
599
|
+
entity_expansion_count_delta =
|
600
|
+
@entity_expansion_count - entity_expansion_count_before
|
601
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
602
|
+
end
|
603
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
604
|
+
rv.gsub!( re, entity_value )
|
605
|
+
if rv.bytesize > @entity_expansion_text_limit
|
606
|
+
raise "entity expansion has grown too large"
|
486
607
|
end
|
608
|
+
else
|
609
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
610
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
611
|
end
|
488
612
|
end
|
489
|
-
rv.gsub!(
|
613
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
614
|
end
|
491
615
|
rv
|
492
616
|
end
|
493
617
|
|
494
618
|
private
|
619
|
+
def add_namespace(prefix, uri)
|
620
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
621
|
+
if uri.nil?
|
622
|
+
@namespaces.delete(prefix)
|
623
|
+
else
|
624
|
+
@namespaces[prefix] = uri
|
625
|
+
end
|
626
|
+
end
|
627
|
+
|
628
|
+
def push_namespaces_restore
|
629
|
+
namespaces_restore = {}
|
630
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
631
|
+
namespaces_restore
|
632
|
+
end
|
633
|
+
|
634
|
+
def pop_namespaces_restore
|
635
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
636
|
+
namespaces_restore.each do |prefix, uri|
|
637
|
+
if uri.nil?
|
638
|
+
@namespaces.delete(prefix)
|
639
|
+
else
|
640
|
+
@namespaces[prefix] = uri
|
641
|
+
end
|
642
|
+
end
|
643
|
+
end
|
644
|
+
|
645
|
+
def record_entity_expansion(delta=1)
|
646
|
+
@entity_expansion_count += delta
|
647
|
+
if @entity_expansion_count > @entity_expansion_limit
|
648
|
+
raise "number of entity expansions exceeded, processing aborted."
|
649
|
+
end
|
650
|
+
end
|
651
|
+
|
495
652
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
653
|
return false if xml_declaration_encoding.nil?
|
497
654
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +656,16 @@ module REXML
|
|
499
656
|
end
|
500
657
|
|
501
658
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
659
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
660
|
unless md
|
504
|
-
if @source.match(/\
|
661
|
+
if @source.match?(/\S/um)
|
505
662
|
message = "#{base_error_message}: invalid name"
|
506
663
|
else
|
507
664
|
message = "#{base_error_message}: name is missing"
|
508
665
|
end
|
509
666
|
raise REXML::ParseException.new(message, @source)
|
510
667
|
end
|
511
|
-
md[
|
668
|
+
md[0]
|
512
669
|
end
|
513
670
|
|
514
671
|
def parse_id(base_error_message,
|
@@ -543,34 +700,34 @@ module REXML
|
|
543
700
|
accept_public_id:)
|
544
701
|
public = /\A\s*PUBLIC/um
|
545
702
|
system = /\A\s*SYSTEM/um
|
546
|
-
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
|
547
|
-
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
703
|
+
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
|
704
|
+
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
548
705
|
return "public ID literal is missing"
|
549
706
|
end
|
550
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
|
707
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
|
551
708
|
return "invalid public ID literal"
|
552
709
|
end
|
553
710
|
if accept_public_id
|
554
|
-
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
711
|
+
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
555
712
|
return "system ID literal is missing"
|
556
713
|
end
|
557
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
714
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
558
715
|
return "invalid system literal"
|
559
716
|
end
|
560
717
|
"garbage after system literal"
|
561
718
|
else
|
562
719
|
"garbage after public ID literal"
|
563
720
|
end
|
564
|
-
elsif accept_external_id and @source.match(/#{system}/um)
|
565
|
-
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
721
|
+
elsif accept_external_id and @source.match?(/#{system}/um)
|
722
|
+
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
566
723
|
return "system literal is missing"
|
567
724
|
end
|
568
|
-
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
|
725
|
+
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
|
569
726
|
return "invalid system literal"
|
570
727
|
end
|
571
728
|
"garbage after system literal"
|
572
729
|
else
|
573
|
-
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
730
|
+
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
574
731
|
return "invalid ID type"
|
575
732
|
end
|
576
733
|
"ID type is missing"
|
@@ -578,96 +735,132 @@ module REXML
|
|
578
735
|
end
|
579
736
|
|
580
737
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
738
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
739
|
+
if @source.match?(/\s+/um, true)
|
740
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
741
|
+
unless match_data
|
742
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
743
|
+
end
|
744
|
+
content = match_data[1]
|
745
|
+
else
|
746
|
+
content = nil
|
747
|
+
unless @source.match?("?>", true)
|
748
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
749
|
+
end
|
750
|
+
end
|
751
|
+
if name == "xml"
|
752
|
+
if @document_status
|
753
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
754
|
+
end
|
755
|
+
version = VERSION.match(content)
|
756
|
+
version = version[1] unless version.nil?
|
757
|
+
encoding = ENCODING.match(content)
|
758
|
+
encoding = encoding[1] unless encoding.nil?
|
759
|
+
if need_source_encoding_update?(encoding)
|
760
|
+
@source.encoding = encoding
|
761
|
+
end
|
762
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
763
|
+
encoding = "UTF-16"
|
764
|
+
end
|
765
|
+
standalone = STANDALONE.match(content)
|
766
|
+
standalone = standalone[1] unless standalone.nil?
|
767
|
+
return [ :xmldecl, version, encoding, standalone ]
|
585
768
|
end
|
586
|
-
[:processing_instruction,
|
769
|
+
[:processing_instruction, name, content]
|
587
770
|
end
|
588
771
|
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
match_data = @source.match(/^(.*?)(\/)?>/um, true)
|
593
|
-
if match_data.nil?
|
594
|
-
message = "Start tag isn't ended"
|
595
|
-
raise REXML::ParseException.new(message, @source)
|
772
|
+
if StringScanner::Version < "3.1.1"
|
773
|
+
def scan_quote
|
774
|
+
@source.match(/(['"])/, true)&.[](1)
|
596
775
|
end
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
776
|
+
else
|
777
|
+
def scan_quote
|
778
|
+
case @source.peek_byte
|
779
|
+
when 34 # '"'.ord
|
780
|
+
@source.scan_byte
|
781
|
+
'"'
|
782
|
+
when 39 # "'".ord
|
783
|
+
@source.scan_byte
|
784
|
+
"'"
|
785
|
+
else
|
786
|
+
nil
|
607
787
|
end
|
788
|
+
end
|
789
|
+
end
|
608
790
|
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
791
|
+
def parse_attributes(prefixes)
|
792
|
+
attributes = {}
|
793
|
+
expanded_names = {}
|
794
|
+
closed = false
|
795
|
+
while true
|
796
|
+
if @source.match?(">", true)
|
797
|
+
return attributes, closed
|
798
|
+
elsif @source.match?("/>", true)
|
799
|
+
closed = true
|
800
|
+
return attributes, closed
|
801
|
+
elsif match = @source.match(QNAME, true)
|
802
|
+
name = match[1]
|
803
|
+
prefix = match[2]
|
804
|
+
local_part = match[3]
|
805
|
+
|
806
|
+
unless @source.match?(/\s*=\s*/um, true)
|
618
807
|
message = "Missing attribute equal: <#{name}>"
|
619
808
|
raise REXML::ParseException.new(message, @source)
|
620
809
|
end
|
621
|
-
quote =
|
622
|
-
unless quote
|
810
|
+
unless quote = scan_quote
|
623
811
|
message = "Missing attribute value start quote: <#{name}>"
|
624
812
|
raise REXML::ParseException.new(message, @source)
|
625
813
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
scanner << match_data[1]
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
814
|
+
start_position = @source.position
|
815
|
+
value = @source.read_until(quote)
|
816
|
+
unless value.chomp!(quote)
|
817
|
+
@source.position = start_position
|
818
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
819
|
raise REXML::ParseException.new(message, @source)
|
639
820
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
821
|
+
@source.match?(/\s*/um, true)
|
822
|
+
if prefix == "xmlns"
|
823
|
+
if local_part == "xml"
|
824
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
825
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
826
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
827
|
+
raise REXML::ParseException.new( msg, @source, self )
|
828
|
+
end
|
829
|
+
elsif local_part == "xmlns"
|
830
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
831
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
832
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
833
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
834
|
+
add_namespace(local_part, value)
|
835
|
+
elsif prefix
|
836
|
+
prefixes << prefix unless prefix == "xml"
|
657
837
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
838
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
839
|
+
if attributes[name]
|
840
|
+
msg = "Duplicate attribute #{name.inspect}"
|
841
|
+
raise REXML::ParseException.new(msg, @source, self)
|
842
|
+
end
|
667
843
|
|
668
|
-
|
844
|
+
unless prefix == "xmlns"
|
845
|
+
uri = @namespaces[prefix]
|
846
|
+
expanded_name = [uri, local_part]
|
847
|
+
existing_prefix = expanded_names[expanded_name]
|
848
|
+
if existing_prefix
|
849
|
+
message = "Namespace conflict in adding attribute " +
|
850
|
+
"\"#{local_part}\": " +
|
851
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
852
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
853
|
+
raise REXML::ParseException.new(message, @source, self)
|
854
|
+
end
|
855
|
+
expanded_names[expanded_name] = prefix
|
856
|
+
end
|
857
|
+
|
858
|
+
attributes[name] = value
|
859
|
+
else
|
860
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
861
|
+
raise REXML::ParseException.new(message, @source)
|
862
|
+
end
|
669
863
|
end
|
670
|
-
return attributes, closed
|
671
864
|
end
|
672
865
|
end
|
673
866
|
end
|