rexml 3.2.6 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +399 -0
- data/lib/rexml/attribute.rb +3 -2
- data/lib/rexml/document.rb +5 -1
- data/lib/rexml/element.rb +16 -31
- data/lib/rexml/entity.rb +9 -48
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +446 -274
- data/lib/rexml/parsers/pullparser.rb +16 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +171 -100
- data/lib/rexml/text.rb +54 -57
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +6 -47
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,34 @@ module REXML
|
|
112
140
|
"apos" => [/'/, "'", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
170
|
+
@source.ensure_buffer
|
118
171
|
end
|
119
172
|
|
120
173
|
def add_listener( listener )
|
@@ -122,15 +175,24 @@ module REXML
|
|
122
175
|
end
|
123
176
|
|
124
177
|
attr_reader :source
|
178
|
+
attr_reader :entity_expansion_count
|
179
|
+
attr_writer :entity_expansion_limit
|
180
|
+
attr_writer :entity_expansion_text_limit
|
125
181
|
|
126
182
|
def stream=( source )
|
127
183
|
@source = SourceFactory.create_from( source )
|
184
|
+
reset
|
185
|
+
end
|
186
|
+
|
187
|
+
def reset
|
128
188
|
@closed = nil
|
189
|
+
@have_root = false
|
129
190
|
@document_status = nil
|
130
191
|
@tags = []
|
131
192
|
@stack = []
|
132
193
|
@entities = []
|
133
|
-
@
|
194
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
195
|
+
@namespaces_restore_stack = []
|
134
196
|
end
|
135
197
|
|
136
198
|
def position
|
@@ -180,6 +242,8 @@ module REXML
|
|
180
242
|
|
181
243
|
# Returns the next event. This is a +PullEvent+ object.
|
182
244
|
def pull
|
245
|
+
@source.drop_parsed_content
|
246
|
+
|
183
247
|
pull_event.tap do |event|
|
184
248
|
@listeners.each do |listener|
|
185
249
|
listener.receive event
|
@@ -192,236 +256,274 @@ module REXML
|
|
192
256
|
x, @closed = @closed, nil
|
193
257
|
return [ :end_element, x ]
|
194
258
|
end
|
195
|
-
|
259
|
+
if empty?
|
260
|
+
if @document_status == :in_doctype
|
261
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
262
|
+
end
|
263
|
+
unless @tags.empty?
|
264
|
+
path = "/" + @tags.join("/")
|
265
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
266
|
+
end
|
267
|
+
return [ :end_document ]
|
268
|
+
end
|
196
269
|
return @stack.shift if @stack.size > 0
|
197
270
|
#STDERR.puts @source.encoding
|
198
271
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
272
|
+
|
273
|
+
@source.ensure_buffer
|
199
274
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
275
|
+
start_position = @source.position
|
276
|
+
if @source.match?("<?", true)
|
223
277
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
278
|
+
elsif @source.match?("<!", true)
|
279
|
+
if @source.match?("--", true)
|
280
|
+
md = @source.match(/(.*?)-->/um, true)
|
281
|
+
if md.nil?
|
282
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
283
|
+
end
|
284
|
+
if /--|-\z/.match?(md[1])
|
285
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
286
|
+
end
|
287
|
+
return [ :comment, md[1] ]
|
288
|
+
elsif @source.match?("DOCTYPE", true)
|
289
|
+
base_error_message = "Malformed DOCTYPE"
|
290
|
+
unless @source.match?(/\s+/um, true)
|
291
|
+
if @source.match?(">")
|
292
|
+
message = "#{base_error_message}: name is missing"
|
293
|
+
else
|
294
|
+
message = "#{base_error_message}: invalid name"
|
295
|
+
end
|
296
|
+
@source.position = start_position
|
297
|
+
raise REXML::ParseException.new(message, @source)
|
242
298
|
end
|
243
|
-
|
299
|
+
name = parse_name(base_error_message)
|
300
|
+
if @source.match?(/\s*\[/um, true)
|
301
|
+
id = [nil, nil, nil]
|
244
302
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
303
|
+
elsif @source.match?(/\s*>/um, true)
|
304
|
+
id = [nil, nil, nil]
|
246
305
|
@document_status = :after_doctype
|
306
|
+
@source.ensure_buffer
|
247
307
|
else
|
248
|
-
|
249
|
-
|
308
|
+
id = parse_id(base_error_message,
|
309
|
+
accept_external_id: true,
|
310
|
+
accept_public_id: false)
|
311
|
+
if id[0] == "SYSTEM"
|
312
|
+
# For backward compatibility
|
313
|
+
id[1], id[2] = id[2], nil
|
314
|
+
end
|
315
|
+
if @source.match?(/\s*\[/um, true)
|
316
|
+
@document_status = :in_doctype
|
317
|
+
elsif @source.match?(/\s*>/um, true)
|
318
|
+
@document_status = :after_doctype
|
319
|
+
@source.ensure_buffer
|
320
|
+
else
|
321
|
+
message = "#{base_error_message}: garbage after external ID"
|
322
|
+
raise REXML::ParseException.new(message, @source)
|
323
|
+
end
|
250
324
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
325
|
+
args = [:start_doctype, name, *id]
|
326
|
+
if @document_status == :after_doctype
|
327
|
+
@source.match?(/\s*/um, true)
|
328
|
+
@stack << [ :end_doctype ]
|
329
|
+
end
|
330
|
+
return args
|
331
|
+
else
|
332
|
+
message = "Invalid XML"
|
333
|
+
raise REXML::ParseException.new(message, @source)
|
263
334
|
end
|
264
335
|
end
|
265
336
|
end
|
266
337
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
338
|
+
@source.match?(/\s*/um, true) # skip spaces
|
339
|
+
start_position = @source.position
|
340
|
+
if @source.match?("<!", true)
|
341
|
+
if @source.match?("ELEMENT", true)
|
342
|
+
md = @source.match(/(.*?)>/um, true)
|
343
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
344
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
345
|
+
elsif @source.match?("ENTITY", true)
|
346
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
347
|
+
unless match_data
|
348
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
349
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
350
|
+
match = [:entitydecl, *match_data.captures.compact]
|
351
|
+
ref = false
|
352
|
+
if match[1] == '%'
|
353
|
+
ref = true
|
354
|
+
match.delete_at 1
|
355
|
+
end
|
356
|
+
# Now we have to sort out what kind of entity reference this is
|
357
|
+
if match[2] == 'SYSTEM'
|
358
|
+
# External reference
|
359
|
+
match[3] = match[3][1..-2] # PUBID
|
360
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
361
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
362
|
+
elsif match[2] == 'PUBLIC'
|
363
|
+
# External reference
|
364
|
+
match[3] = match[3][1..-2] # PUBID
|
365
|
+
match[4] = match[4][1..-2] # HREF
|
366
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
367
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
368
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
369
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
370
|
else
|
329
|
-
|
371
|
+
match[2] = match[2][1..-2]
|
372
|
+
match.pop if match.size == 4
|
373
|
+
# match is [ :entity, name, value ]
|
330
374
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
375
|
+
match << '%' if ref
|
376
|
+
return match
|
377
|
+
elsif @source.match?("ATTLIST", true)
|
378
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
379
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
380
|
+
element = md[1]
|
381
|
+
contents = md[0]
|
382
|
+
|
383
|
+
pairs = {}
|
384
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
385
|
+
values.each do |attdef|
|
386
|
+
unless attdef[3] == "#IMPLIED"
|
387
|
+
attdef.compact!
|
388
|
+
val = attdef[3]
|
389
|
+
val = attdef[4] if val == "#FIXED "
|
390
|
+
pairs[attdef[0]] = val
|
391
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
392
|
+
@namespaces[$1] = val
|
393
|
+
end
|
394
|
+
end
|
395
|
+
end
|
396
|
+
return [ :attlistdecl, element, pairs, contents ]
|
397
|
+
elsif @source.match?("NOTATION", true)
|
398
|
+
base_error_message = "Malformed notation declaration"
|
399
|
+
unless @source.match?(/\s+/um, true)
|
400
|
+
if @source.match?(">")
|
401
|
+
message = "#{base_error_message}: name is missing"
|
402
|
+
else
|
403
|
+
message = "#{base_error_message}: invalid name"
|
404
|
+
end
|
405
|
+
@source.position = start_position
|
406
|
+
raise REXML::ParseException.new(message, @source)
|
407
|
+
end
|
408
|
+
name = parse_name(base_error_message)
|
409
|
+
id = parse_id(base_error_message,
|
410
|
+
accept_external_id: true,
|
411
|
+
accept_public_id: true)
|
412
|
+
unless @source.match?(/\s*>/um, true)
|
413
|
+
message = "#{base_error_message}: garbage before end >"
|
414
|
+
raise REXML::ParseException.new(message, @source)
|
415
|
+
end
|
416
|
+
return [:notationdecl, name, *id]
|
417
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
418
|
+
case md[1]
|
419
|
+
when /--/, /-\z/
|
420
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
421
|
+
end
|
422
|
+
return [ :comment, md[1] ] if md
|
340
423
|
end
|
341
|
-
|
342
|
-
|
424
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
425
|
+
return [ :externalentity, match[1] ]
|
426
|
+
elsif @source.match?(/\]\s*>/um, true)
|
343
427
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
428
|
return [ :end_doctype ]
|
346
429
|
end
|
430
|
+
if @document_status == :in_doctype
|
431
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
432
|
+
end
|
347
433
|
end
|
348
434
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
435
|
+
@source.match?(/\s*/um, true)
|
350
436
|
end
|
351
437
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
438
|
+
start_position = @source.position
|
439
|
+
if @source.match?("<", true)
|
440
|
+
# :text's read_until may remain only "<" in buffer. In the
|
441
|
+
# case, buffer is empty here. So we need to fill buffer
|
442
|
+
# here explicitly.
|
443
|
+
@source.ensure_buffer
|
444
|
+
if @source.match?("/", true)
|
445
|
+
@namespaces_restore_stack.pop
|
356
446
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
447
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
448
|
if md and !last_tag
|
359
449
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
450
|
raise REXML::ParseException.new(message, @source)
|
361
451
|
end
|
362
452
|
if md.nil? or last_tag != md[1]
|
363
453
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
454
|
+
message += " (got '#{md[1]}')" if md
|
455
|
+
@source.position = start_position if md.nil?
|
365
456
|
raise REXML::ParseException.new(message, @source)
|
366
457
|
end
|
367
458
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
459
|
+
elsif @source.match?("!", true)
|
460
|
+
md = @source.match(/([^>]*>)/um)
|
370
461
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
462
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
463
|
+
if md[0][0] == ?-
|
464
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
465
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
466
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
467
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
468
|
end
|
379
469
|
|
380
|
-
return [ :comment, md[1] ]
|
470
|
+
return [ :comment, md[1] ]
|
381
471
|
else
|
382
|
-
md = @source.match(
|
472
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
473
|
return [ :cdata, md[1] ] if md
|
384
474
|
end
|
385
475
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
476
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
477
|
+
elsif @source.match?("?", true)
|
388
478
|
return process_instruction
|
389
479
|
else
|
390
480
|
# Get the next tag
|
391
|
-
md = @source.match(
|
481
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
482
|
unless md
|
483
|
+
@source.position = start_position
|
393
484
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
485
|
end
|
486
|
+
tag = md[1]
|
395
487
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
488
|
+
@prefixes.clear
|
489
|
+
@prefixes << md[2] if md[2]
|
490
|
+
push_namespaces_restore
|
491
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
492
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
493
|
+
for prefix in @prefixes
|
494
|
+
unless @namespaces.key?(prefix)
|
403
495
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
496
|
end
|
405
497
|
end
|
406
498
|
|
407
499
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
500
|
+
@closed = tag
|
501
|
+
pop_namespaces_restore
|
410
502
|
else
|
411
|
-
@tags.
|
503
|
+
if @tags.empty? and @have_root
|
504
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
505
|
+
end
|
506
|
+
@tags.push( tag )
|
412
507
|
end
|
413
|
-
|
508
|
+
@have_root = true
|
509
|
+
return [ :start_element, tag, attributes ]
|
414
510
|
end
|
415
511
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
512
|
+
text = @source.read_until("<")
|
513
|
+
if text.chomp!("<")
|
514
|
+
@source.position -= "<".bytesize
|
515
|
+
end
|
516
|
+
if @tags.empty?
|
517
|
+
unless /\A\s*\z/.match?(text)
|
518
|
+
if @have_root
|
519
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
520
|
+
else
|
521
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
522
|
+
end
|
523
|
+
end
|
524
|
+
return pull_event if @have_root
|
419
525
|
end
|
420
|
-
|
421
|
-
#return [ :text, "" ] if md[0].length == 0
|
422
|
-
# unnormalized = Text::unnormalize( md[1], self )
|
423
|
-
# return PullEvent.new( :text, md[1], unnormalized )
|
424
|
-
return [ :text, md[1] ]
|
526
|
+
return [ :text, text ]
|
425
527
|
end
|
426
528
|
rescue REXML::UndefinedNamespaceException
|
427
529
|
raise
|
@@ -436,13 +538,13 @@ module REXML
|
|
436
538
|
private :pull_event
|
437
539
|
|
438
540
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
541
|
+
return unless entities
|
542
|
+
|
543
|
+
value = entities[ reference ]
|
544
|
+
return if value.nil?
|
545
|
+
|
546
|
+
record_entity_expansion
|
547
|
+
unnormalize( value, entities )
|
446
548
|
end
|
447
549
|
|
448
550
|
# Escapes all possible entities
|
@@ -463,35 +565,87 @@ module REXML
|
|
463
565
|
|
464
566
|
# Unescapes all possible entities
|
465
567
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
568
|
+
if string.include?("\r")
|
569
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
570
|
+
else
|
571
|
+
rv = string.dup
|
572
|
+
end
|
468
573
|
matches = rv.scan( REFERENCE_RE )
|
469
574
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
575
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
576
|
m=$1
|
472
|
-
|
473
|
-
|
577
|
+
if m.start_with?("x")
|
578
|
+
code_point = Integer(m[1..-1], 16)
|
579
|
+
else
|
580
|
+
code_point = Integer(m, 10)
|
581
|
+
end
|
582
|
+
[code_point].pack('U*')
|
474
583
|
}
|
475
584
|
matches.collect!{|x|x[0]}.compact!
|
585
|
+
if filter
|
586
|
+
matches.reject! do |entity_reference|
|
587
|
+
filter.include?(entity_reference)
|
588
|
+
end
|
589
|
+
end
|
476
590
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
rv.gsub!( er[0], er[2] ) if er
|
591
|
+
matches.tally.each do |entity_reference, n|
|
592
|
+
entity_expansion_count_before = @entity_expansion_count
|
593
|
+
entity_value = entity( entity_reference, entities )
|
594
|
+
if entity_value
|
595
|
+
if n > 1
|
596
|
+
entity_expansion_count_delta =
|
597
|
+
@entity_expansion_count - entity_expansion_count_before
|
598
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
486
599
|
end
|
600
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
601
|
+
rv.gsub!( re, entity_value )
|
602
|
+
if rv.bytesize > @entity_expansion_text_limit
|
603
|
+
raise "entity expansion has grown too large"
|
604
|
+
end
|
605
|
+
else
|
606
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
607
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
608
|
end
|
488
609
|
end
|
489
|
-
rv.gsub!(
|
610
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
611
|
end
|
491
612
|
rv
|
492
613
|
end
|
493
614
|
|
494
615
|
private
|
616
|
+
def add_namespace(prefix, uri)
|
617
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
618
|
+
if uri.nil?
|
619
|
+
@namespaces.delete(prefix)
|
620
|
+
else
|
621
|
+
@namespaces[prefix] = uri
|
622
|
+
end
|
623
|
+
end
|
624
|
+
|
625
|
+
def push_namespaces_restore
|
626
|
+
namespaces_restore = {}
|
627
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
628
|
+
namespaces_restore
|
629
|
+
end
|
630
|
+
|
631
|
+
def pop_namespaces_restore
|
632
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
633
|
+
namespaces_restore.each do |prefix, uri|
|
634
|
+
if uri.nil?
|
635
|
+
@namespaces.delete(prefix)
|
636
|
+
else
|
637
|
+
@namespaces[prefix] = uri
|
638
|
+
end
|
639
|
+
end
|
640
|
+
end
|
641
|
+
|
642
|
+
def record_entity_expansion(delta=1)
|
643
|
+
@entity_expansion_count += delta
|
644
|
+
if @entity_expansion_count > @entity_expansion_limit
|
645
|
+
raise "number of entity expansions exceeded, processing aborted."
|
646
|
+
end
|
647
|
+
end
|
648
|
+
|
495
649
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
650
|
return false if xml_declaration_encoding.nil?
|
497
651
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +653,16 @@ module REXML
|
|
499
653
|
end
|
500
654
|
|
501
655
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
656
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
657
|
unless md
|
504
|
-
if @source.match(/\
|
658
|
+
if @source.match?(/\S/um)
|
505
659
|
message = "#{base_error_message}: invalid name"
|
506
660
|
else
|
507
661
|
message = "#{base_error_message}: name is missing"
|
508
662
|
end
|
509
663
|
raise REXML::ParseException.new(message, @source)
|
510
664
|
end
|
511
|
-
md[
|
665
|
+
md[0]
|
512
666
|
end
|
513
667
|
|
514
668
|
def parse_id(base_error_message,
|
@@ -543,34 +697,34 @@ module REXML
|
|
543
697
|
accept_public_id:)
|
544
698
|
public = /\A\s*PUBLIC/um
|
545
699
|
system = /\A\s*SYSTEM/um
|
546
|
-
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
|
547
|
-
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
700
|
+
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
|
701
|
+
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
548
702
|
return "public ID literal is missing"
|
549
703
|
end
|
550
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
|
704
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
|
551
705
|
return "invalid public ID literal"
|
552
706
|
end
|
553
707
|
if accept_public_id
|
554
|
-
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
708
|
+
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
555
709
|
return "system ID literal is missing"
|
556
710
|
end
|
557
|
-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
711
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
558
712
|
return "invalid system literal"
|
559
713
|
end
|
560
714
|
"garbage after system literal"
|
561
715
|
else
|
562
716
|
"garbage after public ID literal"
|
563
717
|
end
|
564
|
-
elsif accept_external_id and @source.match(/#{system}/um)
|
565
|
-
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
718
|
+
elsif accept_external_id and @source.match?(/#{system}/um)
|
719
|
+
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
566
720
|
return "system literal is missing"
|
567
721
|
end
|
568
|
-
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
|
722
|
+
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
|
569
723
|
return "invalid system literal"
|
570
724
|
end
|
571
725
|
"garbage after system literal"
|
572
726
|
else
|
573
|
-
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
727
|
+
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
574
728
|
return "invalid ID type"
|
575
729
|
end
|
576
730
|
"ID type is missing"
|
@@ -578,96 +732,114 @@ module REXML
|
|
578
732
|
end
|
579
733
|
|
580
734
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
735
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
736
|
+
if @source.match?(/\s+/um, true)
|
737
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
738
|
+
unless match_data
|
739
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
740
|
+
end
|
741
|
+
content = match_data[1]
|
742
|
+
else
|
743
|
+
content = nil
|
744
|
+
unless @source.match?("?>", true)
|
745
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
746
|
+
end
|
585
747
|
end
|
586
|
-
|
748
|
+
if name == "xml"
|
749
|
+
if @document_status
|
750
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
751
|
+
end
|
752
|
+
version = VERSION.match(content)
|
753
|
+
version = version[1] unless version.nil?
|
754
|
+
encoding = ENCODING.match(content)
|
755
|
+
encoding = encoding[1] unless encoding.nil?
|
756
|
+
if need_source_encoding_update?(encoding)
|
757
|
+
@source.encoding = encoding
|
758
|
+
end
|
759
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
760
|
+
encoding = "UTF-16"
|
761
|
+
end
|
762
|
+
standalone = STANDALONE.match(content)
|
763
|
+
standalone = standalone[1] unless standalone.nil?
|
764
|
+
return [ :xmldecl, version, encoding, standalone ]
|
765
|
+
end
|
766
|
+
[:processing_instruction, name, content]
|
587
767
|
end
|
588
768
|
|
589
|
-
def parse_attributes(prefixes
|
769
|
+
def parse_attributes(prefixes)
|
590
770
|
attributes = {}
|
771
|
+
expanded_names = {}
|
591
772
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
773
|
+
while true
|
774
|
+
if @source.match?(">", true)
|
775
|
+
return attributes, closed
|
776
|
+
elsif @source.match?("/>", true)
|
777
|
+
closed = true
|
778
|
+
return attributes, closed
|
779
|
+
elsif match = @source.match(QNAME, true)
|
780
|
+
name = match[1]
|
781
|
+
prefix = match[2]
|
782
|
+
local_part = match[3]
|
783
|
+
|
784
|
+
unless @source.match?(/\s*=\s*/um, true)
|
618
785
|
message = "Missing attribute equal: <#{name}>"
|
619
786
|
raise REXML::ParseException.new(message, @source)
|
620
787
|
end
|
621
|
-
|
622
|
-
unless quote
|
788
|
+
unless match = @source.match(/(['"])/, true)
|
623
789
|
message = "Missing attribute value start quote: <#{name}>"
|
624
790
|
raise REXML::ParseException.new(message, @source)
|
625
791
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
792
|
+
quote = match[1]
|
793
|
+
start_position = @source.position
|
794
|
+
value = @source.read_until(quote)
|
795
|
+
unless value.chomp!(quote)
|
796
|
+
@source.position = start_position
|
797
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
798
|
raise REXML::ParseException.new(message, @source)
|
639
799
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
800
|
+
@source.match?(/\s*/um, true)
|
801
|
+
if prefix == "xmlns"
|
802
|
+
if local_part == "xml"
|
803
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
804
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
805
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
806
|
+
raise REXML::ParseException.new( msg, @source, self )
|
807
|
+
end
|
808
|
+
elsif local_part == "xmlns"
|
809
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
810
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
811
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
812
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
813
|
+
add_namespace(local_part, value)
|
814
|
+
elsif prefix
|
815
|
+
prefixes << prefix unless prefix == "xml"
|
657
816
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
817
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
818
|
+
if attributes[name]
|
819
|
+
msg = "Duplicate attribute #{name.inspect}"
|
820
|
+
raise REXML::ParseException.new(msg, @source, self)
|
821
|
+
end
|
667
822
|
|
668
|
-
|
823
|
+
unless prefix == "xmlns"
|
824
|
+
uri = @namespaces[prefix]
|
825
|
+
expanded_name = [uri, local_part]
|
826
|
+
existing_prefix = expanded_names[expanded_name]
|
827
|
+
if existing_prefix
|
828
|
+
message = "Namespace conflict in adding attribute " +
|
829
|
+
"\"#{local_part}\": " +
|
830
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
831
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
832
|
+
raise REXML::ParseException.new(message, @source, self)
|
833
|
+
end
|
834
|
+
expanded_names[expanded_name] = prefix
|
835
|
+
end
|
836
|
+
|
837
|
+
attributes[name] = value
|
838
|
+
else
|
839
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
840
|
+
raise REXML::ParseException.new(message, @source)
|
841
|
+
end
|
669
842
|
end
|
670
|
-
return attributes, closed
|
671
843
|
end
|
672
844
|
end
|
673
845
|
end
|