rexml 3.2.6 → 3.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +370 -0
- data/lib/rexml/attribute.rb +3 -2
- data/lib/rexml/document.rb +5 -1
- data/lib/rexml/element.rb +16 -31
- data/lib/rexml/entity.rb +9 -48
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +433 -265
- data/lib/rexml/parsers/pullparser.rb +12 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +134 -98
- data/lib/rexml/text.rb +39 -17
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +6 -50
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +124,7 @@ module REXML
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
128
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +140,34 @@ module REXML
|
|
112
140
|
"apos" => [/'/, "'", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
170
|
+
@source.ensure_buffer
|
118
171
|
end
|
119
172
|
|
120
173
|
def add_listener( listener )
|
@@ -122,15 +175,20 @@ module REXML
|
|
122
175
|
end
|
123
176
|
|
124
177
|
attr_reader :source
|
178
|
+
attr_reader :entity_expansion_count
|
179
|
+
attr_writer :entity_expansion_limit
|
180
|
+
attr_writer :entity_expansion_text_limit
|
125
181
|
|
126
182
|
def stream=( source )
|
127
183
|
@source = SourceFactory.create_from( source )
|
128
184
|
@closed = nil
|
185
|
+
@have_root = false
|
129
186
|
@document_status = nil
|
130
187
|
@tags = []
|
131
188
|
@stack = []
|
132
189
|
@entities = []
|
133
|
-
@
|
190
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
191
|
+
@namespaces_restore_stack = []
|
134
192
|
end
|
135
193
|
|
136
194
|
def position
|
@@ -180,6 +238,8 @@ module REXML
|
|
180
238
|
|
181
239
|
# Returns the next event. This is a +PullEvent+ object.
|
182
240
|
def pull
|
241
|
+
@source.drop_parsed_content
|
242
|
+
|
183
243
|
pull_event.tap do |event|
|
184
244
|
@listeners.each do |listener|
|
185
245
|
listener.receive event
|
@@ -192,236 +252,274 @@ module REXML
|
|
192
252
|
x, @closed = @closed, nil
|
193
253
|
return [ :end_element, x ]
|
194
254
|
end
|
195
|
-
|
255
|
+
if empty?
|
256
|
+
if @document_status == :in_doctype
|
257
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
258
|
+
end
|
259
|
+
unless @tags.empty?
|
260
|
+
path = "/" + @tags.join("/")
|
261
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
262
|
+
end
|
263
|
+
return [ :end_document ]
|
264
|
+
end
|
196
265
|
return @stack.shift if @stack.size > 0
|
197
266
|
#STDERR.puts @source.encoding
|
198
267
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
268
|
+
|
269
|
+
@source.ensure_buffer
|
199
270
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
203
|
-
case word
|
204
|
-
when COMMENT_START
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
206
|
-
when XMLDECL_START
|
207
|
-
#STDERR.puts "XMLDECL"
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
209
|
-
version = VERSION.match( results )
|
210
|
-
version = version[1] unless version.nil?
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
271
|
+
start_position = @source.position
|
272
|
+
if @source.match("<?", true)
|
223
273
|
return process_instruction
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
274
|
+
elsif @source.match("<!", true)
|
275
|
+
if @source.match("--", true)
|
276
|
+
md = @source.match(/(.*?)-->/um, true)
|
277
|
+
if md.nil?
|
278
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
242
279
|
end
|
243
|
-
if
|
280
|
+
if /--|-\z/.match?(md[1])
|
281
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
282
|
+
end
|
283
|
+
return [ :comment, md[1] ]
|
284
|
+
elsif @source.match("DOCTYPE", true)
|
285
|
+
base_error_message = "Malformed DOCTYPE"
|
286
|
+
unless @source.match(/\s+/um, true)
|
287
|
+
if @source.match(">")
|
288
|
+
message = "#{base_error_message}: name is missing"
|
289
|
+
else
|
290
|
+
message = "#{base_error_message}: invalid name"
|
291
|
+
end
|
292
|
+
@source.position = start_position
|
293
|
+
raise REXML::ParseException.new(message, @source)
|
294
|
+
end
|
295
|
+
name = parse_name(base_error_message)
|
296
|
+
if @source.match(/\s*\[/um, true)
|
297
|
+
id = [nil, nil, nil]
|
244
298
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
299
|
+
elsif @source.match(/\s*>/um, true)
|
300
|
+
id = [nil, nil, nil]
|
246
301
|
@document_status = :after_doctype
|
302
|
+
@source.ensure_buffer
|
247
303
|
else
|
248
|
-
|
249
|
-
|
304
|
+
id = parse_id(base_error_message,
|
305
|
+
accept_external_id: true,
|
306
|
+
accept_public_id: false)
|
307
|
+
if id[0] == "SYSTEM"
|
308
|
+
# For backward compatibility
|
309
|
+
id[1], id[2] = id[2], nil
|
310
|
+
end
|
311
|
+
if @source.match(/\s*\[/um, true)
|
312
|
+
@document_status = :in_doctype
|
313
|
+
elsif @source.match(/\s*>/um, true)
|
314
|
+
@document_status = :after_doctype
|
315
|
+
@source.ensure_buffer
|
316
|
+
else
|
317
|
+
message = "#{base_error_message}: garbage after external ID"
|
318
|
+
raise REXML::ParseException.new(message, @source)
|
319
|
+
end
|
250
320
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
321
|
+
args = [:start_doctype, name, *id]
|
322
|
+
if @document_status == :after_doctype
|
323
|
+
@source.match(/\s*/um, true)
|
324
|
+
@stack << [ :end_doctype ]
|
325
|
+
end
|
326
|
+
return args
|
327
|
+
else
|
328
|
+
message = "Invalid XML"
|
329
|
+
raise REXML::ParseException.new(message, @source)
|
263
330
|
end
|
264
331
|
end
|
265
332
|
end
|
266
333
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
match[0] = :entitydecl
|
279
|
-
ref = false
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
334
|
+
@source.match(/\s*/um, true) # skip spaces
|
335
|
+
start_position = @source.position
|
336
|
+
if @source.match("<!", true)
|
337
|
+
if @source.match("ELEMENT", true)
|
338
|
+
md = @source.match(/(.*?)>/um, true)
|
339
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
340
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
341
|
+
elsif @source.match("ENTITY", true)
|
342
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
343
|
+
unless match_data
|
344
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
320
345
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
346
|
+
match = [:entitydecl, *match_data.captures.compact]
|
347
|
+
ref = false
|
348
|
+
if match[1] == '%'
|
349
|
+
ref = true
|
350
|
+
match.delete_at 1
|
351
|
+
end
|
352
|
+
# Now we have to sort out what kind of entity reference this is
|
353
|
+
if match[2] == 'SYSTEM'
|
354
|
+
# External reference
|
355
|
+
match[3] = match[3][1..-2] # PUBID
|
356
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
357
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
358
|
+
elsif match[2] == 'PUBLIC'
|
359
|
+
# External reference
|
360
|
+
match[3] = match[3][1..-2] # PUBID
|
361
|
+
match[4] = match[4][1..-2] # HREF
|
362
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
363
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
364
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
365
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
328
366
|
else
|
329
|
-
|
367
|
+
match[2] = match[2][1..-2]
|
368
|
+
match.pop if match.size == 4
|
369
|
+
# match is [ :entity, name, value ]
|
330
370
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
371
|
+
match << '%' if ref
|
372
|
+
return match
|
373
|
+
elsif @source.match("ATTLIST", true)
|
374
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
375
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
376
|
+
element = md[1]
|
377
|
+
contents = md[0]
|
378
|
+
|
379
|
+
pairs = {}
|
380
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
381
|
+
values.each do |attdef|
|
382
|
+
unless attdef[3] == "#IMPLIED"
|
383
|
+
attdef.compact!
|
384
|
+
val = attdef[3]
|
385
|
+
val = attdef[4] if val == "#FIXED "
|
386
|
+
pairs[attdef[0]] = val
|
387
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
388
|
+
@namespaces[$1] = val
|
389
|
+
end
|
390
|
+
end
|
391
|
+
end
|
392
|
+
return [ :attlistdecl, element, pairs, contents ]
|
393
|
+
elsif @source.match("NOTATION", true)
|
394
|
+
base_error_message = "Malformed notation declaration"
|
395
|
+
unless @source.match(/\s+/um, true)
|
396
|
+
if @source.match(">")
|
397
|
+
message = "#{base_error_message}: name is missing"
|
398
|
+
else
|
399
|
+
message = "#{base_error_message}: invalid name"
|
400
|
+
end
|
401
|
+
@source.position = start_position
|
402
|
+
raise REXML::ParseException.new(message, @source)
|
403
|
+
end
|
404
|
+
name = parse_name(base_error_message)
|
405
|
+
id = parse_id(base_error_message,
|
406
|
+
accept_external_id: true,
|
407
|
+
accept_public_id: true)
|
408
|
+
unless @source.match(/\s*>/um, true)
|
409
|
+
message = "#{base_error_message}: garbage before end >"
|
410
|
+
raise REXML::ParseException.new(message, @source)
|
411
|
+
end
|
412
|
+
return [:notationdecl, name, *id]
|
413
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
414
|
+
case md[1]
|
415
|
+
when /--/, /-\z/
|
416
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
417
|
+
end
|
418
|
+
return [ :comment, md[1] ] if md
|
340
419
|
end
|
341
|
-
|
342
|
-
|
420
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
421
|
+
return [ :externalentity, match[1] ]
|
422
|
+
elsif @source.match(/\]\s*>/um, true)
|
343
423
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
424
|
return [ :end_doctype ]
|
346
425
|
end
|
426
|
+
if @document_status == :in_doctype
|
427
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
428
|
+
end
|
347
429
|
end
|
348
430
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
431
|
+
@source.match(/\s*/um, true)
|
350
432
|
end
|
351
433
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
355
|
-
|
434
|
+
start_position = @source.position
|
435
|
+
if @source.match("<", true)
|
436
|
+
# :text's read_until may remain only "<" in buffer. In the
|
437
|
+
# case, buffer is empty here. So we need to fill buffer
|
438
|
+
# here explicitly.
|
439
|
+
@source.ensure_buffer
|
440
|
+
if @source.match("/", true)
|
441
|
+
@namespaces_restore_stack.pop
|
356
442
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
443
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
444
|
if md and !last_tag
|
359
445
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
446
|
raise REXML::ParseException.new(message, @source)
|
361
447
|
end
|
362
448
|
if md.nil? or last_tag != md[1]
|
363
449
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
450
|
+
message += " (got '#{md[1]}')" if md
|
451
|
+
@source.position = start_position if md.nil?
|
365
452
|
raise REXML::ParseException.new(message, @source)
|
366
453
|
end
|
367
454
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
455
|
+
elsif @source.match("!", true)
|
456
|
+
md = @source.match(/([^>]*>)/um)
|
370
457
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
458
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
459
|
+
if md[0][0] == ?-
|
460
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
461
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
462
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
463
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
464
|
end
|
379
465
|
|
380
|
-
return [ :comment, md[1] ]
|
466
|
+
return [ :comment, md[1] ]
|
381
467
|
else
|
382
|
-
md = @source.match(
|
468
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
469
|
return [ :cdata, md[1] ] if md
|
384
470
|
end
|
385
471
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
472
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
473
|
+
elsif @source.match("?", true)
|
388
474
|
return process_instruction
|
389
475
|
else
|
390
476
|
# Get the next tag
|
391
|
-
md = @source.match(
|
477
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
478
|
unless md
|
479
|
+
@source.position = start_position
|
393
480
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
481
|
end
|
482
|
+
tag = md[1]
|
395
483
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
398
|
-
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
484
|
+
@prefixes.clear
|
485
|
+
@prefixes << md[2] if md[2]
|
486
|
+
push_namespaces_restore
|
487
|
+
attributes, closed = parse_attributes(@prefixes)
|
400
488
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
402
|
-
unless @
|
489
|
+
for prefix in @prefixes
|
490
|
+
unless @namespaces.key?(prefix)
|
403
491
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
492
|
end
|
405
493
|
end
|
406
494
|
|
407
495
|
if closed
|
408
|
-
@closed =
|
409
|
-
|
496
|
+
@closed = tag
|
497
|
+
pop_namespaces_restore
|
410
498
|
else
|
411
|
-
@tags.
|
499
|
+
if @tags.empty? and @have_root
|
500
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
501
|
+
end
|
502
|
+
@tags.push( tag )
|
412
503
|
end
|
413
|
-
|
504
|
+
@have_root = true
|
505
|
+
return [ :start_element, tag, attributes ]
|
414
506
|
end
|
415
507
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
508
|
+
text = @source.read_until("<")
|
509
|
+
if text.chomp!("<")
|
510
|
+
@source.position -= "<".bytesize
|
511
|
+
end
|
512
|
+
if @tags.empty?
|
513
|
+
unless /\A\s*\z/.match?(text)
|
514
|
+
if @have_root
|
515
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
516
|
+
else
|
517
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
518
|
+
end
|
519
|
+
end
|
520
|
+
return pull_event if @have_root
|
419
521
|
end
|
420
|
-
|
421
|
-
#return [ :text, "" ] if md[0].length == 0
|
422
|
-
# unnormalized = Text::unnormalize( md[1], self )
|
423
|
-
# return PullEvent.new( :text, md[1], unnormalized )
|
424
|
-
return [ :text, md[1] ]
|
522
|
+
return [ :text, text ]
|
425
523
|
end
|
426
524
|
rescue REXML::UndefinedNamespaceException
|
427
525
|
raise
|
@@ -436,13 +534,13 @@ module REXML
|
|
436
534
|
private :pull_event
|
437
535
|
|
438
536
|
def entity( reference, entities )
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
unnormalize( value, entities )
|
537
|
+
return unless entities
|
538
|
+
|
539
|
+
value = entities[ reference ]
|
540
|
+
return if value.nil?
|
541
|
+
|
542
|
+
record_entity_expansion
|
543
|
+
unnormalize( value, entities )
|
446
544
|
end
|
447
545
|
|
448
546
|
# Escapes all possible entities
|
@@ -463,35 +561,87 @@ module REXML
|
|
463
561
|
|
464
562
|
# Unescapes all possible entities
|
465
563
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
564
|
+
if string.include?("\r")
|
565
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
566
|
+
else
|
567
|
+
rv = string.dup
|
568
|
+
end
|
468
569
|
matches = rv.scan( REFERENCE_RE )
|
469
570
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
571
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
572
|
m=$1
|
472
|
-
|
473
|
-
|
573
|
+
if m.start_with?("x")
|
574
|
+
code_point = Integer(m[1..-1], 16)
|
575
|
+
else
|
576
|
+
code_point = Integer(m, 10)
|
577
|
+
end
|
578
|
+
[code_point].pack('U*')
|
474
579
|
}
|
475
580
|
matches.collect!{|x|x[0]}.compact!
|
581
|
+
if filter
|
582
|
+
matches.reject! do |entity_reference|
|
583
|
+
filter.include?(entity_reference)
|
584
|
+
end
|
585
|
+
end
|
476
586
|
if matches.size > 0
|
477
|
-
matches.each do |entity_reference|
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
587
|
+
matches.tally.each do |entity_reference, n|
|
588
|
+
entity_expansion_count_before = @entity_expansion_count
|
589
|
+
entity_value = entity( entity_reference, entities )
|
590
|
+
if entity_value
|
591
|
+
if n > 1
|
592
|
+
entity_expansion_count_delta =
|
593
|
+
@entity_expansion_count - entity_expansion_count_before
|
594
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
595
|
+
end
|
596
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
597
|
+
rv.gsub!( re, entity_value )
|
598
|
+
if rv.bytesize > @entity_expansion_text_limit
|
599
|
+
raise "entity expansion has grown too large"
|
486
600
|
end
|
601
|
+
else
|
602
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
603
|
+
rv.gsub!( er[0], er[2] ) if er
|
487
604
|
end
|
488
605
|
end
|
489
|
-
rv.gsub!(
|
606
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
607
|
end
|
491
608
|
rv
|
492
609
|
end
|
493
610
|
|
494
611
|
private
|
612
|
+
def add_namespace(prefix, uri)
|
613
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
614
|
+
if uri.nil?
|
615
|
+
@namespaces.delete(prefix)
|
616
|
+
else
|
617
|
+
@namespaces[prefix] = uri
|
618
|
+
end
|
619
|
+
end
|
620
|
+
|
621
|
+
def push_namespaces_restore
|
622
|
+
namespaces_restore = {}
|
623
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
624
|
+
namespaces_restore
|
625
|
+
end
|
626
|
+
|
627
|
+
def pop_namespaces_restore
|
628
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
629
|
+
namespaces_restore.each do |prefix, uri|
|
630
|
+
if uri.nil?
|
631
|
+
@namespaces.delete(prefix)
|
632
|
+
else
|
633
|
+
@namespaces[prefix] = uri
|
634
|
+
end
|
635
|
+
end
|
636
|
+
end
|
637
|
+
|
638
|
+
def record_entity_expansion(delta=1)
|
639
|
+
@entity_expansion_count += delta
|
640
|
+
if @entity_expansion_count > @entity_expansion_limit
|
641
|
+
raise "number of entity expansions exceeded, processing aborted."
|
642
|
+
end
|
643
|
+
end
|
644
|
+
|
495
645
|
def need_source_encoding_update?(xml_declaration_encoding)
|
496
646
|
return false if xml_declaration_encoding.nil?
|
497
647
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
@@ -499,16 +649,16 @@ module REXML
|
|
499
649
|
end
|
500
650
|
|
501
651
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
652
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
653
|
unless md
|
504
|
-
if @source.match(/\
|
654
|
+
if @source.match(/\S/um)
|
505
655
|
message = "#{base_error_message}: invalid name"
|
506
656
|
else
|
507
657
|
message = "#{base_error_message}: name is missing"
|
508
658
|
end
|
509
659
|
raise REXML::ParseException.new(message, @source)
|
510
660
|
end
|
511
|
-
md[
|
661
|
+
md[0]
|
512
662
|
end
|
513
663
|
|
514
664
|
def parse_id(base_error_message,
|
@@ -578,96 +728,114 @@ module REXML
|
|
578
728
|
end
|
579
729
|
|
580
730
|
def process_instruction
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
731
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
732
|
+
if @source.match(/\s+/um, true)
|
733
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
734
|
+
unless match_data
|
735
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
736
|
+
end
|
737
|
+
content = match_data[1]
|
738
|
+
else
|
739
|
+
content = nil
|
740
|
+
unless @source.match("?>", true)
|
741
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
742
|
+
end
|
743
|
+
end
|
744
|
+
if name == "xml"
|
745
|
+
if @document_status
|
746
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
747
|
+
end
|
748
|
+
version = VERSION.match(content)
|
749
|
+
version = version[1] unless version.nil?
|
750
|
+
encoding = ENCODING.match(content)
|
751
|
+
encoding = encoding[1] unless encoding.nil?
|
752
|
+
if need_source_encoding_update?(encoding)
|
753
|
+
@source.encoding = encoding
|
754
|
+
end
|
755
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
756
|
+
encoding = "UTF-16"
|
757
|
+
end
|
758
|
+
standalone = STANDALONE.match(content)
|
759
|
+
standalone = standalone[1] unless standalone.nil?
|
760
|
+
return [ :xmldecl, version, encoding, standalone ]
|
585
761
|
end
|
586
|
-
[:processing_instruction,
|
762
|
+
[:processing_instruction, name, content]
|
587
763
|
end
|
588
764
|
|
589
|
-
def parse_attributes(prefixes
|
765
|
+
def parse_attributes(prefixes)
|
590
766
|
attributes = {}
|
767
|
+
expanded_names = {}
|
591
768
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
769
|
+
while true
|
770
|
+
if @source.match(">", true)
|
771
|
+
return attributes, closed
|
772
|
+
elsif @source.match("/>", true)
|
773
|
+
closed = true
|
774
|
+
return attributes, closed
|
775
|
+
elsif match = @source.match(QNAME, true)
|
776
|
+
name = match[1]
|
777
|
+
prefix = match[2]
|
778
|
+
local_part = match[3]
|
779
|
+
|
780
|
+
unless @source.match(/\s*=\s*/um, true)
|
618
781
|
message = "Missing attribute equal: <#{name}>"
|
619
782
|
raise REXML::ParseException.new(message, @source)
|
620
783
|
end
|
621
|
-
|
622
|
-
unless quote
|
784
|
+
unless match = @source.match(/(['"])/, true)
|
623
785
|
message = "Missing attribute value start quote: <#{name}>"
|
624
786
|
raise REXML::ParseException.new(message, @source)
|
625
787
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
788
|
+
quote = match[1]
|
789
|
+
start_position = @source.position
|
790
|
+
value = @source.read_until(quote)
|
791
|
+
unless value.chomp!(quote)
|
792
|
+
@source.position = start_position
|
793
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
794
|
raise REXML::ParseException.new(message, @source)
|
639
795
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
796
|
+
@source.match(/\s*/um, true)
|
797
|
+
if prefix == "xmlns"
|
798
|
+
if local_part == "xml"
|
799
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
800
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
801
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
802
|
+
raise REXML::ParseException.new( msg, @source, self )
|
803
|
+
end
|
804
|
+
elsif local_part == "xmlns"
|
805
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
806
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
807
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
808
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
809
|
+
add_namespace(local_part, value)
|
810
|
+
elsif prefix
|
811
|
+
prefixes << prefix unless prefix == "xml"
|
657
812
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
813
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
814
|
+
if attributes[name]
|
815
|
+
msg = "Duplicate attribute #{name.inspect}"
|
816
|
+
raise REXML::ParseException.new(msg, @source, self)
|
817
|
+
end
|
667
818
|
|
668
|
-
|
819
|
+
unless prefix == "xmlns"
|
820
|
+
uri = @namespaces[prefix]
|
821
|
+
expanded_name = [uri, local_part]
|
822
|
+
existing_prefix = expanded_names[expanded_name]
|
823
|
+
if existing_prefix
|
824
|
+
message = "Namespace conflict in adding attribute " +
|
825
|
+
"\"#{local_part}\": " +
|
826
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
827
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
828
|
+
raise REXML::ParseException.new(message, @source, self)
|
829
|
+
end
|
830
|
+
expanded_names[expanded_name] = prefix
|
831
|
+
end
|
832
|
+
|
833
|
+
attributes[name] = value
|
834
|
+
else
|
835
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
836
|
+
raise REXML::ParseException.new(message, @source)
|
837
|
+
end
|
669
838
|
end
|
670
|
-
return attributes, closed
|
671
839
|
end
|
672
840
|
end
|
673
841
|
end
|