rexml 3.2.6 → 3.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +328 -0
- data/lib/rexml/attribute.rb +3 -2
- data/lib/rexml/document.rb +5 -1
- data/lib/rexml/element.rb +16 -31
- data/lib/rexml/entity.rb +9 -48
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +425 -263
- data/lib/rexml/parsers/pullparser.rb +12 -0
- data/lib/rexml/parsers/sax2parser.rb +14 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +128 -98
- data/lib/rexml/text.rb +39 -17
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +6 -50
|
@@ -1,12 +1,40 @@
|
|
|
1
|
-
# frozen_string_literal: false
|
|
1
|
+
# frozen_string_literal: true
|
|
2
2
|
require_relative '../parseexception'
|
|
3
3
|
require_relative '../undefinednamespaceexception'
|
|
4
|
+
require_relative '../security'
|
|
4
5
|
require_relative '../source'
|
|
5
6
|
require 'set'
|
|
6
7
|
require "strscan"
|
|
7
8
|
|
|
8
9
|
module REXML
|
|
9
10
|
module Parsers
|
|
11
|
+
unless [].respond_to?(:tally)
|
|
12
|
+
module EnumerableTally
|
|
13
|
+
refine Enumerable do
|
|
14
|
+
def tally
|
|
15
|
+
counts = {}
|
|
16
|
+
each do |item|
|
|
17
|
+
counts[item] ||= 0
|
|
18
|
+
counts[item] += 1
|
|
19
|
+
end
|
|
20
|
+
counts
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
using EnumerableTally
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
if StringScanner::Version < "3.0.8"
|
|
28
|
+
module StringScannerCaptures
|
|
29
|
+
refine StringScanner do
|
|
30
|
+
def captures
|
|
31
|
+
values_at(*(1...size))
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
using StringScannerCaptures
|
|
36
|
+
end
|
|
37
|
+
|
|
10
38
|
# = Using the Pull Parser
|
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
|
@@ -96,7 +124,7 @@ module REXML
|
|
|
96
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
|
97
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
|
98
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
|
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
|
100
128
|
|
|
101
129
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
|
102
130
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
|
@@ -112,9 +140,32 @@ module REXML
|
|
|
112
140
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
|
113
141
|
}
|
|
114
142
|
|
|
143
|
+
module Private
|
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
|
153
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
|
156
|
+
default_entities.each do |term|
|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
private_constant :Private
|
|
161
|
+
|
|
115
162
|
def initialize( source )
|
|
116
163
|
self.stream = source
|
|
117
164
|
@listeners = []
|
|
165
|
+
@prefixes = Set.new
|
|
166
|
+
@entity_expansion_count = 0
|
|
167
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
|
168
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
|
118
169
|
end
|
|
119
170
|
|
|
120
171
|
def add_listener( listener )
|
|
@@ -122,15 +173,20 @@ module REXML
|
|
|
122
173
|
end
|
|
123
174
|
|
|
124
175
|
attr_reader :source
|
|
176
|
+
attr_reader :entity_expansion_count
|
|
177
|
+
attr_writer :entity_expansion_limit
|
|
178
|
+
attr_writer :entity_expansion_text_limit
|
|
125
179
|
|
|
126
180
|
def stream=( source )
|
|
127
181
|
@source = SourceFactory.create_from( source )
|
|
128
182
|
@closed = nil
|
|
183
|
+
@have_root = false
|
|
129
184
|
@document_status = nil
|
|
130
185
|
@tags = []
|
|
131
186
|
@stack = []
|
|
132
187
|
@entities = []
|
|
133
|
-
@nsstack = []
|
|
188
|
+
@namespaces = {}
|
|
189
|
+
@namespaces_restore_stack = []
|
|
134
190
|
end
|
|
135
191
|
|
|
136
192
|
def position
|
|
@@ -180,6 +236,8 @@ module REXML
|
|
|
180
236
|
|
|
181
237
|
# Returns the next event. This is a +PullEvent+ object.
|
|
182
238
|
def pull
|
|
239
|
+
@source.drop_parsed_content
|
|
240
|
+
|
|
183
241
|
pull_event.tap do |event|
|
|
184
242
|
@listeners.each do |listener|
|
|
185
243
|
listener.receive event
|
|
@@ -192,236 +250,274 @@ module REXML
|
|
|
192
250
|
x, @closed = @closed, nil
|
|
193
251
|
return [ :end_element, x ]
|
|
194
252
|
end
|
|
195
|
-
|
|
253
|
+
if empty?
|
|
254
|
+
if @document_status == :in_doctype
|
|
255
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
|
256
|
+
end
|
|
257
|
+
unless @tags.empty?
|
|
258
|
+
path = "/" + @tags.join("/")
|
|
259
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
|
260
|
+
end
|
|
261
|
+
return [ :end_document ]
|
|
262
|
+
end
|
|
196
263
|
return @stack.shift if @stack.size > 0
|
|
197
264
|
#STDERR.puts @source.encoding
|
|
198
265
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
|
266
|
+
|
|
267
|
+
@source.ensure_buffer
|
|
199
268
|
if @document_status == nil
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
|
203
|
-
case word
|
|
204
|
-
when COMMENT_START
|
|
205
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
|
206
|
-
when XMLDECL_START
|
|
207
|
-
#STDERR.puts "XMLDECL"
|
|
208
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
|
209
|
-
version = VERSION.match( results )
|
|
210
|
-
version = version[1] unless version.nil?
|
|
211
|
-
encoding = ENCODING.match(results)
|
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
|
213
|
-
if need_source_encoding_update?(encoding)
|
|
214
|
-
@source.encoding = encoding
|
|
215
|
-
end
|
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
|
217
|
-
encoding = "UTF-16"
|
|
218
|
-
end
|
|
219
|
-
standalone = STANDALONE.match(results)
|
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
|
222
|
-
when INSTRUCTION_START
|
|
269
|
+
start_position = @source.position
|
|
270
|
+
if @source.match("<?", true)
|
|
223
271
|
return process_instruction
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
|
230
|
-
id = [nil, nil, nil]
|
|
231
|
-
@document_status = :in_doctype
|
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
|
233
|
-
id = [nil, nil, nil]
|
|
234
|
-
@document_status = :after_doctype
|
|
235
|
-
else
|
|
236
|
-
id = parse_id(base_error_message,
|
|
237
|
-
accept_external_id: true,
|
|
238
|
-
accept_public_id: false)
|
|
239
|
-
if id[0] == "SYSTEM"
|
|
240
|
-
# For backward compatibility
|
|
241
|
-
id[1], id[2] = id[2], nil
|
|
272
|
+
elsif @source.match("<!", true)
|
|
273
|
+
if @source.match("--", true)
|
|
274
|
+
md = @source.match(/(.*?)-->/um, true)
|
|
275
|
+
if md.nil?
|
|
276
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
|
242
277
|
end
|
|
243
|
-
if
|
|
278
|
+
if /--|-\z/.match?(md[1])
|
|
279
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
|
280
|
+
end
|
|
281
|
+
return [ :comment, md[1] ]
|
|
282
|
+
elsif @source.match("DOCTYPE", true)
|
|
283
|
+
base_error_message = "Malformed DOCTYPE"
|
|
284
|
+
unless @source.match(/\s+/um, true)
|
|
285
|
+
if @source.match(">")
|
|
286
|
+
message = "#{base_error_message}: name is missing"
|
|
287
|
+
else
|
|
288
|
+
message = "#{base_error_message}: invalid name"
|
|
289
|
+
end
|
|
290
|
+
@source.position = start_position
|
|
291
|
+
raise REXML::ParseException.new(message, @source)
|
|
292
|
+
end
|
|
293
|
+
name = parse_name(base_error_message)
|
|
294
|
+
if @source.match(/\s*\[/um, true)
|
|
295
|
+
id = [nil, nil, nil]
|
|
244
296
|
@document_status = :in_doctype
|
|
245
|
-
elsif @source.match(/\
|
|
297
|
+
elsif @source.match(/\s*>/um, true)
|
|
298
|
+
id = [nil, nil, nil]
|
|
246
299
|
@document_status = :after_doctype
|
|
300
|
+
@source.ensure_buffer
|
|
247
301
|
else
|
|
248
|
-
|
|
249
|
-
|
|
302
|
+
id = parse_id(base_error_message,
|
|
303
|
+
accept_external_id: true,
|
|
304
|
+
accept_public_id: false)
|
|
305
|
+
if id[0] == "SYSTEM"
|
|
306
|
+
# For backward compatibility
|
|
307
|
+
id[1], id[2] = id[2], nil
|
|
308
|
+
end
|
|
309
|
+
if @source.match(/\s*\[/um, true)
|
|
310
|
+
@document_status = :in_doctype
|
|
311
|
+
elsif @source.match(/\s*>/um, true)
|
|
312
|
+
@document_status = :after_doctype
|
|
313
|
+
@source.ensure_buffer
|
|
314
|
+
else
|
|
315
|
+
message = "#{base_error_message}: garbage after external ID"
|
|
316
|
+
raise REXML::ParseException.new(message, @source)
|
|
317
|
+
end
|
|
250
318
|
end
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
@document_status = :after_doctype
|
|
261
|
-
if @source.encoding == "UTF-8"
|
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
|
319
|
+
args = [:start_doctype, name, *id]
|
|
320
|
+
if @document_status == :after_doctype
|
|
321
|
+
@source.match(/\s*/um, true)
|
|
322
|
+
@stack << [ :end_doctype ]
|
|
323
|
+
end
|
|
324
|
+
return args
|
|
325
|
+
else
|
|
326
|
+
message = "Invalid XML"
|
|
327
|
+
raise REXML::ParseException.new(message, @source)
|
|
263
328
|
end
|
|
264
329
|
end
|
|
265
330
|
end
|
|
266
331
|
if @document_status == :in_doctype
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
match[0] = :entitydecl
|
|
279
|
-
ref = false
|
|
280
|
-
if match[1] == '%'
|
|
281
|
-
ref = true
|
|
282
|
-
match.delete_at 1
|
|
283
|
-
end
|
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
|
285
|
-
if match[2] == 'SYSTEM'
|
|
286
|
-
# External reference
|
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
|
290
|
-
elsif match[2] == 'PUBLIC'
|
|
291
|
-
# External reference
|
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
|
296
|
-
else
|
|
297
|
-
match[2] = match[2][1..-2]
|
|
298
|
-
match.pop if match.size == 4
|
|
299
|
-
# match is [ :entity, name, value ]
|
|
300
|
-
end
|
|
301
|
-
match << '%' if ref
|
|
302
|
-
return match
|
|
303
|
-
when ATTLISTDECL_START
|
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
|
306
|
-
element = md[1]
|
|
307
|
-
contents = md[0]
|
|
308
|
-
|
|
309
|
-
pairs = {}
|
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
|
311
|
-
values.each do |attdef|
|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
|
313
|
-
attdef.compact!
|
|
314
|
-
val = attdef[3]
|
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
|
316
|
-
pairs[attdef[0]] = val
|
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
|
318
|
-
@nsstack[0] << $1
|
|
319
|
-
end
|
|
332
|
+
@source.match(/\s*/um, true) # skip spaces
|
|
333
|
+
start_position = @source.position
|
|
334
|
+
if @source.match("<!", true)
|
|
335
|
+
if @source.match("ELEMENT", true)
|
|
336
|
+
md = @source.match(/(.*?)>/um, true)
|
|
337
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
|
338
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
|
339
|
+
elsif @source.match("ENTITY", true)
|
|
340
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
|
341
|
+
unless match_data
|
|
342
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
|
320
343
|
end
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
344
|
+
match = [:entitydecl, *match_data.captures.compact]
|
|
345
|
+
ref = false
|
|
346
|
+
if match[1] == '%'
|
|
347
|
+
ref = true
|
|
348
|
+
match.delete_at 1
|
|
349
|
+
end
|
|
350
|
+
# Now we have to sort out what kind of entity reference this is
|
|
351
|
+
if match[2] == 'SYSTEM'
|
|
352
|
+
# External reference
|
|
353
|
+
match[3] = match[3][1..-2] # PUBID
|
|
354
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
|
355
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
|
356
|
+
elsif match[2] == 'PUBLIC'
|
|
357
|
+
# External reference
|
|
358
|
+
match[3] = match[3][1..-2] # PUBID
|
|
359
|
+
match[4] = match[4][1..-2] # HREF
|
|
360
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
|
361
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
|
362
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
|
363
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
|
328
364
|
else
|
|
329
|
-
|
|
365
|
+
match[2] = match[2][1..-2]
|
|
366
|
+
match.pop if match.size == 4
|
|
367
|
+
# match is [ :entity, name, value ]
|
|
330
368
|
end
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
369
|
+
match << '%' if ref
|
|
370
|
+
return match
|
|
371
|
+
elsif @source.match("ATTLIST", true)
|
|
372
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
|
373
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
|
374
|
+
element = md[1]
|
|
375
|
+
contents = md[0]
|
|
376
|
+
|
|
377
|
+
pairs = {}
|
|
378
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
|
379
|
+
values.each do |attdef|
|
|
380
|
+
unless attdef[3] == "#IMPLIED"
|
|
381
|
+
attdef.compact!
|
|
382
|
+
val = attdef[3]
|
|
383
|
+
val = attdef[4] if val == "#FIXED "
|
|
384
|
+
pairs[attdef[0]] = val
|
|
385
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
|
386
|
+
@namespaces[$1] = val
|
|
387
|
+
end
|
|
388
|
+
end
|
|
389
|
+
end
|
|
390
|
+
return [ :attlistdecl, element, pairs, contents ]
|
|
391
|
+
elsif @source.match("NOTATION", true)
|
|
392
|
+
base_error_message = "Malformed notation declaration"
|
|
393
|
+
unless @source.match(/\s+/um, true)
|
|
394
|
+
if @source.match(">")
|
|
395
|
+
message = "#{base_error_message}: name is missing"
|
|
396
|
+
else
|
|
397
|
+
message = "#{base_error_message}: invalid name"
|
|
398
|
+
end
|
|
399
|
+
@source.position = start_position
|
|
400
|
+
raise REXML::ParseException.new(message, @source)
|
|
401
|
+
end
|
|
402
|
+
name = parse_name(base_error_message)
|
|
403
|
+
id = parse_id(base_error_message,
|
|
404
|
+
accept_external_id: true,
|
|
405
|
+
accept_public_id: true)
|
|
406
|
+
unless @source.match(/\s*>/um, true)
|
|
407
|
+
message = "#{base_error_message}: garbage before end >"
|
|
408
|
+
raise REXML::ParseException.new(message, @source)
|
|
409
|
+
end
|
|
410
|
+
return [:notationdecl, name, *id]
|
|
411
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
|
412
|
+
case md[1]
|
|
413
|
+
when /--/, /-\z/
|
|
414
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
|
415
|
+
end
|
|
416
|
+
return [ :comment, md[1] ] if md
|
|
340
417
|
end
|
|
341
|
-
|
|
342
|
-
|
|
418
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
|
419
|
+
return [ :externalentity, match[1] ]
|
|
420
|
+
elsif @source.match(/\]\s*>/um, true)
|
|
343
421
|
@document_status = :after_doctype
|
|
344
|
-
@source.match( DOCTYPE_END, true )
|
|
345
422
|
return [ :end_doctype ]
|
|
346
423
|
end
|
|
424
|
+
if @document_status == :in_doctype
|
|
425
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
|
426
|
+
end
|
|
347
427
|
end
|
|
348
428
|
if @document_status == :after_doctype
|
|
349
|
-
@source.match(/\
|
|
429
|
+
@source.match(/\s*/um, true)
|
|
350
430
|
end
|
|
351
431
|
begin
|
|
352
|
-
|
|
353
|
-
if @source.
|
|
354
|
-
|
|
355
|
-
|
|
432
|
+
start_position = @source.position
|
|
433
|
+
if @source.match("<", true)
|
|
434
|
+
# :text's read_until may remain only "<" in buffer. In the
|
|
435
|
+
# case, buffer is empty here. So we need to fill buffer
|
|
436
|
+
# here explicitly.
|
|
437
|
+
@source.ensure_buffer
|
|
438
|
+
if @source.match("/", true)
|
|
439
|
+
@namespaces_restore_stack.pop
|
|
356
440
|
last_tag = @tags.pop
|
|
357
|
-
md = @source.match(
|
|
441
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
|
358
442
|
if md and !last_tag
|
|
359
443
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
|
360
444
|
raise REXML::ParseException.new(message, @source)
|
|
361
445
|
end
|
|
362
446
|
if md.nil? or last_tag != md[1]
|
|
363
447
|
message = "Missing end tag for '#{last_tag}'"
|
|
364
|
-
message
|
|
448
|
+
message += " (got '#{md[1]}')" if md
|
|
449
|
+
@source.position = start_position if md.nil?
|
|
365
450
|
raise REXML::ParseException.new(message, @source)
|
|
366
451
|
end
|
|
367
452
|
return [ :end_element, last_tag ]
|
|
368
|
-
elsif @source.
|
|
369
|
-
md = @source.match(
|
|
453
|
+
elsif @source.match("!", true)
|
|
454
|
+
md = @source.match(/([^>]*>)/um)
|
|
370
455
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
|
371
456
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
|
372
|
-
if md[0][
|
|
373
|
-
md = @source.match(
|
|
457
|
+
if md[0][0] == ?-
|
|
458
|
+
md = @source.match(/--(.*?)-->/um, true)
|
|
374
459
|
|
|
375
|
-
|
|
376
|
-
when /--/, /-\z/
|
|
460
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
|
377
461
|
raise REXML::ParseException.new("Malformed comment", @source)
|
|
378
462
|
end
|
|
379
463
|
|
|
380
|
-
return [ :comment, md[1] ]
|
|
464
|
+
return [ :comment, md[1] ]
|
|
381
465
|
else
|
|
382
|
-
md = @source.match(
|
|
466
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
|
383
467
|
return [ :cdata, md[1] ] if md
|
|
384
468
|
end
|
|
385
469
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
|
386
470
|
"in the doctype declaration.", @source)
|
|
387
|
-
elsif @source.
|
|
471
|
+
elsif @source.match("?", true)
|
|
388
472
|
return process_instruction
|
|
389
473
|
else
|
|
390
474
|
# Get the next tag
|
|
391
|
-
md = @source.match(
|
|
475
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
|
392
476
|
unless md
|
|
477
|
+
@source.position = start_position
|
|
393
478
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
|
394
479
|
end
|
|
480
|
+
tag = md[1]
|
|
395
481
|
@document_status = :in_element
|
|
396
|
-
prefixes
|
|
397
|
-
prefixes << md[2] if md[2]
|
|
398
|
-
|
|
399
|
-
attributes, closed = parse_attributes(prefixes
|
|
482
|
+
@prefixes.clear
|
|
483
|
+
@prefixes << md[2] if md[2]
|
|
484
|
+
push_namespaces_restore
|
|
485
|
+
attributes, closed = parse_attributes(@prefixes)
|
|
400
486
|
# Verify that all of the prefixes have been defined
|
|
401
|
-
for prefix in prefixes
|
|
402
|
-
unless @
|
|
487
|
+
for prefix in @prefixes
|
|
488
|
+
unless @namespaces.key?(prefix)
|
|
403
489
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
|
404
490
|
end
|
|
405
491
|
end
|
|
406
492
|
|
|
407
493
|
if closed
|
|
408
|
-
@closed =
|
|
409
|
-
|
|
494
|
+
@closed = tag
|
|
495
|
+
pop_namespaces_restore
|
|
410
496
|
else
|
|
411
|
-
@tags.
|
|
497
|
+
if @tags.empty? and @have_root
|
|
498
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
|
499
|
+
end
|
|
500
|
+
@tags.push( tag )
|
|
412
501
|
end
|
|
413
|
-
|
|
502
|
+
@have_root = true
|
|
503
|
+
return [ :start_element, tag, attributes ]
|
|
414
504
|
end
|
|
415
505
|
else
|
|
416
|
-
|
|
417
|
-
if
|
|
418
|
-
@source.
|
|
506
|
+
text = @source.read_until("<")
|
|
507
|
+
if text.chomp!("<")
|
|
508
|
+
@source.position -= "<".bytesize
|
|
419
509
|
end
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
510
|
+
if @tags.empty?
|
|
511
|
+
unless /\A\s*\z/.match?(text)
|
|
512
|
+
if @have_root
|
|
513
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
|
514
|
+
else
|
|
515
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
|
516
|
+
end
|
|
517
|
+
end
|
|
518
|
+
return pull_event if @have_root
|
|
519
|
+
end
|
|
520
|
+
return [ :text, text ]
|
|
425
521
|
end
|
|
426
522
|
rescue REXML::UndefinedNamespaceException
|
|
427
523
|
raise
|
|
@@ -436,13 +532,13 @@ module REXML
|
|
|
436
532
|
private :pull_event
|
|
437
533
|
|
|
438
534
|
def entity( reference, entities )
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
unnormalize( value, entities )
|
|
535
|
+
return unless entities
|
|
536
|
+
|
|
537
|
+
value = entities[ reference ]
|
|
538
|
+
return if value.nil?
|
|
539
|
+
|
|
540
|
+
record_entity_expansion
|
|
541
|
+
unnormalize( value, entities )
|
|
446
542
|
end
|
|
447
543
|
|
|
448
544
|
# Escapes all possible entities
|
|
@@ -463,35 +559,83 @@ module REXML
|
|
|
463
559
|
|
|
464
560
|
# Unescapes all possible entities
|
|
465
561
|
def unnormalize( string, entities=nil, filter=nil )
|
|
466
|
-
|
|
467
|
-
|
|
562
|
+
if string.include?("\r")
|
|
563
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
|
564
|
+
else
|
|
565
|
+
rv = string.dup
|
|
566
|
+
end
|
|
468
567
|
matches = rv.scan( REFERENCE_RE )
|
|
469
568
|
return rv if matches.size == 0
|
|
470
|
-
rv.gsub!(
|
|
569
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
|
471
570
|
m=$1
|
|
472
571
|
m = "0#{m}" if m[0] == ?x
|
|
473
572
|
[Integer(m)].pack('U*')
|
|
474
573
|
}
|
|
475
574
|
matches.collect!{|x|x[0]}.compact!
|
|
575
|
+
if filter
|
|
576
|
+
matches.reject! do |entity_reference|
|
|
577
|
+
filter.include?(entity_reference)
|
|
578
|
+
end
|
|
579
|
+
end
|
|
476
580
|
if matches.size > 0
|
|
477
|
-
matches.each do |entity_reference|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
581
|
+
matches.tally.each do |entity_reference, n|
|
|
582
|
+
entity_expansion_count_before = @entity_expansion_count
|
|
583
|
+
entity_value = entity( entity_reference, entities )
|
|
584
|
+
if entity_value
|
|
585
|
+
if n > 1
|
|
586
|
+
entity_expansion_count_delta =
|
|
587
|
+
@entity_expansion_count - entity_expansion_count_before
|
|
588
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
|
589
|
+
end
|
|
590
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
|
591
|
+
rv.gsub!( re, entity_value )
|
|
592
|
+
if rv.bytesize > @entity_expansion_text_limit
|
|
593
|
+
raise "entity expansion has grown too large"
|
|
486
594
|
end
|
|
595
|
+
else
|
|
596
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
|
597
|
+
rv.gsub!( er[0], er[2] ) if er
|
|
487
598
|
end
|
|
488
599
|
end
|
|
489
|
-
rv.gsub!(
|
|
600
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
|
490
601
|
end
|
|
491
602
|
rv
|
|
492
603
|
end
|
|
493
604
|
|
|
494
605
|
private
|
|
606
|
+
def add_namespace(prefix, uri)
|
|
607
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
|
608
|
+
if uri.nil?
|
|
609
|
+
@namespaces.delete(prefix)
|
|
610
|
+
else
|
|
611
|
+
@namespaces[prefix] = uri
|
|
612
|
+
end
|
|
613
|
+
end
|
|
614
|
+
|
|
615
|
+
def push_namespaces_restore
|
|
616
|
+
namespaces_restore = {}
|
|
617
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
|
618
|
+
namespaces_restore
|
|
619
|
+
end
|
|
620
|
+
|
|
621
|
+
def pop_namespaces_restore
|
|
622
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
|
623
|
+
namespaces_restore.each do |prefix, uri|
|
|
624
|
+
if uri.nil?
|
|
625
|
+
@namespaces.delete(prefix)
|
|
626
|
+
else
|
|
627
|
+
@namespaces[prefix] = uri
|
|
628
|
+
end
|
|
629
|
+
end
|
|
630
|
+
end
|
|
631
|
+
|
|
632
|
+
def record_entity_expansion(delta=1)
|
|
633
|
+
@entity_expansion_count += delta
|
|
634
|
+
if @entity_expansion_count > @entity_expansion_limit
|
|
635
|
+
raise "number of entity expansions exceeded, processing aborted."
|
|
636
|
+
end
|
|
637
|
+
end
|
|
638
|
+
|
|
495
639
|
def need_source_encoding_update?(xml_declaration_encoding)
|
|
496
640
|
return false if xml_declaration_encoding.nil?
|
|
497
641
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
|
@@ -499,16 +643,16 @@ module REXML
|
|
|
499
643
|
end
|
|
500
644
|
|
|
501
645
|
def parse_name(base_error_message)
|
|
502
|
-
md = @source.match(
|
|
646
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
|
503
647
|
unless md
|
|
504
|
-
if @source.match(/\
|
|
648
|
+
if @source.match(/\S/um)
|
|
505
649
|
message = "#{base_error_message}: invalid name"
|
|
506
650
|
else
|
|
507
651
|
message = "#{base_error_message}: name is missing"
|
|
508
652
|
end
|
|
509
653
|
raise REXML::ParseException.new(message, @source)
|
|
510
654
|
end
|
|
511
|
-
md[
|
|
655
|
+
md[0]
|
|
512
656
|
end
|
|
513
657
|
|
|
514
658
|
def parse_id(base_error_message,
|
|
@@ -578,96 +722,114 @@ module REXML
|
|
|
578
722
|
end
|
|
579
723
|
|
|
580
724
|
def process_instruction
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
725
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
|
726
|
+
if @source.match(/\s+/um, true)
|
|
727
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
|
728
|
+
unless match_data
|
|
729
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
|
730
|
+
end
|
|
731
|
+
content = match_data[1]
|
|
732
|
+
else
|
|
733
|
+
content = nil
|
|
734
|
+
unless @source.match("?>", true)
|
|
735
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
|
736
|
+
end
|
|
737
|
+
end
|
|
738
|
+
if name == "xml"
|
|
739
|
+
if @document_status
|
|
740
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
|
741
|
+
end
|
|
742
|
+
version = VERSION.match(content)
|
|
743
|
+
version = version[1] unless version.nil?
|
|
744
|
+
encoding = ENCODING.match(content)
|
|
745
|
+
encoding = encoding[1] unless encoding.nil?
|
|
746
|
+
if need_source_encoding_update?(encoding)
|
|
747
|
+
@source.encoding = encoding
|
|
748
|
+
end
|
|
749
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
|
750
|
+
encoding = "UTF-16"
|
|
751
|
+
end
|
|
752
|
+
standalone = STANDALONE.match(content)
|
|
753
|
+
standalone = standalone[1] unless standalone.nil?
|
|
754
|
+
return [ :xmldecl, version, encoding, standalone ]
|
|
585
755
|
end
|
|
586
|
-
[:processing_instruction,
|
|
756
|
+
[:processing_instruction, name, content]
|
|
587
757
|
end
|
|
588
758
|
|
|
589
|
-
def parse_attributes(prefixes
|
|
759
|
+
def parse_attributes(prefixes)
|
|
590
760
|
attributes = {}
|
|
761
|
+
expanded_names = {}
|
|
591
762
|
closed = false
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
until scanner.eos?
|
|
605
|
-
if scanner.scan(/\s+/)
|
|
606
|
-
break if scanner.eos?
|
|
607
|
-
end
|
|
608
|
-
|
|
609
|
-
pos = scanner.pos
|
|
610
|
-
loop do
|
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
|
612
|
-
unless scanner.scan(QNAME)
|
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
|
615
|
-
end
|
|
616
|
-
name = scanner[0]
|
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
|
763
|
+
while true
|
|
764
|
+
if @source.match(">", true)
|
|
765
|
+
return attributes, closed
|
|
766
|
+
elsif @source.match("/>", true)
|
|
767
|
+
closed = true
|
|
768
|
+
return attributes, closed
|
|
769
|
+
elsif match = @source.match(QNAME, true)
|
|
770
|
+
name = match[1]
|
|
771
|
+
prefix = match[2]
|
|
772
|
+
local_part = match[3]
|
|
773
|
+
|
|
774
|
+
unless @source.match(/\s*=\s*/um, true)
|
|
618
775
|
message = "Missing attribute equal: <#{name}>"
|
|
619
776
|
raise REXML::ParseException.new(message, @source)
|
|
620
777
|
end
|
|
621
|
-
|
|
622
|
-
unless quote
|
|
778
|
+
unless match = @source.match(/(['"])/, true)
|
|
623
779
|
message = "Missing attribute value start quote: <#{name}>"
|
|
624
780
|
raise REXML::ParseException.new(message, @source)
|
|
625
781
|
end
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
scanner.pos = pos
|
|
633
|
-
closed = !match_data[2].nil?
|
|
634
|
-
next
|
|
635
|
-
end
|
|
636
|
-
message =
|
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
|
782
|
+
quote = match[1]
|
|
783
|
+
start_position = @source.position
|
|
784
|
+
value = @source.read_until(quote)
|
|
785
|
+
unless value.chomp!(quote)
|
|
786
|
+
@source.position = start_position
|
|
787
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
|
638
788
|
raise REXML::ParseException.new(message, @source)
|
|
639
789
|
end
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
msg = "The '
|
|
790
|
+
@source.match(/\s*/um, true)
|
|
791
|
+
if prefix == "xmlns"
|
|
792
|
+
if local_part == "xml"
|
|
793
|
+
if value != "http://www.w3.org/XML/1998/namespace"
|
|
794
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
|
795
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
796
|
+
raise REXML::ParseException.new( msg, @source, self )
|
|
797
|
+
end
|
|
798
|
+
elsif local_part == "xmlns"
|
|
799
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
|
650
800
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
|
801
|
+
raise REXML::ParseException.new( msg, @source, self)
|
|
652
802
|
end
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
|
803
|
+
add_namespace(local_part, value)
|
|
804
|
+
elsif prefix
|
|
805
|
+
prefixes << prefix unless prefix == "xml"
|
|
657
806
|
end
|
|
658
|
-
curr_ns << local_part
|
|
659
|
-
elsif prefix
|
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
|
661
|
-
end
|
|
662
807
|
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
808
|
+
if attributes[name]
|
|
809
|
+
msg = "Duplicate attribute #{name.inspect}"
|
|
810
|
+
raise REXML::ParseException.new(msg, @source, self)
|
|
811
|
+
end
|
|
667
812
|
|
|
668
|
-
|
|
813
|
+
unless prefix == "xmlns"
|
|
814
|
+
uri = @namespaces[prefix]
|
|
815
|
+
expanded_name = [uri, local_part]
|
|
816
|
+
existing_prefix = expanded_names[expanded_name]
|
|
817
|
+
if existing_prefix
|
|
818
|
+
message = "Namespace conflict in adding attribute " +
|
|
819
|
+
"\"#{local_part}\": " +
|
|
820
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
|
821
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
|
822
|
+
raise REXML::ParseException.new(message, @source, self)
|
|
823
|
+
end
|
|
824
|
+
expanded_names[expanded_name] = prefix
|
|
825
|
+
end
|
|
826
|
+
|
|
827
|
+
attributes[name] = value
|
|
828
|
+
else
|
|
829
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
|
830
|
+
raise REXML::ParseException.new(message, @source)
|
|
831
|
+
end
|
|
669
832
|
end
|
|
670
|
-
return attributes, closed
|
|
671
833
|
end
|
|
672
834
|
end
|
|
673
835
|
end
|