rexml 3.2.6 → 3.3.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/NEWS.md +199 -0
- data/lib/rexml/element.rb +2 -15
- data/lib/rexml/formatters/pretty.rb +1 -1
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +307 -240
- data/lib/rexml/parsers/sax2parser.rb +2 -19
- data/lib/rexml/parsers/streamparser.rb +2 -2
- data/lib/rexml/parsers/treeparser.rb +9 -14
- data/lib/rexml/rexml.rb +1 -1
- data/lib/rexml/source.rb +121 -101
- data/lib/rexml/text.rb +34 -14
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +7 -37
@@ -1,4 +1,4 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
4
|
require_relative '../source'
|
@@ -7,6 +7,17 @@ require "strscan"
|
|
7
7
|
|
8
8
|
module REXML
|
9
9
|
module Parsers
|
10
|
+
if StringScanner::Version < "3.0.8"
|
11
|
+
module StringScannerCaptures
|
12
|
+
refine StringScanner do
|
13
|
+
def captures
|
14
|
+
values_at(*(1...size))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
using StringScannerCaptures
|
19
|
+
end
|
20
|
+
|
10
21
|
# = Using the Pull Parser
|
11
22
|
# <em>This API is experimental, and subject to change.</em>
|
12
23
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +107,7 @@ module REXML
|
|
96
107
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
108
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
109
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
110
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
111
|
|
101
112
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
113
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +123,37 @@ module REXML
|
|
112
123
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
113
124
|
}
|
114
125
|
|
126
|
+
module Private
|
127
|
+
# Terminal requires two or more letters.
|
128
|
+
INSTRUCTION_TERM = "?>"
|
129
|
+
COMMENT_TERM = "-->"
|
130
|
+
CDATA_TERM = "]]>"
|
131
|
+
DOCTYPE_TERM = "]>"
|
132
|
+
# Read to the end of DOCTYPE because there is no proper ENTITY termination
|
133
|
+
ENTITY_TERM = DOCTYPE_TERM
|
134
|
+
|
135
|
+
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
|
136
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
137
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
138
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
139
|
+
NAME_PATTERN = /\s*#{NAME}/um
|
140
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
141
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
142
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
143
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
144
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
145
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
146
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
147
|
+
default_entities.each do |term|
|
148
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
149
|
+
end
|
150
|
+
end
|
151
|
+
private_constant :Private
|
152
|
+
|
115
153
|
def initialize( source )
|
116
154
|
self.stream = source
|
117
155
|
@listeners = []
|
156
|
+
@prefixes = Set.new
|
118
157
|
end
|
119
158
|
|
120
159
|
def add_listener( listener )
|
@@ -126,6 +165,7 @@ module REXML
|
|
126
165
|
def stream=( source )
|
127
166
|
@source = SourceFactory.create_from( source )
|
128
167
|
@closed = nil
|
168
|
+
@have_root = false
|
129
169
|
@document_status = nil
|
130
170
|
@tags = []
|
131
171
|
@stack = []
|
@@ -180,6 +220,8 @@ module REXML
|
|
180
220
|
|
181
221
|
# Returns the next event. This is a +PullEvent+ object.
|
182
222
|
def pull
|
223
|
+
@source.drop_parsed_content
|
224
|
+
|
183
225
|
pull_event.tap do |event|
|
184
226
|
@listeners.each do |listener|
|
185
227
|
listener.receive event
|
@@ -192,236 +234,261 @@ module REXML
|
|
192
234
|
x, @closed = @closed, nil
|
193
235
|
return [ :end_element, x ]
|
194
236
|
end
|
195
|
-
|
237
|
+
if empty?
|
238
|
+
if @document_status == :in_doctype
|
239
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
240
|
+
end
|
241
|
+
return [ :end_document ]
|
242
|
+
end
|
196
243
|
return @stack.shift if @stack.size > 0
|
197
244
|
#STDERR.puts @source.encoding
|
198
245
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
246
|
+
|
247
|
+
@source.ensure_buffer
|
199
248
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
return process_instruction
|
224
|
-
when DOCTYPE_START
|
225
|
-
base_error_message = "Malformed DOCTYPE"
|
226
|
-
@source.match(DOCTYPE_START, true)
|
227
|
-
@nsstack.unshift(curr_ns=Set.new)
|
228
|
-
name = parse_name(base_error_message)
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
249
|
+
start_position = @source.position
|
250
|
+
if @source.match("<?", true)
|
251
|
+
return process_instruction(start_position)
|
252
|
+
elsif @source.match("<!", true)
|
253
|
+
if @source.match("--", true)
|
254
|
+
md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
|
255
|
+
if md.nil?
|
256
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
257
|
+
end
|
258
|
+
if /--|-\z/.match?(md[1])
|
259
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
260
|
+
end
|
261
|
+
return [ :comment, md[1] ]
|
262
|
+
elsif @source.match("DOCTYPE", true)
|
263
|
+
base_error_message = "Malformed DOCTYPE"
|
264
|
+
unless @source.match(/\s+/um, true)
|
265
|
+
if @source.match(">")
|
266
|
+
message = "#{base_error_message}: name is missing"
|
267
|
+
else
|
268
|
+
message = "#{base_error_message}: invalid name"
|
269
|
+
end
|
270
|
+
@source.position = start_position
|
271
|
+
raise REXML::ParseException.new(message, @source)
|
242
272
|
end
|
243
|
-
|
273
|
+
@nsstack.unshift(Set.new)
|
274
|
+
name = parse_name(base_error_message)
|
275
|
+
if @source.match(/\s*\[/um, true)
|
276
|
+
id = [nil, nil, nil]
|
244
277
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
278
|
+
elsif @source.match(/\s*>/um, true)
|
279
|
+
id = [nil, nil, nil]
|
246
280
|
@document_status = :after_doctype
|
281
|
+
@source.ensure_buffer
|
247
282
|
else
|
248
|
-
|
249
|
-
|
283
|
+
id = parse_id(base_error_message,
|
284
|
+
accept_external_id: true,
|
285
|
+
accept_public_id: false)
|
286
|
+
if id[0] == "SYSTEM"
|
287
|
+
# For backward compatibility
|
288
|
+
id[1], id[2] = id[2], nil
|
289
|
+
end
|
290
|
+
if @source.match(/\s*\[/um, true)
|
291
|
+
@document_status = :in_doctype
|
292
|
+
elsif @source.match(/\s*>/um, true)
|
293
|
+
@document_status = :after_doctype
|
294
|
+
@source.ensure_buffer
|
295
|
+
else
|
296
|
+
message = "#{base_error_message}: garbage after external ID"
|
297
|
+
raise REXML::ParseException.new(message, @source)
|
298
|
+
end
|
250
299
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
300
|
+
args = [:start_doctype, name, *id]
|
301
|
+
if @document_status == :after_doctype
|
302
|
+
@source.match(/\s*/um, true)
|
303
|
+
@stack << [ :end_doctype ]
|
304
|
+
end
|
305
|
+
return args
|
306
|
+
else
|
307
|
+
message = "Invalid XML"
|
308
|
+
raise REXML::ParseException.new(message, @source)
|
263
309
|
end
|
264
310
|
end
|
265
311
|
end
|
266
312
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
313
|
+
@source.match(/\s*/um, true) # skip spaces
|
314
|
+
start_position = @source.position
|
315
|
+
if @source.match("<!", true)
|
316
|
+
if @source.match("ELEMENT", true)
|
317
|
+
md = @source.match(/(.*?)>/um, true)
|
318
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
319
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
320
|
+
elsif @source.match("ENTITY", true)
|
321
|
+
match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM).captures.compact]
|
322
|
+
ref = false
|
323
|
+
if match[1] == '%'
|
324
|
+
ref = true
|
325
|
+
match.delete_at 1
|
320
326
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
327
|
+
# Now we have to sort out what kind of entity reference this is
|
328
|
+
if match[2] == 'SYSTEM'
|
329
|
+
# External reference
|
330
|
+
match[3] = match[3][1..-2] # PUBID
|
331
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
332
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
333
|
+
elsif match[2] == 'PUBLIC'
|
334
|
+
# External reference
|
335
|
+
match[3] = match[3][1..-2] # PUBID
|
336
|
+
match[4] = match[4][1..-2] # HREF
|
337
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
338
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
328
339
|
else
|
329
|
-
|
340
|
+
match[2] = match[2][1..-2]
|
341
|
+
match.pop if match.size == 4
|
342
|
+
# match is [ :entity, name, value ]
|
330
343
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
344
|
+
match << '%' if ref
|
345
|
+
return match
|
346
|
+
elsif @source.match("ATTLIST", true)
|
347
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
348
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
349
|
+
element = md[1]
|
350
|
+
contents = md[0]
|
351
|
+
|
352
|
+
pairs = {}
|
353
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
354
|
+
values.each do |attdef|
|
355
|
+
unless attdef[3] == "#IMPLIED"
|
356
|
+
attdef.compact!
|
357
|
+
val = attdef[3]
|
358
|
+
val = attdef[4] if val == "#FIXED "
|
359
|
+
pairs[attdef[0]] = val
|
360
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
361
|
+
@nsstack[0] << $1
|
362
|
+
end
|
363
|
+
end
|
364
|
+
end
|
365
|
+
return [ :attlistdecl, element, pairs, contents ]
|
366
|
+
elsif @source.match("NOTATION", true)
|
367
|
+
base_error_message = "Malformed notation declaration"
|
368
|
+
unless @source.match(/\s+/um, true)
|
369
|
+
if @source.match(">")
|
370
|
+
message = "#{base_error_message}: name is missing"
|
371
|
+
else
|
372
|
+
message = "#{base_error_message}: invalid name"
|
373
|
+
end
|
374
|
+
@source.position = start_position
|
375
|
+
raise REXML::ParseException.new(message, @source)
|
376
|
+
end
|
377
|
+
name = parse_name(base_error_message)
|
378
|
+
id = parse_id(base_error_message,
|
379
|
+
accept_external_id: true,
|
380
|
+
accept_public_id: true)
|
381
|
+
unless @source.match(/\s*>/um, true)
|
382
|
+
message = "#{base_error_message}: garbage before end >"
|
383
|
+
raise REXML::ParseException.new(message, @source)
|
384
|
+
end
|
385
|
+
return [:notationdecl, name, *id]
|
386
|
+
elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
|
387
|
+
case md[1]
|
388
|
+
when /--/, /-\z/
|
389
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
390
|
+
end
|
391
|
+
return [ :comment, md[1] ] if md
|
340
392
|
end
|
341
|
-
|
342
|
-
|
393
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
|
394
|
+
return [ :externalentity, match[1] ]
|
395
|
+
elsif @source.match(/\]\s*>/um, true)
|
343
396
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
397
|
return [ :end_doctype ]
|
346
398
|
end
|
399
|
+
if @document_status == :in_doctype
|
400
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
401
|
+
end
|
347
402
|
end
|
348
403
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
404
|
+
@source.match(/\s*/um, true)
|
350
405
|
end
|
351
406
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
407
|
+
start_position = @source.position
|
408
|
+
if @source.match("<", true)
|
409
|
+
# :text's read_until may remain only "<" in buffer. In the
|
410
|
+
# case, buffer is empty here. So we need to fill buffer
|
411
|
+
# here explicitly.
|
412
|
+
@source.ensure_buffer
|
413
|
+
if @source.match("/", true)
|
355
414
|
@nsstack.shift
|
356
415
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
416
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
417
|
if md and !last_tag
|
359
418
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
419
|
raise REXML::ParseException.new(message, @source)
|
361
420
|
end
|
362
421
|
if md.nil? or last_tag != md[1]
|
363
422
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
423
|
+
message += " (got '#{md[1]}')" if md
|
424
|
+
@source.position = start_position if md.nil?
|
365
425
|
raise REXML::ParseException.new(message, @source)
|
366
426
|
end
|
367
427
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
428
|
+
elsif @source.match("!", true)
|
429
|
+
md = @source.match(/([^>]*>)/um)
|
370
430
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
431
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
432
|
+
if md[0][0] == ?-
|
433
|
+
md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
|
374
434
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
435
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
436
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
437
|
end
|
379
438
|
|
380
|
-
return [ :comment, md[1] ]
|
439
|
+
return [ :comment, md[1] ]
|
381
440
|
else
|
382
|
-
md = @source.match(
|
441
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
|
383
442
|
return [ :cdata, md[1] ] if md
|
384
443
|
end
|
385
444
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
445
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
388
|
-
return process_instruction
|
446
|
+
elsif @source.match("?", true)
|
447
|
+
return process_instruction(start_position)
|
389
448
|
else
|
390
449
|
# Get the next tag
|
391
|
-
md = @source.match(
|
450
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
451
|
unless md
|
452
|
+
@source.position = start_position
|
393
453
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
454
|
end
|
455
|
+
tag = md[1]
|
395
456
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
457
|
+
@prefixes.clear
|
458
|
+
@prefixes << md[2] if md[2]
|
398
459
|
@nsstack.unshift(curr_ns=Set.new)
|
399
|
-
attributes, closed = parse_attributes(prefixes, curr_ns)
|
460
|
+
attributes, closed = parse_attributes(@prefixes, curr_ns)
|
400
461
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
462
|
+
for prefix in @prefixes
|
402
463
|
unless @nsstack.find{|k| k.member?(prefix)}
|
403
464
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
465
|
end
|
405
466
|
end
|
406
467
|
|
407
468
|
if closed
|
408
|
-
@closed =
|
469
|
+
@closed = tag
|
409
470
|
@nsstack.shift
|
410
471
|
else
|
411
|
-
@tags.
|
472
|
+
if @tags.empty? and @have_root
|
473
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
474
|
+
end
|
475
|
+
@tags.push( tag )
|
412
476
|
end
|
413
|
-
|
477
|
+
@have_root = true
|
478
|
+
return [ :start_element, tag, attributes ]
|
414
479
|
end
|
415
480
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
481
|
+
text = @source.read_until("<")
|
482
|
+
if text.chomp!("<")
|
483
|
+
@source.position -= "<".bytesize
|
484
|
+
end
|
485
|
+
if @tags.empty? and @have_root
|
486
|
+
unless /\A\s*\z/.match?(text)
|
487
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
488
|
+
end
|
489
|
+
return pull_event
|
419
490
|
end
|
420
|
-
|
421
|
-
#return [ :text, "" ] if md[0].length == 0
|
422
|
-
# unnormalized = Text::unnormalize( md[1], self )
|
423
|
-
# return PullEvent.new( :text, md[1], unnormalized )
|
424
|
-
return [ :text, md[1] ]
|
491
|
+
return [ :text, text ]
|
425
492
|
end
|
426
493
|
rescue REXML::UndefinedNamespaceException
|
427
494
|
raise
|
@@ -463,11 +530,14 @@ module REXML
|
|
463
530
|
|
464
531
|
# Unescapes all possible entities
|
465
532
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
|
467
|
-
|
533
|
+
if string.include?("\r")
|
534
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
535
|
+
else
|
536
|
+
rv = string.dup
|
537
|
+
end
|
468
538
|
matches = rv.scan( REFERENCE_RE )
|
469
539
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
540
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
541
|
m=$1
|
472
542
|
m = "0#{m}" if m[0] == ?x
|
473
543
|
[Integer(m)].pack('U*')
|
@@ -478,7 +548,7 @@ module REXML
|
|
478
548
|
unless filter and filter.include?(entity_reference)
|
479
549
|
entity_value = entity( entity_reference, entities )
|
480
550
|
if entity_value
|
481
|
-
re = /&#{entity_reference};/
|
551
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
482
552
|
rv.gsub!( re, entity_value )
|
483
553
|
else
|
484
554
|
er = DEFAULT_ENTITIES[entity_reference]
|
@@ -486,7 +556,7 @@ module REXML
|
|
486
556
|
end
|
487
557
|
end
|
488
558
|
end
|
489
|
-
rv.gsub!(
|
559
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
560
|
end
|
491
561
|
rv
|
492
562
|
end
|
@@ -499,9 +569,9 @@ module REXML
|
|
499
569
|
end
|
500
570
|
|
501
571
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
572
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
573
|
unless md
|
504
|
-
if @source.match(/\
|
574
|
+
if @source.match(/\s*\S/um)
|
505
575
|
message = "#{base_error_message}: invalid name"
|
506
576
|
else
|
507
577
|
message = "#{base_error_message}: name is missing"
|
@@ -577,97 +647,94 @@ module REXML
|
|
577
647
|
end
|
578
648
|
end
|
579
649
|
|
580
|
-
def process_instruction
|
581
|
-
match_data = @source.match(
|
650
|
+
def process_instruction(start_position)
|
651
|
+
match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
|
582
652
|
unless match_data
|
583
653
|
message = "Invalid processing instruction node"
|
654
|
+
@source.position = start_position
|
584
655
|
raise REXML::ParseException.new(message, @source)
|
585
656
|
end
|
657
|
+
if match_data[1] == "xml"
|
658
|
+
if @document_status
|
659
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
660
|
+
end
|
661
|
+
content = match_data[2]
|
662
|
+
version = VERSION.match(content)
|
663
|
+
version = version[1] unless version.nil?
|
664
|
+
encoding = ENCODING.match(content)
|
665
|
+
encoding = encoding[1] unless encoding.nil?
|
666
|
+
if need_source_encoding_update?(encoding)
|
667
|
+
@source.encoding = encoding
|
668
|
+
end
|
669
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
670
|
+
encoding = "UTF-16"
|
671
|
+
end
|
672
|
+
standalone = STANDALONE.match(content)
|
673
|
+
standalone = standalone[1] unless standalone.nil?
|
674
|
+
return [ :xmldecl, version, encoding, standalone ]
|
675
|
+
end
|
586
676
|
[:processing_instruction, match_data[1], match_data[2]]
|
587
677
|
end
|
588
678
|
|
589
679
|
def parse_attributes(prefixes, curr_ns)
|
590
680
|
attributes = {}
|
591
681
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
682
|
+
while true
|
683
|
+
if @source.match(">", true)
|
684
|
+
return attributes, closed
|
685
|
+
elsif @source.match("/>", true)
|
686
|
+
closed = true
|
687
|
+
return attributes, closed
|
688
|
+
elsif match = @source.match(QNAME, true)
|
689
|
+
name = match[1]
|
690
|
+
prefix = match[2]
|
691
|
+
local_part = match[3]
|
692
|
+
|
693
|
+
unless @source.match(/\s*=\s*/um, true)
|
618
694
|
message = "Missing attribute equal: <#{name}>"
|
619
695
|
raise REXML::ParseException.new(message, @source)
|
620
696
|
end
|
621
|
-
|
622
|
-
unless quote
|
697
|
+
unless match = @source.match(/(['"])/, true)
|
623
698
|
message = "Missing attribute value start quote: <#{name}>"
|
624
699
|
raise REXML::ParseException.new(message, @source)
|
625
700
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
701
|
+
quote = match[1]
|
702
|
+
start_position = @source.position
|
703
|
+
value = @source.read_until(quote)
|
704
|
+
unless value.chomp!(quote)
|
705
|
+
@source.position = start_position
|
706
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
707
|
raise REXML::ParseException.new(message, @source)
|
639
708
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
709
|
+
@source.match(/\s*/um, true)
|
710
|
+
if prefix == "xmlns"
|
711
|
+
if local_part == "xml"
|
712
|
+
if value != "http://www.w3.org/XML/1998/namespace"
|
713
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
714
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
715
|
+
raise REXML::ParseException.new( msg, @source, self )
|
716
|
+
end
|
717
|
+
elsif local_part == "xmlns"
|
718
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
719
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
720
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
721
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
722
|
+
curr_ns << local_part
|
723
|
+
elsif prefix
|
724
|
+
prefixes << prefix unless prefix == "xml"
|
657
725
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
726
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
727
|
+
if attributes[name]
|
728
|
+
msg = "Duplicate attribute #{name.inspect}"
|
729
|
+
raise REXML::ParseException.new(msg, @source, self)
|
730
|
+
end
|
667
731
|
|
668
|
-
|
732
|
+
attributes[name] = value
|
733
|
+
else
|
734
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
735
|
+
raise REXML::ParseException.new(message, @source)
|
736
|
+
end
|
669
737
|
end
|
670
|
-
return attributes, closed
|
671
738
|
end
|
672
739
|
end
|
673
740
|
end
|