rexml 3.2.5 → 3.3.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/NEWS.md +251 -2
- data/README.md +10 -1
- data/doc/rexml/tasks/rdoc/element.rdoc +2 -2
- data/doc/rexml/tutorial.rdoc +1358 -0
- data/lib/rexml/attribute.rb +14 -9
- data/lib/rexml/document.rb +1 -1
- data/lib/rexml/element.rb +5 -18
- data/lib/rexml/entity.rb +25 -15
- data/lib/rexml/formatters/pretty.rb +2 -2
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/namespace.rb +8 -4
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +281 -240
- data/lib/rexml/parsers/treeparser.rb +9 -14
- data/lib/rexml/parsers/xpathparser.rb +136 -86
- data/lib/rexml/rexml.rb +3 -1
- data/lib/rexml/source.rb +120 -100
- data/lib/rexml/text.rb +6 -4
- data/lib/rexml/xpath_parser.rb +7 -3
- metadata +11 -39
@@ -1,4 +1,4 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
4
|
require_relative '../source'
|
@@ -7,6 +7,17 @@ require "strscan"
|
|
7
7
|
|
8
8
|
module REXML
|
9
9
|
module Parsers
|
10
|
+
if StringScanner::Version < "3.0.8"
|
11
|
+
module StringScannerCaptures
|
12
|
+
refine StringScanner do
|
13
|
+
def captures
|
14
|
+
values_at(*(1...size))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
using StringScannerCaptures
|
19
|
+
end
|
20
|
+
|
10
21
|
# = Using the Pull Parser
|
11
22
|
# <em>This API is experimental, and subject to change.</em>
|
12
23
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -96,7 +107,7 @@ module REXML
|
|
96
107
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
97
108
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
98
109
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
99
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
110
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
100
111
|
|
101
112
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
102
113
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
@@ -112,9 +123,29 @@ module REXML
|
|
112
123
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
113
124
|
}
|
114
125
|
|
126
|
+
module Private
|
127
|
+
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
|
128
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
129
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
130
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
131
|
+
NAME_PATTERN = /\s*#{NAME}/um
|
132
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
133
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
134
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
135
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
136
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
137
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
138
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
139
|
+
default_entities.each do |term|
|
140
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
141
|
+
end
|
142
|
+
end
|
143
|
+
private_constant :Private
|
144
|
+
|
115
145
|
def initialize( source )
|
116
146
|
self.stream = source
|
117
147
|
@listeners = []
|
148
|
+
@prefixes = Set.new
|
118
149
|
end
|
119
150
|
|
120
151
|
def add_listener( listener )
|
@@ -180,6 +211,8 @@ module REXML
|
|
180
211
|
|
181
212
|
# Returns the next event. This is a +PullEvent+ object.
|
182
213
|
def pull
|
214
|
+
@source.drop_parsed_content
|
215
|
+
|
183
216
|
pull_event.tap do |event|
|
184
217
|
@listeners.each do |listener|
|
185
218
|
listener.receive event
|
@@ -192,236 +225,251 @@ module REXML
|
|
192
225
|
x, @closed = @closed, nil
|
193
226
|
return [ :end_element, x ]
|
194
227
|
end
|
195
|
-
|
228
|
+
if empty?
|
229
|
+
if @document_status == :in_doctype
|
230
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
231
|
+
end
|
232
|
+
return [ :end_document ]
|
233
|
+
end
|
196
234
|
return @stack.shift if @stack.size > 0
|
197
235
|
#STDERR.puts @source.encoding
|
198
236
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
237
|
+
|
238
|
+
@source.ensure_buffer
|
199
239
|
if @document_status == nil
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
encoding = ENCODING.match(results)
|
212
|
-
encoding = encoding[1] unless encoding.nil?
|
213
|
-
if need_source_encoding_update?(encoding)
|
214
|
-
@source.encoding = encoding
|
215
|
-
end
|
216
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
217
|
-
encoding = "UTF-16"
|
218
|
-
end
|
219
|
-
standalone = STANDALONE.match(results)
|
220
|
-
standalone = standalone[1] unless standalone.nil?
|
221
|
-
return [ :xmldecl, version, encoding, standalone ]
|
222
|
-
when INSTRUCTION_START
|
223
|
-
return process_instruction
|
224
|
-
when DOCTYPE_START
|
225
|
-
base_error_message = "Malformed DOCTYPE"
|
226
|
-
@source.match(DOCTYPE_START, true)
|
227
|
-
@nsstack.unshift(curr_ns=Set.new)
|
228
|
-
name = parse_name(base_error_message)
|
229
|
-
if @source.match(/\A\s*\[/um, true)
|
230
|
-
id = [nil, nil, nil]
|
231
|
-
@document_status = :in_doctype
|
232
|
-
elsif @source.match(/\A\s*>/um, true)
|
233
|
-
id = [nil, nil, nil]
|
234
|
-
@document_status = :after_doctype
|
235
|
-
else
|
236
|
-
id = parse_id(base_error_message,
|
237
|
-
accept_external_id: true,
|
238
|
-
accept_public_id: false)
|
239
|
-
if id[0] == "SYSTEM"
|
240
|
-
# For backward compatibility
|
241
|
-
id[1], id[2] = id[2], nil
|
240
|
+
start_position = @source.position
|
241
|
+
if @source.match("<?", true)
|
242
|
+
return process_instruction(start_position)
|
243
|
+
elsif @source.match("<!", true)
|
244
|
+
if @source.match("--", true)
|
245
|
+
md = @source.match(/(.*?)-->/um, true)
|
246
|
+
if md.nil?
|
247
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
248
|
+
end
|
249
|
+
if /--|-\z/.match?(md[1])
|
250
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
242
251
|
end
|
243
|
-
|
252
|
+
return [ :comment, md[1] ]
|
253
|
+
elsif @source.match("DOCTYPE", true)
|
254
|
+
base_error_message = "Malformed DOCTYPE"
|
255
|
+
unless @source.match(/\s+/um, true)
|
256
|
+
if @source.match(">")
|
257
|
+
message = "#{base_error_message}: name is missing"
|
258
|
+
else
|
259
|
+
message = "#{base_error_message}: invalid name"
|
260
|
+
end
|
261
|
+
@source.position = start_position
|
262
|
+
raise REXML::ParseException.new(message, @source)
|
263
|
+
end
|
264
|
+
@nsstack.unshift(Set.new)
|
265
|
+
name = parse_name(base_error_message)
|
266
|
+
if @source.match(/\s*\[/um, true)
|
267
|
+
id = [nil, nil, nil]
|
244
268
|
@document_status = :in_doctype
|
245
|
-
elsif @source.match(/\
|
269
|
+
elsif @source.match(/\s*>/um, true)
|
270
|
+
id = [nil, nil, nil]
|
246
271
|
@document_status = :after_doctype
|
272
|
+
@source.ensure_buffer
|
247
273
|
else
|
248
|
-
|
249
|
-
|
274
|
+
id = parse_id(base_error_message,
|
275
|
+
accept_external_id: true,
|
276
|
+
accept_public_id: false)
|
277
|
+
if id[0] == "SYSTEM"
|
278
|
+
# For backward compatibility
|
279
|
+
id[1], id[2] = id[2], nil
|
280
|
+
end
|
281
|
+
if @source.match(/\s*\[/um, true)
|
282
|
+
@document_status = :in_doctype
|
283
|
+
elsif @source.match(/\s*>/um, true)
|
284
|
+
@document_status = :after_doctype
|
285
|
+
@source.ensure_buffer
|
286
|
+
else
|
287
|
+
message = "#{base_error_message}: garbage after external ID"
|
288
|
+
raise REXML::ParseException.new(message, @source)
|
289
|
+
end
|
250
290
|
end
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
@document_status = :after_doctype
|
261
|
-
if @source.encoding == "UTF-8"
|
262
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
291
|
+
args = [:start_doctype, name, *id]
|
292
|
+
if @document_status == :after_doctype
|
293
|
+
@source.match(/\s*/um, true)
|
294
|
+
@stack << [ :end_doctype ]
|
295
|
+
end
|
296
|
+
return args
|
297
|
+
else
|
298
|
+
message = "Invalid XML"
|
299
|
+
raise REXML::ParseException.new(message, @source)
|
263
300
|
end
|
264
301
|
end
|
265
302
|
end
|
266
303
|
if @document_status == :in_doctype
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if match[1] == '%'
|
281
|
-
ref = true
|
282
|
-
match.delete_at 1
|
283
|
-
end
|
284
|
-
# Now we have to sort out what kind of entity reference this is
|
285
|
-
if match[2] == 'SYSTEM'
|
286
|
-
# External reference
|
287
|
-
match[3] = match[3][1..-2] # PUBID
|
288
|
-
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
289
|
-
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
290
|
-
elsif match[2] == 'PUBLIC'
|
291
|
-
# External reference
|
292
|
-
match[3] = match[3][1..-2] # PUBID
|
293
|
-
match[4] = match[4][1..-2] # HREF
|
294
|
-
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
295
|
-
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
296
|
-
else
|
297
|
-
match[2] = match[2][1..-2]
|
298
|
-
match.pop if match.size == 4
|
299
|
-
# match is [ :entity, name, value ]
|
300
|
-
end
|
301
|
-
match << '%' if ref
|
302
|
-
return match
|
303
|
-
when ATTLISTDECL_START
|
304
|
-
md = @source.match( ATTLISTDECL_PATTERN, true )
|
305
|
-
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
306
|
-
element = md[1]
|
307
|
-
contents = md[0]
|
308
|
-
|
309
|
-
pairs = {}
|
310
|
-
values = md[0].scan( ATTDEF_RE )
|
311
|
-
values.each do |attdef|
|
312
|
-
unless attdef[3] == "#IMPLIED"
|
313
|
-
attdef.compact!
|
314
|
-
val = attdef[3]
|
315
|
-
val = attdef[4] if val == "#FIXED "
|
316
|
-
pairs[attdef[0]] = val
|
317
|
-
if attdef[0] =~ /^xmlns:(.*)/
|
318
|
-
@nsstack[0] << $1
|
319
|
-
end
|
304
|
+
@source.match(/\s*/um, true) # skip spaces
|
305
|
+
start_position = @source.position
|
306
|
+
if @source.match("<!", true)
|
307
|
+
if @source.match("ELEMENT", true)
|
308
|
+
md = @source.match(/(.*?)>/um, true)
|
309
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
310
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
311
|
+
elsif @source.match("ENTITY", true)
|
312
|
+
match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
|
313
|
+
ref = false
|
314
|
+
if match[1] == '%'
|
315
|
+
ref = true
|
316
|
+
match.delete_at 1
|
320
317
|
end
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
318
|
+
# Now we have to sort out what kind of entity reference this is
|
319
|
+
if match[2] == 'SYSTEM'
|
320
|
+
# External reference
|
321
|
+
match[3] = match[3][1..-2] # PUBID
|
322
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
323
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
324
|
+
elsif match[2] == 'PUBLIC'
|
325
|
+
# External reference
|
326
|
+
match[3] = match[3][1..-2] # PUBID
|
327
|
+
match[4] = match[4][1..-2] # HREF
|
328
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
329
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
328
330
|
else
|
329
|
-
|
331
|
+
match[2] = match[2][1..-2]
|
332
|
+
match.pop if match.size == 4
|
333
|
+
# match is [ :entity, name, value ]
|
330
334
|
end
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
335
|
+
match << '%' if ref
|
336
|
+
return match
|
337
|
+
elsif @source.match("ATTLIST", true)
|
338
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
339
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
340
|
+
element = md[1]
|
341
|
+
contents = md[0]
|
342
|
+
|
343
|
+
pairs = {}
|
344
|
+
values = md[0].scan( ATTDEF_RE )
|
345
|
+
values.each do |attdef|
|
346
|
+
unless attdef[3] == "#IMPLIED"
|
347
|
+
attdef.compact!
|
348
|
+
val = attdef[3]
|
349
|
+
val = attdef[4] if val == "#FIXED "
|
350
|
+
pairs[attdef[0]] = val
|
351
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
352
|
+
@nsstack[0] << $1
|
353
|
+
end
|
354
|
+
end
|
355
|
+
end
|
356
|
+
return [ :attlistdecl, element, pairs, contents ]
|
357
|
+
elsif @source.match("NOTATION", true)
|
358
|
+
base_error_message = "Malformed notation declaration"
|
359
|
+
unless @source.match(/\s+/um, true)
|
360
|
+
if @source.match(">")
|
361
|
+
message = "#{base_error_message}: name is missing"
|
362
|
+
else
|
363
|
+
message = "#{base_error_message}: invalid name"
|
364
|
+
end
|
365
|
+
@source.position = start_position
|
366
|
+
raise REXML::ParseException.new(message, @source)
|
367
|
+
end
|
368
|
+
name = parse_name(base_error_message)
|
369
|
+
id = parse_id(base_error_message,
|
370
|
+
accept_external_id: true,
|
371
|
+
accept_public_id: true)
|
372
|
+
unless @source.match(/\s*>/um, true)
|
373
|
+
message = "#{base_error_message}: garbage before end >"
|
374
|
+
raise REXML::ParseException.new(message, @source)
|
375
|
+
end
|
376
|
+
return [:notationdecl, name, *id]
|
377
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
378
|
+
case md[1]
|
379
|
+
when /--/, /-\z/
|
380
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
381
|
+
end
|
382
|
+
return [ :comment, md[1] ] if md
|
340
383
|
end
|
341
|
-
|
342
|
-
|
384
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
385
|
+
return [ :externalentity, match[1] ]
|
386
|
+
elsif @source.match(/\]\s*>/um, true)
|
343
387
|
@document_status = :after_doctype
|
344
|
-
@source.match( DOCTYPE_END, true )
|
345
388
|
return [ :end_doctype ]
|
346
389
|
end
|
390
|
+
if @document_status == :in_doctype
|
391
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
392
|
+
end
|
347
393
|
end
|
348
394
|
if @document_status == :after_doctype
|
349
|
-
@source.match(/\
|
395
|
+
@source.match(/\s*/um, true)
|
350
396
|
end
|
351
397
|
begin
|
352
|
-
|
353
|
-
if @source.
|
354
|
-
|
398
|
+
start_position = @source.position
|
399
|
+
if @source.match("<", true)
|
400
|
+
# :text's read_until may remain only "<" in buffer. In the
|
401
|
+
# case, buffer is empty here. So we need to fill buffer
|
402
|
+
# here explicitly.
|
403
|
+
@source.ensure_buffer
|
404
|
+
if @source.match("/", true)
|
355
405
|
@nsstack.shift
|
356
406
|
last_tag = @tags.pop
|
357
|
-
md = @source.match(
|
407
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
358
408
|
if md and !last_tag
|
359
409
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
360
410
|
raise REXML::ParseException.new(message, @source)
|
361
411
|
end
|
362
412
|
if md.nil? or last_tag != md[1]
|
363
413
|
message = "Missing end tag for '#{last_tag}'"
|
364
|
-
message
|
414
|
+
message += " (got '#{md[1]}')" if md
|
415
|
+
@source.position = start_position if md.nil?
|
365
416
|
raise REXML::ParseException.new(message, @source)
|
366
417
|
end
|
367
418
|
return [ :end_element, last_tag ]
|
368
|
-
elsif @source.
|
369
|
-
md = @source.match(
|
419
|
+
elsif @source.match("!", true)
|
420
|
+
md = @source.match(/([^>]*>)/um)
|
370
421
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
371
422
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
372
|
-
if md[0][
|
373
|
-
md = @source.match(
|
423
|
+
if md[0][0] == ?-
|
424
|
+
md = @source.match(/--(.*?)-->/um, true)
|
374
425
|
|
375
|
-
|
376
|
-
when /--/, /-\z/
|
426
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
377
427
|
raise REXML::ParseException.new("Malformed comment", @source)
|
378
428
|
end
|
379
429
|
|
380
|
-
return [ :comment, md[1] ]
|
430
|
+
return [ :comment, md[1] ]
|
381
431
|
else
|
382
|
-
md = @source.match(
|
432
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
383
433
|
return [ :cdata, md[1] ] if md
|
384
434
|
end
|
385
435
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
386
436
|
"in the doctype declaration.", @source)
|
387
|
-
elsif @source.
|
388
|
-
return process_instruction
|
437
|
+
elsif @source.match("?", true)
|
438
|
+
return process_instruction(start_position)
|
389
439
|
else
|
390
440
|
# Get the next tag
|
391
|
-
md = @source.match(
|
441
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
392
442
|
unless md
|
443
|
+
@source.position = start_position
|
393
444
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
394
445
|
end
|
446
|
+
tag = md[1]
|
395
447
|
@document_status = :in_element
|
396
|
-
prefixes
|
397
|
-
prefixes << md[2] if md[2]
|
448
|
+
@prefixes.clear
|
449
|
+
@prefixes << md[2] if md[2]
|
398
450
|
@nsstack.unshift(curr_ns=Set.new)
|
399
|
-
attributes, closed = parse_attributes(prefixes, curr_ns)
|
451
|
+
attributes, closed = parse_attributes(@prefixes, curr_ns)
|
400
452
|
# Verify that all of the prefixes have been defined
|
401
|
-
for prefix in prefixes
|
453
|
+
for prefix in @prefixes
|
402
454
|
unless @nsstack.find{|k| k.member?(prefix)}
|
403
455
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
404
456
|
end
|
405
457
|
end
|
406
458
|
|
407
459
|
if closed
|
408
|
-
@closed =
|
460
|
+
@closed = tag
|
409
461
|
@nsstack.shift
|
410
462
|
else
|
411
|
-
@tags.push(
|
463
|
+
@tags.push( tag )
|
412
464
|
end
|
413
|
-
return [ :start_element,
|
465
|
+
return [ :start_element, tag, attributes ]
|
414
466
|
end
|
415
467
|
else
|
416
|
-
|
417
|
-
if
|
418
|
-
@source.
|
468
|
+
text = @source.read_until("<")
|
469
|
+
if text.chomp!("<")
|
470
|
+
@source.position -= "<".bytesize
|
419
471
|
end
|
420
|
-
|
421
|
-
#return [ :text, "" ] if md[0].length == 0
|
422
|
-
# unnormalized = Text::unnormalize( md[1], self )
|
423
|
-
# return PullEvent.new( :text, md[1], unnormalized )
|
424
|
-
return [ :text, md[1] ]
|
472
|
+
return [ :text, text ]
|
425
473
|
end
|
426
474
|
rescue REXML::UndefinedNamespaceException
|
427
475
|
raise
|
@@ -463,11 +511,10 @@ module REXML
|
|
463
511
|
|
464
512
|
# Unescapes all possible entities
|
465
513
|
def unnormalize( string, entities=nil, filter=nil )
|
466
|
-
rv = string.
|
467
|
-
rv.gsub!( /\r\n?/, "\n" )
|
514
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
468
515
|
matches = rv.scan( REFERENCE_RE )
|
469
516
|
return rv if matches.size == 0
|
470
|
-
rv.gsub!(
|
517
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
471
518
|
m=$1
|
472
519
|
m = "0#{m}" if m[0] == ?x
|
473
520
|
[Integer(m)].pack('U*')
|
@@ -478,7 +525,7 @@ module REXML
|
|
478
525
|
unless filter and filter.include?(entity_reference)
|
479
526
|
entity_value = entity( entity_reference, entities )
|
480
527
|
if entity_value
|
481
|
-
re = /&#{entity_reference};/
|
528
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
482
529
|
rv.gsub!( re, entity_value )
|
483
530
|
else
|
484
531
|
er = DEFAULT_ENTITIES[entity_reference]
|
@@ -486,7 +533,7 @@ module REXML
|
|
486
533
|
end
|
487
534
|
end
|
488
535
|
end
|
489
|
-
rv.gsub!(
|
536
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
490
537
|
end
|
491
538
|
rv
|
492
539
|
end
|
@@ -499,9 +546,9 @@ module REXML
|
|
499
546
|
end
|
500
547
|
|
501
548
|
def parse_name(base_error_message)
|
502
|
-
md = @source.match(
|
549
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
503
550
|
unless md
|
504
|
-
if @source.match(/\
|
551
|
+
if @source.match(/\s*\S/um)
|
505
552
|
message = "#{base_error_message}: invalid name"
|
506
553
|
else
|
507
554
|
message = "#{base_error_message}: name is missing"
|
@@ -577,97 +624,91 @@ module REXML
|
|
577
624
|
end
|
578
625
|
end
|
579
626
|
|
580
|
-
def process_instruction
|
581
|
-
match_data = @source.match(
|
627
|
+
def process_instruction(start_position)
|
628
|
+
match_data = @source.match(Private::INSTRUCTION_END, true)
|
582
629
|
unless match_data
|
583
630
|
message = "Invalid processing instruction node"
|
631
|
+
@source.position = start_position
|
584
632
|
raise REXML::ParseException.new(message, @source)
|
585
633
|
end
|
634
|
+
if @document_status.nil? and match_data[1] == "xml"
|
635
|
+
content = match_data[2]
|
636
|
+
version = VERSION.match(content)
|
637
|
+
version = version[1] unless version.nil?
|
638
|
+
encoding = ENCODING.match(content)
|
639
|
+
encoding = encoding[1] unless encoding.nil?
|
640
|
+
if need_source_encoding_update?(encoding)
|
641
|
+
@source.encoding = encoding
|
642
|
+
end
|
643
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
644
|
+
encoding = "UTF-16"
|
645
|
+
end
|
646
|
+
standalone = STANDALONE.match(content)
|
647
|
+
standalone = standalone[1] unless standalone.nil?
|
648
|
+
return [ :xmldecl, version, encoding, standalone ]
|
649
|
+
end
|
586
650
|
[:processing_instruction, match_data[1], match_data[2]]
|
587
651
|
end
|
588
652
|
|
589
653
|
def parse_attributes(prefixes, curr_ns)
|
590
654
|
attributes = {}
|
591
655
|
closed = false
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
until scanner.eos?
|
605
|
-
if scanner.scan(/\s+/)
|
606
|
-
break if scanner.eos?
|
607
|
-
end
|
608
|
-
|
609
|
-
pos = scanner.pos
|
610
|
-
loop do
|
611
|
-
break if scanner.scan(ATTRIBUTE_PATTERN)
|
612
|
-
unless scanner.scan(QNAME)
|
613
|
-
message = "Invalid attribute name: <#{scanner.rest}>"
|
614
|
-
raise REXML::ParseException.new(message, @source)
|
615
|
-
end
|
616
|
-
name = scanner[0]
|
617
|
-
unless scanner.scan(/\s*=\s*/um)
|
656
|
+
while true
|
657
|
+
if @source.match(">", true)
|
658
|
+
return attributes, closed
|
659
|
+
elsif @source.match("/>", true)
|
660
|
+
closed = true
|
661
|
+
return attributes, closed
|
662
|
+
elsif match = @source.match(QNAME, true)
|
663
|
+
name = match[1]
|
664
|
+
prefix = match[2]
|
665
|
+
local_part = match[3]
|
666
|
+
|
667
|
+
unless @source.match(/\s*=\s*/um, true)
|
618
668
|
message = "Missing attribute equal: <#{name}>"
|
619
669
|
raise REXML::ParseException.new(message, @source)
|
620
670
|
end
|
621
|
-
|
622
|
-
unless quote
|
671
|
+
unless match = @source.match(/(['"])/, true)
|
623
672
|
message = "Missing attribute value start quote: <#{name}>"
|
624
673
|
raise REXML::ParseException.new(message, @source)
|
625
674
|
end
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
scanner.pos = pos
|
633
|
-
closed = !match_data[2].nil?
|
634
|
-
next
|
635
|
-
end
|
636
|
-
message =
|
637
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
675
|
+
quote = match[1]
|
676
|
+
start_position = @source.position
|
677
|
+
value = @source.read_until(quote)
|
678
|
+
unless value.chomp!(quote)
|
679
|
+
@source.position = start_position
|
680
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
638
681
|
raise REXML::ParseException.new(message, @source)
|
639
682
|
end
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
msg = "The '
|
683
|
+
@source.match(/\s*/um, true)
|
684
|
+
if prefix == "xmlns"
|
685
|
+
if local_part == "xml"
|
686
|
+
if value != "http://www.w3.org/XML/1998/namespace"
|
687
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
688
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
689
|
+
raise REXML::ParseException.new( msg, @source, self )
|
690
|
+
end
|
691
|
+
elsif local_part == "xmlns"
|
692
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
650
693
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
651
|
-
raise REXML::ParseException.new( msg, @source, self
|
694
|
+
raise REXML::ParseException.new( msg, @source, self)
|
652
695
|
end
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
raise REXML::ParseException.new( msg, @source, self)
|
696
|
+
curr_ns << local_part
|
697
|
+
elsif prefix
|
698
|
+
prefixes << prefix unless prefix == "xml"
|
657
699
|
end
|
658
|
-
curr_ns << local_part
|
659
|
-
elsif prefix
|
660
|
-
prefixes << prefix unless prefix == "xml"
|
661
|
-
end
|
662
700
|
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
701
|
+
if attributes[name]
|
702
|
+
msg = "Duplicate attribute #{name.inspect}"
|
703
|
+
raise REXML::ParseException.new(msg, @source, self)
|
704
|
+
end
|
667
705
|
|
668
|
-
|
706
|
+
attributes[name] = value
|
707
|
+
else
|
708
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
709
|
+
raise REXML::ParseException.new(message, @source)
|
710
|
+
end
|
669
711
|
end
|
670
|
-
return attributes, closed
|
671
712
|
end
|
672
713
|
end
|
673
714
|
end
|