rexml 3.2.3 → 3.3.8
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/NEWS.md +502 -0
- data/README.md +11 -14
- data/doc/rexml/context.rdoc +143 -0
- data/doc/rexml/tasks/rdoc/child.rdoc +87 -0
- data/doc/rexml/tasks/rdoc/document.rdoc +276 -0
- data/doc/rexml/tasks/rdoc/element.rdoc +602 -0
- data/doc/rexml/tasks/rdoc/node.rdoc +97 -0
- data/doc/rexml/tasks/rdoc/parent.rdoc +267 -0
- data/doc/rexml/tasks/tocs/child_toc.rdoc +12 -0
- data/doc/rexml/tasks/tocs/document_toc.rdoc +30 -0
- data/doc/rexml/tasks/tocs/element_toc.rdoc +55 -0
- data/doc/rexml/tasks/tocs/master_toc.rdoc +135 -0
- data/doc/rexml/tasks/tocs/node_toc.rdoc +16 -0
- data/doc/rexml/tasks/tocs/parent_toc.rdoc +25 -0
- data/doc/rexml/tutorial.rdoc +1358 -0
- data/lib/rexml/attribute.rb +17 -11
- data/lib/rexml/doctype.rb +55 -31
- data/lib/rexml/document.rb +199 -35
- data/lib/rexml/element.rb +1802 -487
- data/lib/rexml/entity.rb +10 -39
- data/lib/rexml/formatters/pretty.rb +3 -3
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/light/node.rb +0 -8
- data/lib/rexml/namespace.rb +8 -4
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +513 -250
- data/lib/rexml/parsers/pullparser.rb +12 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/parsers/xpathparser.rb +161 -97
- data/lib/rexml/rexml.rb +29 -22
- data/lib/rexml/source.rb +128 -98
- data/lib/rexml/text.rb +46 -22
- data/lib/rexml/xpath_parser.rb +43 -33
- data/lib/rexml.rb +3 -0
- metadata +42 -46
- data/.gitignore +0 -9
- data/.travis.yml +0 -24
- data/Gemfile +0 -6
- data/Rakefile +0 -8
- data/rexml.gemspec +0 -84
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -50,7 +78,6 @@ module REXML
|
|
50
78
|
|
51
79
|
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
|
52
80
|
DOCTYPE_END = /\A\s*\]\s*>/um
|
53
|
-
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
|
54
81
|
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
|
55
82
|
COMMENT_START = /\A<!--/u
|
56
83
|
COMMENT_PATTERN = /<!--(.*?)-->/um
|
@@ -61,15 +88,14 @@ module REXML
|
|
61
88
|
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
|
62
89
|
INSTRUCTION_START = /\A<\?/u
|
63
90
|
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
|
64
|
-
TAG_MATCH =
|
65
|
-
CLOSE_MATCH =
|
91
|
+
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
|
92
|
+
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
|
66
93
|
|
67
94
|
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
|
68
95
|
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
|
69
96
|
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
|
70
97
|
|
71
98
|
ENTITY_START = /\A\s*<!ENTITY/
|
72
|
-
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
|
73
99
|
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
|
74
100
|
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
|
75
101
|
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
|
@@ -83,9 +109,6 @@ module REXML
|
|
83
109
|
ATTDEF_RE = /#{ATTDEF}/
|
84
110
|
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
|
85
111
|
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
86
|
-
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
87
|
-
PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
|
88
|
-
SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
|
89
112
|
|
90
113
|
TEXT_PATTERN = /\A([^<]*)/um
|
91
114
|
|
@@ -101,7 +124,12 @@ module REXML
|
|
101
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
102
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
103
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
104
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
128
|
+
|
129
|
+
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
130
|
+
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
131
|
+
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
|
132
|
+
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
|
105
133
|
|
106
134
|
EREFERENCE = /&(?!#{NAME};)/
|
107
135
|
|
@@ -112,9 +140,33 @@ module REXML
|
|
112
140
|
"apos" => [/&apos;/, "&apos;", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
118
170
|
end
|
119
171
|
|
120
172
|
def add_listener( listener )
|
@@ -122,15 +174,20 @@ module REXML
|
|
122
174
|
end
|
123
175
|
|
124
176
|
attr_reader :source
|
177
|
+
attr_reader :entity_expansion_count
|
178
|
+
attr_writer :entity_expansion_limit
|
179
|
+
attr_writer :entity_expansion_text_limit
|
125
180
|
|
126
181
|
def stream=( source )
|
127
182
|
@source = SourceFactory.create_from( source )
|
128
183
|
@closed = nil
|
184
|
+
@have_root = false
|
129
185
|
@document_status = nil
|
130
186
|
@tags = []
|
131
187
|
@stack = []
|
132
188
|
@entities = []
|
133
|
-
@
|
189
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
190
|
+
@namespaces_restore_stack = []
|
134
191
|
end
|
135
192
|
|
136
193
|
def position
|
@@ -180,6 +237,8 @@ module REXML
|
|
180
237
|
|
181
238
|
# Returns the next event. This is a +PullEvent+ object.
|
182
239
|
def pull
|
240
|
+
@source.drop_parsed_content
|
241
|
+
|
183
242
|
pull_event.tap do |event|
|
184
243
|
@listeners.each do |listener|
|
185
244
|
listener.receive event
|
@@ -192,215 +251,274 @@ module REXML
|
|
192
251
|
x, @closed = @closed, nil
|
193
252
|
return [ :end_element, x ]
|
194
253
|
end
|
195
|
-
|
254
|
+
if empty?
|
255
|
+
if @document_status == :in_doctype
|
256
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
257
|
+
end
|
258
|
+
unless @tags.empty?
|
259
|
+
path = "/" + @tags.join("/")
|
260
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
261
|
+
end
|
262
|
+
return [ :end_document ]
|
263
|
+
end
|
196
264
|
return @stack.shift if @stack.size > 0
|
197
265
|
#STDERR.puts @source.encoding
|
198
|
-
@source.read if @source.buffer.size<2
|
199
266
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
267
|
+
|
268
|
+
@source.ensure_buffer
|
200
269
|
if @document_status == nil
|
201
|
-
|
202
|
-
|
203
|
-
word = word[1] unless word.nil?
|
204
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
205
|
-
case word
|
206
|
-
when COMMENT_START
|
207
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
208
|
-
when XMLDECL_START
|
209
|
-
#STDERR.puts "XMLDECL"
|
210
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
211
|
-
version = VERSION.match( results )
|
212
|
-
version = version[1] unless version.nil?
|
213
|
-
encoding = ENCODING.match(results)
|
214
|
-
encoding = encoding[1] unless encoding.nil?
|
215
|
-
if need_source_encoding_update?(encoding)
|
216
|
-
@source.encoding = encoding
|
217
|
-
end
|
218
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
219
|
-
encoding = "UTF-16"
|
220
|
-
end
|
221
|
-
standalone = STANDALONE.match(results)
|
222
|
-
standalone = standalone[1] unless standalone.nil?
|
223
|
-
return [ :xmldecl, version, encoding, standalone ]
|
224
|
-
when INSTRUCTION_START
|
270
|
+
start_position = @source.position
|
271
|
+
if @source.match("<?", true)
|
225
272
|
return process_instruction
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
273
|
+
elsif @source.match("<!", true)
|
274
|
+
if @source.match("--", true)
|
275
|
+
md = @source.match(/(.*?)-->/um, true)
|
276
|
+
if md.nil?
|
277
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
278
|
+
end
|
279
|
+
if /--|-\z/.match?(md[1])
|
280
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
281
|
+
end
|
282
|
+
return [ :comment, md[1] ]
|
283
|
+
elsif @source.match("DOCTYPE", true)
|
284
|
+
base_error_message = "Malformed DOCTYPE"
|
285
|
+
unless @source.match(/\s+/um, true)
|
286
|
+
if @source.match(">")
|
287
|
+
message = "#{base_error_message}: name is missing"
|
288
|
+
else
|
289
|
+
message = "#{base_error_message}: invalid name"
|
290
|
+
end
|
291
|
+
@source.position = start_position
|
292
|
+
raise REXML::ParseException.new(message, @source)
|
293
|
+
end
|
294
|
+
name = parse_name(base_error_message)
|
295
|
+
if @source.match(/\s*\[/um, true)
|
296
|
+
id = [nil, nil, nil]
|
297
|
+
@document_status = :in_doctype
|
298
|
+
elsif @source.match(/\s*>/um, true)
|
299
|
+
id = [nil, nil, nil]
|
300
|
+
@document_status = :after_doctype
|
301
|
+
@source.ensure_buffer
|
302
|
+
else
|
303
|
+
id = parse_id(base_error_message,
|
304
|
+
accept_external_id: true,
|
305
|
+
accept_public_id: false)
|
306
|
+
if id[0] == "SYSTEM"
|
307
|
+
# For backward compatibility
|
308
|
+
id[1], id[2] = id[2], nil
|
309
|
+
end
|
310
|
+
if @source.match(/\s*\[/um, true)
|
311
|
+
@document_status = :in_doctype
|
312
|
+
elsif @source.match(/\s*>/um, true)
|
313
|
+
@document_status = :after_doctype
|
314
|
+
@source.ensure_buffer
|
315
|
+
else
|
316
|
+
message = "#{base_error_message}: garbage after external ID"
|
317
|
+
raise REXML::ParseException.new(message, @source)
|
318
|
+
end
|
319
|
+
end
|
320
|
+
args = [:start_doctype, name, *id]
|
321
|
+
if @document_status == :after_doctype
|
322
|
+
@source.match(/\s*/um, true)
|
323
|
+
@stack << [ :end_doctype ]
|
324
|
+
end
|
325
|
+
return args
|
243
326
|
else
|
244
|
-
|
245
|
-
|
246
|
-
return args
|
247
|
-
when /^\s+/
|
248
|
-
else
|
249
|
-
@document_status = :after_doctype
|
250
|
-
@source.read if @source.buffer.size<2
|
251
|
-
md = @source.match(/\s*/um, true)
|
252
|
-
if @source.encoding == "UTF-8"
|
253
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
327
|
+
message = "Invalid XML"
|
328
|
+
raise REXML::ParseException.new(message, @source)
|
254
329
|
end
|
255
330
|
end
|
256
331
|
end
|
257
332
|
if @document_status == :in_doctype
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
333
|
+
@source.match(/\s*/um, true) # skip spaces
|
334
|
+
start_position = @source.position
|
335
|
+
if @source.match("<!", true)
|
336
|
+
if @source.match("ELEMENT", true)
|
337
|
+
md = @source.match(/(.*?)>/um, true)
|
338
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
339
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
340
|
+
elsif @source.match("ENTITY", true)
|
341
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
342
|
+
unless match_data
|
343
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
344
|
+
end
|
345
|
+
match = [:entitydecl, *match_data.captures.compact]
|
346
|
+
ref = false
|
347
|
+
if match[1] == '%'
|
348
|
+
ref = true
|
349
|
+
match.delete_at 1
|
350
|
+
end
|
351
|
+
# Now we have to sort out what kind of entity reference this is
|
352
|
+
if match[2] == 'SYSTEM'
|
353
|
+
# External reference
|
354
|
+
match[3] = match[3][1..-2] # PUBID
|
355
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
356
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
357
|
+
elsif match[2] == 'PUBLIC'
|
358
|
+
# External reference
|
359
|
+
match[3] = match[3][1..-2] # PUBID
|
360
|
+
match[4] = match[4][1..-2] # HREF
|
361
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
362
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
363
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
364
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
365
|
+
else
|
366
|
+
match[2] = match[2][1..-2]
|
367
|
+
match.pop if match.size == 4
|
368
|
+
# match is [ :entity, name, value ]
|
369
|
+
end
|
370
|
+
match << '%' if ref
|
371
|
+
return match
|
372
|
+
elsif @source.match("ATTLIST", true)
|
373
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
374
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
375
|
+
element = md[1]
|
376
|
+
contents = md[0]
|
377
|
+
|
378
|
+
pairs = {}
|
379
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
380
|
+
values.each do |attdef|
|
381
|
+
unless attdef[3] == "#IMPLIED"
|
382
|
+
attdef.compact!
|
383
|
+
val = attdef[3]
|
384
|
+
val = attdef[4] if val == "#FIXED "
|
385
|
+
pairs[attdef[0]] = val
|
386
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
387
|
+
@namespaces[$1] = val
|
388
|
+
end
|
310
389
|
end
|
311
390
|
end
|
391
|
+
return [ :attlistdecl, element, pairs, contents ]
|
392
|
+
elsif @source.match("NOTATION", true)
|
393
|
+
base_error_message = "Malformed notation declaration"
|
394
|
+
unless @source.match(/\s+/um, true)
|
395
|
+
if @source.match(">")
|
396
|
+
message = "#{base_error_message}: name is missing"
|
397
|
+
else
|
398
|
+
message = "#{base_error_message}: invalid name"
|
399
|
+
end
|
400
|
+
@source.position = start_position
|
401
|
+
raise REXML::ParseException.new(message, @source)
|
402
|
+
end
|
403
|
+
name = parse_name(base_error_message)
|
404
|
+
id = parse_id(base_error_message,
|
405
|
+
accept_external_id: true,
|
406
|
+
accept_public_id: true)
|
407
|
+
unless @source.match(/\s*>/um, true)
|
408
|
+
message = "#{base_error_message}: garbage before end >"
|
409
|
+
raise REXML::ParseException.new(message, @source)
|
410
|
+
end
|
411
|
+
return [:notationdecl, name, *id]
|
412
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
413
|
+
case md[1]
|
414
|
+
when /--/, /-\z/
|
415
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
416
|
+
end
|
417
|
+
return [ :comment, md[1] ] if md
|
312
418
|
end
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
if @source.match( PUBLIC )
|
317
|
-
md = @source.match( PUBLIC, true )
|
318
|
-
vals = [md[1],md[2],md[4],md[6]]
|
319
|
-
elsif @source.match( SYSTEM )
|
320
|
-
md = @source.match( SYSTEM, true )
|
321
|
-
vals = [md[1],md[2],nil,md[4]]
|
322
|
-
else
|
323
|
-
raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
|
324
|
-
end
|
325
|
-
return [ :notationdecl, *vals ]
|
326
|
-
when DOCTYPE_END
|
419
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
420
|
+
return [ :externalentity, match[1] ]
|
421
|
+
elsif @source.match(/\]\s*>/um, true)
|
327
422
|
@document_status = :after_doctype
|
328
|
-
@source.match( DOCTYPE_END, true )
|
329
423
|
return [ :end_doctype ]
|
330
424
|
end
|
425
|
+
if @document_status == :in_doctype
|
426
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
427
|
+
end
|
428
|
+
end
|
429
|
+
if @document_status == :after_doctype
|
430
|
+
@source.match(/\s*/um, true)
|
331
431
|
end
|
332
432
|
begin
|
333
|
-
|
334
|
-
|
335
|
-
|
433
|
+
start_position = @source.position
|
434
|
+
if @source.match("<", true)
|
435
|
+
# :text's read_until may remain only "<" in buffer. In the
|
436
|
+
# case, buffer is empty here. So we need to fill buffer
|
437
|
+
# here explicitly.
|
438
|
+
@source.ensure_buffer
|
439
|
+
if @source.match("/", true)
|
440
|
+
@namespaces_restore_stack.pop
|
336
441
|
last_tag = @tags.pop
|
337
|
-
md = @source.match(
|
442
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
338
443
|
if md and !last_tag
|
339
444
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
340
445
|
raise REXML::ParseException.new(message, @source)
|
341
446
|
end
|
342
447
|
if md.nil? or last_tag != md[1]
|
343
448
|
message = "Missing end tag for '#{last_tag}'"
|
344
|
-
message
|
449
|
+
message += " (got '#{md[1]}')" if md
|
450
|
+
@source.position = start_position if md.nil?
|
345
451
|
raise REXML::ParseException.new(message, @source)
|
346
452
|
end
|
347
453
|
return [ :end_element, last_tag ]
|
348
|
-
elsif @source.
|
349
|
-
md = @source.match(
|
454
|
+
elsif @source.match("!", true)
|
455
|
+
md = @source.match(/([^>]*>)/um)
|
350
456
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
351
457
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
352
|
-
if md[0][
|
353
|
-
md = @source.match(
|
458
|
+
if md[0][0] == ?-
|
459
|
+
md = @source.match(/--(.*?)-->/um, true)
|
354
460
|
|
355
|
-
|
356
|
-
when /--/, /-\z/
|
461
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
357
462
|
raise REXML::ParseException.new("Malformed comment", @source)
|
358
463
|
end
|
359
464
|
|
360
|
-
return [ :comment, md[1] ]
|
465
|
+
return [ :comment, md[1] ]
|
361
466
|
else
|
362
|
-
md = @source.match(
|
467
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
363
468
|
return [ :cdata, md[1] ] if md
|
364
469
|
end
|
365
470
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
366
471
|
"in the doctype declaration.", @source)
|
367
|
-
elsif @source.
|
472
|
+
elsif @source.match("?", true)
|
368
473
|
return process_instruction
|
369
474
|
else
|
370
475
|
# Get the next tag
|
371
|
-
md = @source.match(
|
476
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
372
477
|
unless md
|
478
|
+
@source.position = start_position
|
373
479
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
374
480
|
end
|
375
|
-
|
376
|
-
|
377
|
-
@
|
378
|
-
|
481
|
+
tag = md[1]
|
482
|
+
@document_status = :in_element
|
483
|
+
@prefixes.clear
|
484
|
+
@prefixes << md[2] if md[2]
|
485
|
+
push_namespaces_restore
|
486
|
+
attributes, closed = parse_attributes(@prefixes)
|
379
487
|
# Verify that all of the prefixes have been defined
|
380
|
-
for prefix in prefixes
|
381
|
-
unless @
|
488
|
+
for prefix in @prefixes
|
489
|
+
unless @namespaces.key?(prefix)
|
382
490
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
383
491
|
end
|
384
492
|
end
|
385
493
|
|
386
494
|
if closed
|
387
|
-
@closed =
|
388
|
-
|
495
|
+
@closed = tag
|
496
|
+
pop_namespaces_restore
|
389
497
|
else
|
390
|
-
@tags.
|
498
|
+
if @tags.empty? and @have_root
|
499
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
500
|
+
end
|
501
|
+
@tags.push( tag )
|
391
502
|
end
|
392
|
-
|
503
|
+
@have_root = true
|
504
|
+
return [ :start_element, tag, attributes ]
|
393
505
|
end
|
394
506
|
else
|
395
|
-
|
396
|
-
if
|
397
|
-
@source.
|
507
|
+
text = @source.read_until("<")
|
508
|
+
if text.chomp!("<")
|
509
|
+
@source.position -= "<".bytesize
|
398
510
|
end
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
511
|
+
if @tags.empty?
|
512
|
+
unless /\A\s*\z/.match?(text)
|
513
|
+
if @have_root
|
514
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
515
|
+
else
|
516
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
517
|
+
end
|
518
|
+
end
|
519
|
+
return pull_event if @have_root
|
520
|
+
end
|
521
|
+
return [ :text, text ]
|
404
522
|
end
|
405
523
|
rescue REXML::UndefinedNamespaceException
|
406
524
|
raise
|
@@ -415,13 +533,13 @@ module REXML
|
|
415
533
|
private :pull_event
|
416
534
|
|
417
535
|
def entity( reference, entities )
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
unnormalize( value, entities )
|
536
|
+
return unless entities
|
537
|
+
|
538
|
+
value = entities[ reference ]
|
539
|
+
return if value.nil?
|
540
|
+
|
541
|
+
record_entity_expansion
|
542
|
+
unnormalize( value, entities )
|
425
543
|
end
|
426
544
|
|
427
545
|
# Escapes all possible entities
|
@@ -442,132 +560,277 @@ module REXML
|
|
442
560
|
|
443
561
|
# Unescapes all possible entities
|
444
562
|
def unnormalize( string, entities=nil, filter=nil )
|
445
|
-
|
446
|
-
|
563
|
+
if string.include?("\r")
|
564
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
565
|
+
else
|
566
|
+
rv = string.dup
|
567
|
+
end
|
447
568
|
matches = rv.scan( REFERENCE_RE )
|
448
569
|
return rv if matches.size == 0
|
449
|
-
rv.gsub!(
|
570
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
450
571
|
m=$1
|
451
572
|
m = "0#{m}" if m[0] == ?x
|
452
573
|
[Integer(m)].pack('U*')
|
453
574
|
}
|
454
575
|
matches.collect!{|x|x[0]}.compact!
|
576
|
+
if filter
|
577
|
+
matches.reject! do |entity_reference|
|
578
|
+
filter.include?(entity_reference)
|
579
|
+
end
|
580
|
+
end
|
455
581
|
if matches.size > 0
|
456
|
-
matches.each do |entity_reference|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
rv.gsub!( er[0], er[2] ) if er
|
582
|
+
matches.tally.each do |entity_reference, n|
|
583
|
+
entity_expansion_count_before = @entity_expansion_count
|
584
|
+
entity_value = entity( entity_reference, entities )
|
585
|
+
if entity_value
|
586
|
+
if n > 1
|
587
|
+
entity_expansion_count_delta =
|
588
|
+
@entity_expansion_count - entity_expansion_count_before
|
589
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
465
590
|
end
|
591
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
592
|
+
rv.gsub!( re, entity_value )
|
593
|
+
if rv.bytesize > @entity_expansion_text_limit
|
594
|
+
raise "entity expansion has grown too large"
|
595
|
+
end
|
596
|
+
else
|
597
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
598
|
+
rv.gsub!( er[0], er[2] ) if er
|
466
599
|
end
|
467
600
|
end
|
468
|
-
rv.gsub!(
|
601
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
469
602
|
end
|
470
603
|
rv
|
471
604
|
end
|
472
605
|
|
473
606
|
private
|
607
|
+
def add_namespace(prefix, uri)
|
608
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
609
|
+
if uri.nil?
|
610
|
+
@namespaces.delete(prefix)
|
611
|
+
else
|
612
|
+
@namespaces[prefix] = uri
|
613
|
+
end
|
614
|
+
end
|
615
|
+
|
616
|
+
def push_namespaces_restore
|
617
|
+
namespaces_restore = {}
|
618
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
619
|
+
namespaces_restore
|
620
|
+
end
|
621
|
+
|
622
|
+
def pop_namespaces_restore
|
623
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
624
|
+
namespaces_restore.each do |prefix, uri|
|
625
|
+
if uri.nil?
|
626
|
+
@namespaces.delete(prefix)
|
627
|
+
else
|
628
|
+
@namespaces[prefix] = uri
|
629
|
+
end
|
630
|
+
end
|
631
|
+
end
|
632
|
+
|
633
|
+
def record_entity_expansion(delta=1)
|
634
|
+
@entity_expansion_count += delta
|
635
|
+
if @entity_expansion_count > @entity_expansion_limit
|
636
|
+
raise "number of entity expansions exceeded, processing aborted."
|
637
|
+
end
|
638
|
+
end
|
639
|
+
|
474
640
|
def need_source_encoding_update?(xml_declaration_encoding)
|
475
641
|
return false if xml_declaration_encoding.nil?
|
476
642
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
477
643
|
true
|
478
644
|
end
|
479
645
|
|
480
|
-
def
|
481
|
-
|
482
|
-
unless
|
483
|
-
|
646
|
+
def parse_name(base_error_message)
|
647
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
648
|
+
unless md
|
649
|
+
if @source.match(/\S/um)
|
650
|
+
message = "#{base_error_message}: invalid name"
|
651
|
+
else
|
652
|
+
message = "#{base_error_message}: name is missing"
|
653
|
+
end
|
484
654
|
raise REXML::ParseException.new(message, @source)
|
485
655
|
end
|
486
|
-
[
|
656
|
+
md[0]
|
487
657
|
end
|
488
658
|
|
489
|
-
def
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
659
|
+
def parse_id(base_error_message,
|
660
|
+
accept_external_id:,
|
661
|
+
accept_public_id:)
|
662
|
+
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
|
663
|
+
pubid = system = nil
|
664
|
+
pubid_literal = md[1]
|
665
|
+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
|
666
|
+
system_literal = md[2]
|
667
|
+
system = system_literal[1..-2] if system_literal # Remove quote
|
668
|
+
["PUBLIC", pubid, system]
|
669
|
+
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
|
670
|
+
pubid = system = nil
|
671
|
+
pubid_literal = md[1]
|
672
|
+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
|
673
|
+
["PUBLIC", pubid, nil]
|
674
|
+
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
|
675
|
+
system = nil
|
676
|
+
system_literal = md[1]
|
677
|
+
system = system_literal[1..-2] if system_literal # Remove quote
|
678
|
+
["SYSTEM", nil, system]
|
679
|
+
else
|
680
|
+
details = parse_id_invalid_details(accept_external_id: accept_external_id,
|
681
|
+
accept_public_id: accept_public_id)
|
682
|
+
message = "#{base_error_message}: #{details}"
|
495
683
|
raise REXML::ParseException.new(message, @source)
|
496
684
|
end
|
685
|
+
end
|
497
686
|
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
687
|
+
def parse_id_invalid_details(accept_external_id:,
|
688
|
+
accept_public_id:)
|
689
|
+
public = /\A\s*PUBLIC/um
|
690
|
+
system = /\A\s*SYSTEM/um
|
691
|
+
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
|
692
|
+
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
693
|
+
return "public ID literal is missing"
|
694
|
+
end
|
695
|
+
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
|
696
|
+
return "invalid public ID literal"
|
697
|
+
end
|
698
|
+
if accept_public_id
|
699
|
+
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
700
|
+
return "system ID literal is missing"
|
701
|
+
end
|
702
|
+
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
703
|
+
return "invalid system literal"
|
704
|
+
end
|
705
|
+
"garbage after system literal"
|
706
|
+
else
|
707
|
+
"garbage after public ID literal"
|
708
|
+
end
|
709
|
+
elsif accept_external_id and @source.match(/#{system}/um)
|
710
|
+
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
711
|
+
return "system literal is missing"
|
712
|
+
end
|
713
|
+
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
|
714
|
+
return "invalid system literal"
|
715
|
+
end
|
716
|
+
"garbage after system literal"
|
717
|
+
else
|
718
|
+
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
719
|
+
return "invalid ID type"
|
720
|
+
end
|
721
|
+
"ID type is missing"
|
722
|
+
end
|
723
|
+
end
|
502
724
|
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
725
|
+
def process_instruction
|
726
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
727
|
+
if @source.match(/\s+/um, true)
|
728
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
729
|
+
unless match_data
|
730
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
507
731
|
end
|
732
|
+
content = match_data[1]
|
733
|
+
else
|
734
|
+
content = nil
|
735
|
+
unless @source.match("?>", true)
|
736
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
737
|
+
end
|
738
|
+
end
|
739
|
+
if name == "xml"
|
740
|
+
if @document_status
|
741
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
742
|
+
end
|
743
|
+
version = VERSION.match(content)
|
744
|
+
version = version[1] unless version.nil?
|
745
|
+
encoding = ENCODING.match(content)
|
746
|
+
encoding = encoding[1] unless encoding.nil?
|
747
|
+
if need_source_encoding_update?(encoding)
|
748
|
+
@source.encoding = encoding
|
749
|
+
end
|
750
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
751
|
+
encoding = "UTF-16"
|
752
|
+
end
|
753
|
+
standalone = STANDALONE.match(content)
|
754
|
+
standalone = standalone[1] unless standalone.nil?
|
755
|
+
return [ :xmldecl, version, encoding, standalone ]
|
756
|
+
end
|
757
|
+
[:processing_instruction, name, content]
|
758
|
+
end
|
508
759
|
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
760
|
+
def parse_attributes(prefixes)
|
761
|
+
attributes = {}
|
762
|
+
expanded_names = {}
|
763
|
+
closed = false
|
764
|
+
while true
|
765
|
+
if @source.match(">", true)
|
766
|
+
return attributes, closed
|
767
|
+
elsif @source.match("/>", true)
|
768
|
+
closed = true
|
769
|
+
return attributes, closed
|
770
|
+
elsif match = @source.match(QNAME, true)
|
771
|
+
name = match[1]
|
772
|
+
prefix = match[2]
|
773
|
+
local_part = match[3]
|
774
|
+
|
775
|
+
unless @source.match(/\s*=\s*/um, true)
|
518
776
|
message = "Missing attribute equal: <#{name}>"
|
519
777
|
raise REXML::ParseException.new(message, @source)
|
520
778
|
end
|
521
|
-
|
522
|
-
unless quote
|
779
|
+
unless match = @source.match(/(['"])/, true)
|
523
780
|
message = "Missing attribute value start quote: <#{name}>"
|
524
781
|
raise REXML::ParseException.new(message, @source)
|
525
782
|
end
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
scanner.pos = pos
|
533
|
-
closed = !match_data[2].nil?
|
534
|
-
next
|
535
|
-
end
|
536
|
-
message =
|
537
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
783
|
+
quote = match[1]
|
784
|
+
start_position = @source.position
|
785
|
+
value = @source.read_until(quote)
|
786
|
+
unless value.chomp!(quote)
|
787
|
+
@source.position = start_position
|
788
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
538
789
|
raise REXML::ParseException.new(message, @source)
|
539
790
|
end
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
msg = "The '
|
791
|
+
@source.match(/\s*/um, true)
|
792
|
+
if prefix == "xmlns"
|
793
|
+
if local_part == "xml"
|
794
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
795
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
796
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
797
|
+
raise REXML::ParseException.new( msg, @source, self )
|
798
|
+
end
|
799
|
+
elsif local_part == "xmlns"
|
800
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
550
801
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
551
|
-
raise REXML::ParseException.new( msg, @source, self
|
802
|
+
raise REXML::ParseException.new( msg, @source, self)
|
552
803
|
end
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
raise REXML::ParseException.new( msg, @source, self)
|
804
|
+
add_namespace(local_part, value)
|
805
|
+
elsif prefix
|
806
|
+
prefixes << prefix unless prefix == "xml"
|
557
807
|
end
|
558
|
-
curr_ns << local_part
|
559
|
-
elsif prefix
|
560
|
-
prefixes << prefix unless prefix == "xml"
|
561
|
-
end
|
562
808
|
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
809
|
+
if attributes[name]
|
810
|
+
msg = "Duplicate attribute #{name.inspect}"
|
811
|
+
raise REXML::ParseException.new(msg, @source, self)
|
812
|
+
end
|
567
813
|
|
568
|
-
|
814
|
+
unless prefix == "xmlns"
|
815
|
+
uri = @namespaces[prefix]
|
816
|
+
expanded_name = [uri, local_part]
|
817
|
+
existing_prefix = expanded_names[expanded_name]
|
818
|
+
if existing_prefix
|
819
|
+
message = "Namespace conflict in adding attribute " +
|
820
|
+
"\"#{local_part}\": " +
|
821
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
822
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
823
|
+
raise REXML::ParseException.new(message, @source, self)
|
824
|
+
end
|
825
|
+
expanded_names[expanded_name] = prefix
|
826
|
+
end
|
827
|
+
|
828
|
+
attributes[name] = value
|
829
|
+
else
|
830
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
831
|
+
raise REXML::ParseException.new(message, @source)
|
832
|
+
end
|
569
833
|
end
|
570
|
-
return attributes, closed
|
571
834
|
end
|
572
835
|
end
|
573
836
|
end
|