rexml 3.2.4 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +558 -0
- data/README.md +11 -14
- data/doc/rexml/context.rdoc +143 -0
- data/doc/rexml/tasks/rdoc/child.rdoc +87 -0
- data/doc/rexml/tasks/rdoc/document.rdoc +276 -0
- data/doc/rexml/tasks/rdoc/element.rdoc +602 -0
- data/doc/rexml/tasks/rdoc/node.rdoc +97 -0
- data/doc/rexml/tasks/rdoc/parent.rdoc +267 -0
- data/doc/rexml/tasks/tocs/child_toc.rdoc +12 -0
- data/doc/rexml/tasks/tocs/document_toc.rdoc +30 -0
- data/doc/rexml/tasks/tocs/element_toc.rdoc +55 -0
- data/doc/rexml/tasks/tocs/master_toc.rdoc +135 -0
- data/doc/rexml/tasks/tocs/node_toc.rdoc +16 -0
- data/doc/rexml/tasks/tocs/parent_toc.rdoc +25 -0
- data/doc/rexml/tutorial.rdoc +1358 -0
- data/lib/rexml/attribute.rb +17 -11
- data/lib/rexml/doctype.rb +55 -31
- data/lib/rexml/document.rb +199 -35
- data/lib/rexml/element.rb +1802 -487
- data/lib/rexml/entity.rb +9 -38
- data/lib/rexml/formatters/pretty.rb +3 -3
- data/lib/rexml/functions.rb +1 -2
- data/lib/rexml/light/node.rb +0 -8
- data/lib/rexml/namespace.rb +8 -4
- data/lib/rexml/node.rb +8 -4
- data/lib/rexml/parseexception.rb +1 -0
- data/lib/rexml/parsers/baseparser.rb +545 -252
- data/lib/rexml/parsers/pullparser.rb +16 -0
- data/lib/rexml/parsers/sax2parser.rb +16 -19
- data/lib/rexml/parsers/streamparser.rb +16 -10
- data/lib/rexml/parsers/treeparser.rb +9 -21
- data/lib/rexml/parsers/xpathparser.rb +161 -97
- data/lib/rexml/rexml.rb +29 -22
- data/lib/rexml/source.rb +185 -100
- data/lib/rexml/text.rb +60 -61
- data/lib/rexml/xpath_parser.rb +43 -33
- data/lib/rexml.rb +3 -0
- metadata +42 -46
- data/.gitignore +0 -9
- data/.travis.yml +0 -24
- data/Gemfile +0 -6
- data/Rakefile +0 -8
- data/rexml.gemspec +0 -84
@@ -1,12 +1,40 @@
|
|
1
|
-
# frozen_string_literal:
|
1
|
+
# frozen_string_literal: true
|
2
2
|
require_relative '../parseexception'
|
3
3
|
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../security'
|
4
5
|
require_relative '../source'
|
5
6
|
require 'set'
|
6
7
|
require "strscan"
|
7
8
|
|
8
9
|
module REXML
|
9
10
|
module Parsers
|
11
|
+
unless [].respond_to?(:tally)
|
12
|
+
module EnumerableTally
|
13
|
+
refine Enumerable do
|
14
|
+
def tally
|
15
|
+
counts = {}
|
16
|
+
each do |item|
|
17
|
+
counts[item] ||= 0
|
18
|
+
counts[item] += 1
|
19
|
+
end
|
20
|
+
counts
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
using EnumerableTally
|
25
|
+
end
|
26
|
+
|
27
|
+
if StringScanner::Version < "3.0.8"
|
28
|
+
module StringScannerCaptures
|
29
|
+
refine StringScanner do
|
30
|
+
def captures
|
31
|
+
values_at(*(1...size))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
using StringScannerCaptures
|
36
|
+
end
|
37
|
+
|
10
38
|
# = Using the Pull Parser
|
11
39
|
# <em>This API is experimental, and subject to change.</em>
|
12
40
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
@@ -50,7 +78,6 @@ module REXML
|
|
50
78
|
|
51
79
|
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
|
52
80
|
DOCTYPE_END = /\A\s*\]\s*>/um
|
53
|
-
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
|
54
81
|
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
|
55
82
|
COMMENT_START = /\A<!--/u
|
56
83
|
COMMENT_PATTERN = /<!--(.*?)-->/um
|
@@ -61,15 +88,14 @@ module REXML
|
|
61
88
|
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
|
62
89
|
INSTRUCTION_START = /\A<\?/u
|
63
90
|
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
|
64
|
-
TAG_MATCH =
|
65
|
-
CLOSE_MATCH =
|
91
|
+
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
|
92
|
+
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
|
66
93
|
|
67
94
|
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
|
68
95
|
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
|
69
96
|
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
|
70
97
|
|
71
98
|
ENTITY_START = /\A\s*<!ENTITY/
|
72
|
-
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
|
73
99
|
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
|
74
100
|
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
|
75
101
|
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
|
@@ -83,9 +109,6 @@ module REXML
|
|
83
109
|
ATTDEF_RE = /#{ATTDEF}/
|
84
110
|
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
|
85
111
|
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
86
|
-
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
87
|
-
PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
|
88
|
-
SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
|
89
112
|
|
90
113
|
TEXT_PATTERN = /\A([^<]*)/um
|
91
114
|
|
@@ -101,7 +124,12 @@ module REXML
|
|
101
124
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
102
125
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
103
126
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
104
|
-
ENTITYDECL = /\s*(?:#{GEDECL})
|
127
|
+
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
128
|
+
|
129
|
+
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
130
|
+
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
131
|
+
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
|
132
|
+
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
|
105
133
|
|
106
134
|
EREFERENCE = /&(?!#{NAME};)/
|
107
135
|
|
@@ -112,9 +140,34 @@ module REXML
|
|
112
140
|
"apos" => [/'/, "'", "'", /'/]
|
113
141
|
}
|
114
142
|
|
143
|
+
module Private
|
144
|
+
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
|
145
|
+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
146
|
+
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
147
|
+
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
148
|
+
NAME_PATTERN = /#{NAME}/um
|
149
|
+
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
150
|
+
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
151
|
+
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
152
|
+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
|
153
|
+
CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
|
154
|
+
DEFAULT_ENTITIES_PATTERNS = {}
|
155
|
+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
|
156
|
+
default_entities.each do |term|
|
157
|
+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
|
158
|
+
end
|
159
|
+
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
160
|
+
end
|
161
|
+
private_constant :Private
|
162
|
+
|
115
163
|
def initialize( source )
|
116
164
|
self.stream = source
|
117
165
|
@listeners = []
|
166
|
+
@prefixes = Set.new
|
167
|
+
@entity_expansion_count = 0
|
168
|
+
@entity_expansion_limit = Security.entity_expansion_limit
|
169
|
+
@entity_expansion_text_limit = Security.entity_expansion_text_limit
|
170
|
+
@source.ensure_buffer
|
118
171
|
end
|
119
172
|
|
120
173
|
def add_listener( listener )
|
@@ -122,15 +175,24 @@ module REXML
|
|
122
175
|
end
|
123
176
|
|
124
177
|
attr_reader :source
|
178
|
+
attr_reader :entity_expansion_count
|
179
|
+
attr_writer :entity_expansion_limit
|
180
|
+
attr_writer :entity_expansion_text_limit
|
125
181
|
|
126
182
|
def stream=( source )
|
127
183
|
@source = SourceFactory.create_from( source )
|
184
|
+
reset
|
185
|
+
end
|
186
|
+
|
187
|
+
def reset
|
128
188
|
@closed = nil
|
189
|
+
@have_root = false
|
129
190
|
@document_status = nil
|
130
191
|
@tags = []
|
131
192
|
@stack = []
|
132
193
|
@entities = []
|
133
|
-
@
|
194
|
+
@namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
|
195
|
+
@namespaces_restore_stack = []
|
134
196
|
end
|
135
197
|
|
136
198
|
def position
|
@@ -180,6 +242,8 @@ module REXML
|
|
180
242
|
|
181
243
|
# Returns the next event. This is a +PullEvent+ object.
|
182
244
|
def pull
|
245
|
+
@source.drop_parsed_content
|
246
|
+
|
183
247
|
pull_event.tap do |event|
|
184
248
|
@listeners.each do |listener|
|
185
249
|
listener.receive event
|
@@ -192,215 +256,277 @@ module REXML
|
|
192
256
|
x, @closed = @closed, nil
|
193
257
|
return [ :end_element, x ]
|
194
258
|
end
|
195
|
-
|
259
|
+
if empty?
|
260
|
+
if @document_status == :in_doctype
|
261
|
+
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
|
262
|
+
end
|
263
|
+
unless @tags.empty?
|
264
|
+
path = "/" + @tags.join("/")
|
265
|
+
raise ParseException.new("Missing end tag for '#{path}'", @source)
|
266
|
+
end
|
267
|
+
return [ :end_document ]
|
268
|
+
end
|
196
269
|
return @stack.shift if @stack.size > 0
|
197
270
|
#STDERR.puts @source.encoding
|
198
|
-
@source.read if @source.buffer.size<2
|
199
271
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
272
|
+
|
273
|
+
@source.ensure_buffer
|
200
274
|
if @document_status == nil
|
201
|
-
|
202
|
-
|
203
|
-
word = word[1] unless word.nil?
|
204
|
-
#STDERR.puts "WORD = #{word.inspect}"
|
205
|
-
case word
|
206
|
-
when COMMENT_START
|
207
|
-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
208
|
-
when XMLDECL_START
|
209
|
-
#STDERR.puts "XMLDECL"
|
210
|
-
results = @source.match( XMLDECL_PATTERN, true )[1]
|
211
|
-
version = VERSION.match( results )
|
212
|
-
version = version[1] unless version.nil?
|
213
|
-
encoding = ENCODING.match(results)
|
214
|
-
encoding = encoding[1] unless encoding.nil?
|
215
|
-
if need_source_encoding_update?(encoding)
|
216
|
-
@source.encoding = encoding
|
217
|
-
end
|
218
|
-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
219
|
-
encoding = "UTF-16"
|
220
|
-
end
|
221
|
-
standalone = STANDALONE.match(results)
|
222
|
-
standalone = standalone[1] unless standalone.nil?
|
223
|
-
return [ :xmldecl, version, encoding, standalone ]
|
224
|
-
when INSTRUCTION_START
|
275
|
+
start_position = @source.position
|
276
|
+
if @source.match?("<?", true)
|
225
277
|
return process_instruction
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
278
|
+
elsif @source.match?("<!", true)
|
279
|
+
if @source.match?("--", true)
|
280
|
+
md = @source.match(/(.*?)-->/um, true)
|
281
|
+
if md.nil?
|
282
|
+
raise REXML::ParseException.new("Unclosed comment", @source)
|
283
|
+
end
|
284
|
+
if /--|-\z/.match?(md[1])
|
285
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
286
|
+
end
|
287
|
+
return [ :comment, md[1] ]
|
288
|
+
elsif @source.match?("DOCTYPE", true)
|
289
|
+
base_error_message = "Malformed DOCTYPE"
|
290
|
+
unless @source.match?(/\s+/um, true)
|
291
|
+
if @source.match?(">")
|
292
|
+
message = "#{base_error_message}: name is missing"
|
293
|
+
else
|
294
|
+
message = "#{base_error_message}: invalid name"
|
295
|
+
end
|
296
|
+
@source.position = start_position
|
297
|
+
raise REXML::ParseException.new(message, @source)
|
298
|
+
end
|
299
|
+
name = parse_name(base_error_message)
|
300
|
+
@source.match?(/\s*/um, true) # skip spaces
|
301
|
+
if @source.match?("[", true)
|
302
|
+
id = [nil, nil, nil]
|
303
|
+
@document_status = :in_doctype
|
304
|
+
elsif @source.match?(">", true)
|
305
|
+
id = [nil, nil, nil]
|
306
|
+
@document_status = :after_doctype
|
307
|
+
@source.ensure_buffer
|
308
|
+
else
|
309
|
+
id = parse_id(base_error_message,
|
310
|
+
accept_external_id: true,
|
311
|
+
accept_public_id: false)
|
312
|
+
if id[0] == "SYSTEM"
|
313
|
+
# For backward compatibility
|
314
|
+
id[1], id[2] = id[2], nil
|
315
|
+
end
|
316
|
+
@source.match?(/\s*/um, true) # skip spaces
|
317
|
+
if @source.match?("[", true)
|
318
|
+
@document_status = :in_doctype
|
319
|
+
elsif @source.match?(">", true)
|
320
|
+
@document_status = :after_doctype
|
321
|
+
@source.ensure_buffer
|
322
|
+
else
|
323
|
+
message = "#{base_error_message}: garbage after external ID"
|
324
|
+
raise REXML::ParseException.new(message, @source)
|
325
|
+
end
|
326
|
+
end
|
327
|
+
args = [:start_doctype, name, *id]
|
328
|
+
if @document_status == :after_doctype
|
329
|
+
@source.match?(/\s*/um, true)
|
330
|
+
@stack << [ :end_doctype ]
|
331
|
+
end
|
332
|
+
return args
|
243
333
|
else
|
244
|
-
|
245
|
-
|
246
|
-
return args
|
247
|
-
when /^\s+/
|
248
|
-
else
|
249
|
-
@document_status = :after_doctype
|
250
|
-
@source.read if @source.buffer.size<2
|
251
|
-
md = @source.match(/\s*/um, true)
|
252
|
-
if @source.encoding == "UTF-8"
|
253
|
-
@source.buffer.force_encoding(::Encoding::UTF_8)
|
334
|
+
message = "Invalid XML"
|
335
|
+
raise REXML::ParseException.new(message, @source)
|
254
336
|
end
|
255
337
|
end
|
256
338
|
end
|
257
339
|
if @document_status == :in_doctype
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
340
|
+
@source.match?(/\s*/um, true) # skip spaces
|
341
|
+
start_position = @source.position
|
342
|
+
if @source.match?("<!", true)
|
343
|
+
if @source.match?("ELEMENT", true)
|
344
|
+
md = @source.match(/(.*?)>/um, true)
|
345
|
+
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
346
|
+
return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
347
|
+
elsif @source.match?("ENTITY", true)
|
348
|
+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
|
349
|
+
unless match_data
|
350
|
+
raise REXML::ParseException.new("Malformed entity declaration", @source)
|
351
|
+
end
|
352
|
+
match = [:entitydecl, *match_data.captures.compact]
|
353
|
+
ref = false
|
354
|
+
if match[1] == '%'
|
355
|
+
ref = true
|
356
|
+
match.delete_at 1
|
357
|
+
end
|
358
|
+
# Now we have to sort out what kind of entity reference this is
|
359
|
+
if match[2] == 'SYSTEM'
|
360
|
+
# External reference
|
361
|
+
match[3] = match[3][1..-2] # PUBID
|
362
|
+
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
363
|
+
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
364
|
+
elsif match[2] == 'PUBLIC'
|
365
|
+
# External reference
|
366
|
+
match[3] = match[3][1..-2] # PUBID
|
367
|
+
match[4] = match[4][1..-2] # HREF
|
368
|
+
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
369
|
+
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
370
|
+
elsif Private::PEREFERENCE_PATTERN.match?(match[2])
|
371
|
+
raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
|
372
|
+
else
|
373
|
+
match[2] = match[2][1..-2]
|
374
|
+
match.pop if match.size == 4
|
375
|
+
# match is [ :entity, name, value ]
|
376
|
+
end
|
377
|
+
match << '%' if ref
|
378
|
+
return match
|
379
|
+
elsif @source.match?("ATTLIST", true)
|
380
|
+
md = @source.match(Private::ATTLISTDECL_END, true)
|
381
|
+
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
382
|
+
element = md[1]
|
383
|
+
contents = "<!ATTLIST" + md[0]
|
384
|
+
|
385
|
+
pairs = {}
|
386
|
+
values = md[0].strip.scan( ATTDEF_RE )
|
387
|
+
values.each do |attdef|
|
388
|
+
unless attdef[3] == "#IMPLIED"
|
389
|
+
attdef.compact!
|
390
|
+
val = attdef[3]
|
391
|
+
val = attdef[4] if val == "#FIXED "
|
392
|
+
pairs[attdef[0]] = val
|
393
|
+
if attdef[0] =~ /^xmlns:(.*)/
|
394
|
+
@namespaces[$1] = val
|
395
|
+
end
|
310
396
|
end
|
311
397
|
end
|
398
|
+
return [ :attlistdecl, element, pairs, contents ]
|
399
|
+
elsif @source.match?("NOTATION", true)
|
400
|
+
base_error_message = "Malformed notation declaration"
|
401
|
+
unless @source.match?(/\s+/um, true)
|
402
|
+
if @source.match?(">")
|
403
|
+
message = "#{base_error_message}: name is missing"
|
404
|
+
else
|
405
|
+
message = "#{base_error_message}: invalid name"
|
406
|
+
end
|
407
|
+
@source.position = start_position
|
408
|
+
raise REXML::ParseException.new(message, @source)
|
409
|
+
end
|
410
|
+
name = parse_name(base_error_message)
|
411
|
+
id = parse_id(base_error_message,
|
412
|
+
accept_external_id: true,
|
413
|
+
accept_public_id: true)
|
414
|
+
@source.match?(/\s*/um, true) # skip spaces
|
415
|
+
unless @source.match?(">", true)
|
416
|
+
message = "#{base_error_message}: garbage before end >"
|
417
|
+
raise REXML::ParseException.new(message, @source)
|
418
|
+
end
|
419
|
+
return [:notationdecl, name, *id]
|
420
|
+
elsif md = @source.match(/--(.*?)-->/um, true)
|
421
|
+
case md[1]
|
422
|
+
when /--/, /-\z/
|
423
|
+
raise REXML::ParseException.new("Malformed comment", @source)
|
424
|
+
end
|
425
|
+
return [ :comment, md[1] ] if md
|
312
426
|
end
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
if @source.match( PUBLIC )
|
317
|
-
md = @source.match( PUBLIC, true )
|
318
|
-
vals = [md[1],md[2],md[4],md[6]]
|
319
|
-
elsif @source.match( SYSTEM )
|
320
|
-
md = @source.match( SYSTEM, true )
|
321
|
-
vals = [md[1],md[2],nil,md[4]]
|
322
|
-
else
|
323
|
-
raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
|
324
|
-
end
|
325
|
-
return [ :notationdecl, *vals ]
|
326
|
-
when DOCTYPE_END
|
427
|
+
elsif match = @source.match(/(%.*?;)\s*/um, true)
|
428
|
+
return [ :externalentity, match[1] ]
|
429
|
+
elsif @source.match?(/\]\s*>/um, true)
|
327
430
|
@document_status = :after_doctype
|
328
|
-
@source.match( DOCTYPE_END, true )
|
329
431
|
return [ :end_doctype ]
|
330
432
|
end
|
433
|
+
if @document_status == :in_doctype
|
434
|
+
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
|
435
|
+
end
|
436
|
+
end
|
437
|
+
if @document_status == :after_doctype
|
438
|
+
@source.match?(/\s*/um, true)
|
331
439
|
end
|
332
440
|
begin
|
333
|
-
|
334
|
-
|
335
|
-
|
441
|
+
start_position = @source.position
|
442
|
+
if @source.match?("<", true)
|
443
|
+
# :text's read_until may remain only "<" in buffer. In the
|
444
|
+
# case, buffer is empty here. So we need to fill buffer
|
445
|
+
# here explicitly.
|
446
|
+
@source.ensure_buffer
|
447
|
+
if @source.match?("/", true)
|
448
|
+
@namespaces_restore_stack.pop
|
336
449
|
last_tag = @tags.pop
|
337
|
-
md = @source.match(
|
450
|
+
md = @source.match(Private::CLOSE_PATTERN, true)
|
338
451
|
if md and !last_tag
|
339
452
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
340
453
|
raise REXML::ParseException.new(message, @source)
|
341
454
|
end
|
342
455
|
if md.nil? or last_tag != md[1]
|
343
456
|
message = "Missing end tag for '#{last_tag}'"
|
344
|
-
message
|
457
|
+
message += " (got '#{md[1]}')" if md
|
458
|
+
@source.position = start_position if md.nil?
|
345
459
|
raise REXML::ParseException.new(message, @source)
|
346
460
|
end
|
347
461
|
return [ :end_element, last_tag ]
|
348
|
-
elsif @source.
|
349
|
-
md = @source.match(
|
462
|
+
elsif @source.match?("!", true)
|
463
|
+
md = @source.match(/([^>]*>)/um)
|
350
464
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
351
465
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
352
|
-
if md[0][
|
353
|
-
md = @source.match(
|
466
|
+
if md[0][0] == ?-
|
467
|
+
md = @source.match(/--(.*?)-->/um, true)
|
354
468
|
|
355
|
-
|
356
|
-
when /--/, /-\z/
|
469
|
+
if md.nil? || /--|-\z/.match?(md[1])
|
357
470
|
raise REXML::ParseException.new("Malformed comment", @source)
|
358
471
|
end
|
359
472
|
|
360
|
-
return [ :comment, md[1] ]
|
473
|
+
return [ :comment, md[1] ]
|
361
474
|
else
|
362
|
-
md = @source.match(
|
475
|
+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
363
476
|
return [ :cdata, md[1] ] if md
|
364
477
|
end
|
365
478
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
366
479
|
"in the doctype declaration.", @source)
|
367
|
-
elsif @source.
|
480
|
+
elsif @source.match?("?", true)
|
368
481
|
return process_instruction
|
369
482
|
else
|
370
483
|
# Get the next tag
|
371
|
-
md = @source.match(
|
484
|
+
md = @source.match(Private::TAG_PATTERN, true)
|
372
485
|
unless md
|
486
|
+
@source.position = start_position
|
373
487
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
374
488
|
end
|
375
|
-
|
376
|
-
|
377
|
-
@
|
378
|
-
|
489
|
+
tag = md[1]
|
490
|
+
@document_status = :in_element
|
491
|
+
@prefixes.clear
|
492
|
+
@prefixes << md[2] if md[2]
|
493
|
+
push_namespaces_restore
|
494
|
+
attributes, closed = parse_attributes(@prefixes)
|
379
495
|
# Verify that all of the prefixes have been defined
|
380
|
-
for prefix in prefixes
|
381
|
-
unless @
|
496
|
+
for prefix in @prefixes
|
497
|
+
unless @namespaces.key?(prefix)
|
382
498
|
raise UndefinedNamespaceException.new(prefix,@source,self)
|
383
499
|
end
|
384
500
|
end
|
385
501
|
|
386
502
|
if closed
|
387
|
-
@closed =
|
388
|
-
|
503
|
+
@closed = tag
|
504
|
+
pop_namespaces_restore
|
389
505
|
else
|
390
|
-
@tags.
|
506
|
+
if @tags.empty? and @have_root
|
507
|
+
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
|
508
|
+
end
|
509
|
+
@tags.push( tag )
|
391
510
|
end
|
392
|
-
|
511
|
+
@have_root = true
|
512
|
+
return [ :start_element, tag, attributes ]
|
393
513
|
end
|
394
514
|
else
|
395
|
-
|
396
|
-
if
|
397
|
-
@source.
|
515
|
+
text = @source.read_until("<")
|
516
|
+
if text.chomp!("<")
|
517
|
+
@source.position -= "<".bytesize
|
398
518
|
end
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
519
|
+
if @tags.empty?
|
520
|
+
unless /\A\s*\z/.match?(text)
|
521
|
+
if @have_root
|
522
|
+
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
|
523
|
+
else
|
524
|
+
raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
|
525
|
+
end
|
526
|
+
end
|
527
|
+
return pull_event if @have_root
|
528
|
+
end
|
529
|
+
return [ :text, text ]
|
404
530
|
end
|
405
531
|
rescue REXML::UndefinedNamespaceException
|
406
532
|
raise
|
@@ -415,13 +541,13 @@ module REXML
|
|
415
541
|
private :pull_event
|
416
542
|
|
417
543
|
def entity( reference, entities )
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
unnormalize( value, entities )
|
544
|
+
return unless entities
|
545
|
+
|
546
|
+
value = entities[ reference ]
|
547
|
+
return if value.nil?
|
548
|
+
|
549
|
+
record_entity_expansion
|
550
|
+
unnormalize( value, entities )
|
425
551
|
end
|
426
552
|
|
427
553
|
# Escapes all possible entities
|
@@ -442,132 +568,299 @@ module REXML
|
|
442
568
|
|
443
569
|
# Unescapes all possible entities
|
444
570
|
def unnormalize( string, entities=nil, filter=nil )
|
445
|
-
|
446
|
-
|
571
|
+
if string.include?("\r")
|
572
|
+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
|
573
|
+
else
|
574
|
+
rv = string.dup
|
575
|
+
end
|
447
576
|
matches = rv.scan( REFERENCE_RE )
|
448
577
|
return rv if matches.size == 0
|
449
|
-
rv.gsub!(
|
578
|
+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
|
450
579
|
m=$1
|
451
|
-
|
452
|
-
|
580
|
+
if m.start_with?("x")
|
581
|
+
code_point = Integer(m[1..-1], 16)
|
582
|
+
else
|
583
|
+
code_point = Integer(m, 10)
|
584
|
+
end
|
585
|
+
[code_point].pack('U*')
|
453
586
|
}
|
454
587
|
matches.collect!{|x|x[0]}.compact!
|
588
|
+
if filter
|
589
|
+
matches.reject! do |entity_reference|
|
590
|
+
filter.include?(entity_reference)
|
591
|
+
end
|
592
|
+
end
|
455
593
|
if matches.size > 0
|
456
|
-
matches.each do |entity_reference|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
rv.gsub!( er[0], er[2] ) if er
|
594
|
+
matches.tally.each do |entity_reference, n|
|
595
|
+
entity_expansion_count_before = @entity_expansion_count
|
596
|
+
entity_value = entity( entity_reference, entities )
|
597
|
+
if entity_value
|
598
|
+
if n > 1
|
599
|
+
entity_expansion_count_delta =
|
600
|
+
@entity_expansion_count - entity_expansion_count_before
|
601
|
+
record_entity_expansion(entity_expansion_count_delta * (n - 1))
|
465
602
|
end
|
603
|
+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
|
604
|
+
rv.gsub!( re, entity_value )
|
605
|
+
if rv.bytesize > @entity_expansion_text_limit
|
606
|
+
raise "entity expansion has grown too large"
|
607
|
+
end
|
608
|
+
else
|
609
|
+
er = DEFAULT_ENTITIES[entity_reference]
|
610
|
+
rv.gsub!( er[0], er[2] ) if er
|
466
611
|
end
|
467
612
|
end
|
468
|
-
rv.gsub!(
|
613
|
+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
|
469
614
|
end
|
470
615
|
rv
|
471
616
|
end
|
472
617
|
|
473
618
|
private
|
619
|
+
def add_namespace(prefix, uri)
|
620
|
+
@namespaces_restore_stack.last[prefix] = @namespaces[prefix]
|
621
|
+
if uri.nil?
|
622
|
+
@namespaces.delete(prefix)
|
623
|
+
else
|
624
|
+
@namespaces[prefix] = uri
|
625
|
+
end
|
626
|
+
end
|
627
|
+
|
628
|
+
def push_namespaces_restore
|
629
|
+
namespaces_restore = {}
|
630
|
+
@namespaces_restore_stack.push(namespaces_restore)
|
631
|
+
namespaces_restore
|
632
|
+
end
|
633
|
+
|
634
|
+
def pop_namespaces_restore
|
635
|
+
namespaces_restore = @namespaces_restore_stack.pop
|
636
|
+
namespaces_restore.each do |prefix, uri|
|
637
|
+
if uri.nil?
|
638
|
+
@namespaces.delete(prefix)
|
639
|
+
else
|
640
|
+
@namespaces[prefix] = uri
|
641
|
+
end
|
642
|
+
end
|
643
|
+
end
|
644
|
+
|
645
|
+
def record_entity_expansion(delta=1)
|
646
|
+
@entity_expansion_count += delta
|
647
|
+
if @entity_expansion_count > @entity_expansion_limit
|
648
|
+
raise "number of entity expansions exceeded, processing aborted."
|
649
|
+
end
|
650
|
+
end
|
651
|
+
|
474
652
|
def need_source_encoding_update?(xml_declaration_encoding)
|
475
653
|
return false if xml_declaration_encoding.nil?
|
476
654
|
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
|
477
655
|
true
|
478
656
|
end
|
479
657
|
|
480
|
-
def
|
481
|
-
|
482
|
-
unless
|
483
|
-
|
658
|
+
def parse_name(base_error_message)
|
659
|
+
md = @source.match(Private::NAME_PATTERN, true)
|
660
|
+
unless md
|
661
|
+
if @source.match?(/\S/um)
|
662
|
+
message = "#{base_error_message}: invalid name"
|
663
|
+
else
|
664
|
+
message = "#{base_error_message}: name is missing"
|
665
|
+
end
|
484
666
|
raise REXML::ParseException.new(message, @source)
|
485
667
|
end
|
486
|
-
[
|
668
|
+
md[0]
|
487
669
|
end
|
488
670
|
|
489
|
-
def
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
671
|
+
def parse_id(base_error_message,
|
672
|
+
accept_external_id:,
|
673
|
+
accept_public_id:)
|
674
|
+
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
|
675
|
+
pubid = system = nil
|
676
|
+
pubid_literal = md[1]
|
677
|
+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
|
678
|
+
system_literal = md[2]
|
679
|
+
system = system_literal[1..-2] if system_literal # Remove quote
|
680
|
+
["PUBLIC", pubid, system]
|
681
|
+
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
|
682
|
+
pubid = system = nil
|
683
|
+
pubid_literal = md[1]
|
684
|
+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
|
685
|
+
["PUBLIC", pubid, nil]
|
686
|
+
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
|
687
|
+
system = nil
|
688
|
+
system_literal = md[1]
|
689
|
+
system = system_literal[1..-2] if system_literal # Remove quote
|
690
|
+
["SYSTEM", nil, system]
|
691
|
+
else
|
692
|
+
details = parse_id_invalid_details(accept_external_id: accept_external_id,
|
693
|
+
accept_public_id: accept_public_id)
|
694
|
+
message = "#{base_error_message}: #{details}"
|
495
695
|
raise REXML::ParseException.new(message, @source)
|
496
696
|
end
|
697
|
+
end
|
698
|
+
|
699
|
+
def parse_id_invalid_details(accept_external_id:,
|
700
|
+
accept_public_id:)
|
701
|
+
public = /\A\s*PUBLIC/um
|
702
|
+
system = /\A\s*SYSTEM/um
|
703
|
+
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
|
704
|
+
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
|
705
|
+
return "public ID literal is missing"
|
706
|
+
end
|
707
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
|
708
|
+
return "invalid public ID literal"
|
709
|
+
end
|
710
|
+
if accept_public_id
|
711
|
+
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
|
712
|
+
return "system ID literal is missing"
|
713
|
+
end
|
714
|
+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
|
715
|
+
return "invalid system literal"
|
716
|
+
end
|
717
|
+
"garbage after system literal"
|
718
|
+
else
|
719
|
+
"garbage after public ID literal"
|
720
|
+
end
|
721
|
+
elsif accept_external_id and @source.match?(/#{system}/um)
|
722
|
+
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
|
723
|
+
return "system literal is missing"
|
724
|
+
end
|
725
|
+
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
|
726
|
+
return "invalid system literal"
|
727
|
+
end
|
728
|
+
"garbage after system literal"
|
729
|
+
else
|
730
|
+
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
|
731
|
+
return "invalid ID type"
|
732
|
+
end
|
733
|
+
"ID type is missing"
|
734
|
+
end
|
735
|
+
end
|
497
736
|
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
737
|
+
def process_instruction
|
738
|
+
name = parse_name("Malformed XML: Invalid processing instruction node")
|
739
|
+
if @source.match?(/\s+/um, true)
|
740
|
+
match_data = @source.match(/(.*?)\?>/um, true)
|
741
|
+
unless match_data
|
742
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
743
|
+
end
|
744
|
+
content = match_data[1]
|
745
|
+
else
|
746
|
+
content = nil
|
747
|
+
unless @source.match?("?>", true)
|
748
|
+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
|
749
|
+
end
|
750
|
+
end
|
751
|
+
if name == "xml"
|
752
|
+
if @document_status
|
753
|
+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
|
754
|
+
end
|
755
|
+
version = VERSION.match(content)
|
756
|
+
version = version[1] unless version.nil?
|
757
|
+
encoding = ENCODING.match(content)
|
758
|
+
encoding = encoding[1] unless encoding.nil?
|
759
|
+
if need_source_encoding_update?(encoding)
|
760
|
+
@source.encoding = encoding
|
761
|
+
end
|
762
|
+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
763
|
+
encoding = "UTF-16"
|
764
|
+
end
|
765
|
+
standalone = STANDALONE.match(content)
|
766
|
+
standalone = standalone[1] unless standalone.nil?
|
767
|
+
return [ :xmldecl, version, encoding, standalone ]
|
768
|
+
end
|
769
|
+
[:processing_instruction, name, content]
|
770
|
+
end
|
502
771
|
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
772
|
+
if StringScanner::Version < "3.1.1"
|
773
|
+
def scan_quote
|
774
|
+
@source.match(/(['"])/, true)&.[](1)
|
775
|
+
end
|
776
|
+
else
|
777
|
+
def scan_quote
|
778
|
+
case @source.peek_byte
|
779
|
+
when 34 # '"'.ord
|
780
|
+
@source.scan_byte
|
781
|
+
'"'
|
782
|
+
when 39 # "'".ord
|
783
|
+
@source.scan_byte
|
784
|
+
"'"
|
785
|
+
else
|
786
|
+
nil
|
507
787
|
end
|
788
|
+
end
|
789
|
+
end
|
508
790
|
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
791
|
+
def parse_attributes(prefixes)
|
792
|
+
attributes = {}
|
793
|
+
expanded_names = {}
|
794
|
+
closed = false
|
795
|
+
while true
|
796
|
+
if @source.match?(">", true)
|
797
|
+
return attributes, closed
|
798
|
+
elsif @source.match?("/>", true)
|
799
|
+
closed = true
|
800
|
+
return attributes, closed
|
801
|
+
elsif match = @source.match(QNAME, true)
|
802
|
+
name = match[1]
|
803
|
+
prefix = match[2]
|
804
|
+
local_part = match[3]
|
805
|
+
|
806
|
+
unless @source.match?(/\s*=\s*/um, true)
|
518
807
|
message = "Missing attribute equal: <#{name}>"
|
519
808
|
raise REXML::ParseException.new(message, @source)
|
520
809
|
end
|
521
|
-
quote =
|
522
|
-
unless quote
|
810
|
+
unless quote = scan_quote
|
523
811
|
message = "Missing attribute value start quote: <#{name}>"
|
524
812
|
raise REXML::ParseException.new(message, @source)
|
525
813
|
end
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
scanner << match_data[1]
|
532
|
-
scanner.pos = pos
|
533
|
-
closed = !match_data[2].nil?
|
534
|
-
next
|
535
|
-
end
|
536
|
-
message =
|
537
|
-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
|
814
|
+
start_position = @source.position
|
815
|
+
value = @source.read_until(quote)
|
816
|
+
unless value.chomp!(quote)
|
817
|
+
@source.position = start_position
|
818
|
+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
538
819
|
raise REXML::ParseException.new(message, @source)
|
539
820
|
end
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
msg = "The '
|
821
|
+
@source.match?(/\s*/um, true)
|
822
|
+
if prefix == "xmlns"
|
823
|
+
if local_part == "xml"
|
824
|
+
if value != Private::XML_PREFIXED_NAMESPACE
|
825
|
+
msg = "The 'xml' prefix must not be bound to any other namespace "+
|
826
|
+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
827
|
+
raise REXML::ParseException.new( msg, @source, self )
|
828
|
+
end
|
829
|
+
elsif local_part == "xmlns"
|
830
|
+
msg = "The 'xmlns' prefix must not be declared "+
|
550
831
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
551
|
-
raise REXML::ParseException.new( msg, @source, self
|
832
|
+
raise REXML::ParseException.new( msg, @source, self)
|
552
833
|
end
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
raise REXML::ParseException.new( msg, @source, self)
|
834
|
+
add_namespace(local_part, value)
|
835
|
+
elsif prefix
|
836
|
+
prefixes << prefix unless prefix == "xml"
|
557
837
|
end
|
558
|
-
curr_ns << local_part
|
559
|
-
elsif prefix
|
560
|
-
prefixes << prefix unless prefix == "xml"
|
561
|
-
end
|
562
838
|
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
839
|
+
if attributes[name]
|
840
|
+
msg = "Duplicate attribute #{name.inspect}"
|
841
|
+
raise REXML::ParseException.new(msg, @source, self)
|
842
|
+
end
|
843
|
+
|
844
|
+
unless prefix == "xmlns"
|
845
|
+
uri = @namespaces[prefix]
|
846
|
+
expanded_name = [uri, local_part]
|
847
|
+
existing_prefix = expanded_names[expanded_name]
|
848
|
+
if existing_prefix
|
849
|
+
message = "Namespace conflict in adding attribute " +
|
850
|
+
"\"#{local_part}\": " +
|
851
|
+
"Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
|
852
|
+
"prefix \"#{prefix}\" = \"#{uri}\""
|
853
|
+
raise REXML::ParseException.new(message, @source, self)
|
854
|
+
end
|
855
|
+
expanded_names[expanded_name] = prefix
|
856
|
+
end
|
567
857
|
|
568
|
-
|
858
|
+
attributes[name] = value
|
859
|
+
else
|
860
|
+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
861
|
+
raise REXML::ParseException.new(message, @source)
|
862
|
+
end
|
569
863
|
end
|
570
|
-
return attributes, closed
|
571
864
|
end
|
572
865
|
end
|
573
866
|
end
|