rexml 3.1.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rexml might be problematic. Click here for more details.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +10 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +60 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rexml/attlistdecl.rb +63 -0
- data/lib/rexml/attribute.rb +192 -0
- data/lib/rexml/cdata.rb +68 -0
- data/lib/rexml/child.rb +97 -0
- data/lib/rexml/comment.rb +80 -0
- data/lib/rexml/doctype.rb +270 -0
- data/lib/rexml/document.rb +291 -0
- data/lib/rexml/dtd/attlistdecl.rb +11 -0
- data/lib/rexml/dtd/dtd.rb +47 -0
- data/lib/rexml/dtd/elementdecl.rb +18 -0
- data/lib/rexml/dtd/entitydecl.rb +57 -0
- data/lib/rexml/dtd/notationdecl.rb +40 -0
- data/lib/rexml/element.rb +1267 -0
- data/lib/rexml/encoding.rb +51 -0
- data/lib/rexml/entity.rb +171 -0
- data/lib/rexml/formatters/default.rb +112 -0
- data/lib/rexml/formatters/pretty.rb +142 -0
- data/lib/rexml/formatters/transitive.rb +58 -0
- data/lib/rexml/functions.rb +447 -0
- data/lib/rexml/instruction.rb +71 -0
- data/lib/rexml/light/node.rb +196 -0
- data/lib/rexml/namespace.rb +48 -0
- data/lib/rexml/node.rb +76 -0
- data/lib/rexml/output.rb +30 -0
- data/lib/rexml/parent.rb +166 -0
- data/lib/rexml/parseexception.rb +52 -0
- data/lib/rexml/parsers/baseparser.rb +586 -0
- data/lib/rexml/parsers/lightparser.rb +59 -0
- data/lib/rexml/parsers/pullparser.rb +197 -0
- data/lib/rexml/parsers/sax2parser.rb +273 -0
- data/lib/rexml/parsers/streamparser.rb +61 -0
- data/lib/rexml/parsers/treeparser.rb +101 -0
- data/lib/rexml/parsers/ultralightparser.rb +57 -0
- data/lib/rexml/parsers/xpathparser.rb +675 -0
- data/lib/rexml/quickpath.rb +266 -0
- data/lib/rexml/rexml.rb +32 -0
- data/lib/rexml/sax2listener.rb +98 -0
- data/lib/rexml/security.rb +28 -0
- data/lib/rexml/source.rb +298 -0
- data/lib/rexml/streamlistener.rb +93 -0
- data/lib/rexml/syncenumerator.rb +33 -0
- data/lib/rexml/text.rb +424 -0
- data/lib/rexml/undefinednamespaceexception.rb +9 -0
- data/lib/rexml/validation/relaxng.rb +539 -0
- data/lib/rexml/validation/validation.rb +144 -0
- data/lib/rexml/validation/validationexception.rb +10 -0
- data/lib/rexml/xmldecl.rb +116 -0
- data/lib/rexml/xmltokens.rb +85 -0
- data/lib/rexml/xpath.rb +81 -0
- data/lib/rexml/xpath_parser.rb +934 -0
- data/rexml.gemspec +42 -0
- metadata +131 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: false
|
2
|
+
module REXML
  # Error raised for input that cannot be parsed. Besides the message it can
  # carry the source being read, the parser that raised it, and an underlying
  # exception that triggered the failure.
  class ParseException < RuntimeError
    attr_accessor :source, :parser, :continued_exception

    # message::   human readable description of the problem
    # source::    object being parsed; +to_s+ reads its +buffer+ and, when
    #             available, its +current_line+
    # parser::    the parser that detected the problem
    # exception:: an underlying exception to quote in +to_s+
    def initialize( message, source=nil, parser=nil, exception=nil )
      super(message)
      @source = source
      @parser = parser
      @continued_exception = exception
    end

    # Builds the full report: the wrapped exception (if any) with its
    # backtrace, then this exception's message, then line/position and the
    # head of the unconsumed buffer when a source is attached.
    def to_s
      # Quote the original exception, if there was one
      if @continued_exception
        err = @continued_exception.inspect
        err << "\n"
        # backtrace is nil for an exception object that was never raised
        err << (@continued_exception.backtrace || []).join("\n")
        err << "\n...\n"
      else
        err = ""
      end

      # Get the stack trace and error message
      err << super

      # Add contextual information
      if @source
        err << "\nLine: #{line}\n"
        err << "Position: #{position}\n"
        err << "Last 80 unconsumed characters:\n"
        # Despite the label this is the head of the unconsumed buffer
        # (indices 0..80), flattened onto a single line.
        err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
      end

      err
    end

    # Element 0 of the source's +current_line+, or nil when unavailable.
    def position
      info = current_line_info
      info[0] if info
    end

    # Element 2 of the source's +current_line+, or nil when unavailable.
    def line
      info = current_line_info
      info[2] if info
    end

    # The source's +current_line+ value, or nil when there is no source or
    # the source does not provide +current_line+.
    def context
      current_line_info
    end

    private

    # Shared guard for position/line/context.
    def current_line_info
      return nil unless @source
      return nil unless defined?(@source.current_line)
      @source.current_line
    end
  end
end
|
@@ -0,0 +1,586 @@
|
|
1
|
+
# frozen_string_literal: false
|
2
|
+
require_relative '../parseexception'
|
3
|
+
require_relative '../undefinednamespaceexception'
|
4
|
+
require_relative '../source'
|
5
|
+
require 'set'
|
6
|
+
require "strscan"
|
7
|
+
|
8
|
+
module REXML
|
9
|
+
module Parsers
|
10
|
+
# = Using the Pull Parser
|
11
|
+
# <em>This API is experimental, and subject to change.</em>
|
12
|
+
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
13
|
+
# while parser.has_next?
|
14
|
+
# res = parser.next
|
15
|
+
# puts res[1]['att'] if res.start_tag? and res[0] == 'b'
|
16
|
+
# end
|
17
|
+
# See the PullEvent class for information on the content of the results.
|
18
|
+
# The data is identical to the arguments passed for the various events to
|
19
|
+
# the StreamListener API.
|
20
|
+
#
|
21
|
+
# Notice that:
|
22
|
+
# parser = PullParser.new( "<a>BAD DOCUMENT" )
|
23
|
+
# while parser.has_next?
|
24
|
+
# res = parser.next
|
25
|
+
# raise res[1] if res.error?
|
26
|
+
# end
|
27
|
+
#
|
28
|
+
# Nat Price gave me some good ideas for the API.
|
29
|
+
class BaseParser
|
30
|
+
# --- Name building blocks (fragments interpolated into the patterns below)
LETTER        = '[:alpha:]'
DIGIT         = '[:digit:]'

COMBININGCHAR = '' # TODO
EXTENDER      = '' # TODO

NCNAME_STR = "[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*"
QNAME_STR  = "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
QNAME      = /(#{QNAME_STR})/

NAMECHAR     = '[\-\w\.:]'
NAME         = "([\\w:]#{NAMECHAR}*)"
NMTOKEN      = "(?:#{NAMECHAR})+"
NMTOKENS     = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE    = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

# --- Markup recognisers: *_START tests the head of the input, *_PATTERN
# --- captures the whole construct
DOCTYPE_START       = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END         = /\A\s*\]\s*>/um
DOCTYPE_PATTERN     = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN   = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START       = /\A<!--/u
COMMENT_PATTERN     = /<!--(.*?)-->/um
CDATA_START         = /\A<!\[CDATA\[/u
CDATA_END           = /\A\s*\]\s*>/um
CDATA_PATTERN       = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START       = /\A<\?xml\s/u
XMLDECL_PATTERN     = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START   = /\A<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH           = /^<((?>#{QNAME_STR}))/um
CLOSE_MATCH         = /^\s*<\/(#{QNAME_STR})\s*>/um

# --- Pseudo-attributes of the XML declaration
VERSION    = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING   = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# --- Declarations inside the DOCTYPE
ENTITY_START        = /\A\s*<!ENTITY/
IDENTITY            = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
ELEMENTDECL_START   = /\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY        = /\A\s*(%.*?;)\s*$/um
ENUMERATION         = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE        = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE      = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE             = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE            = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL         = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF              = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE           = /#{ATTDEF}/
ATTLISTDECL_START   = /\A\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START  = /\A\s*<!NOTATION/um
PUBLIC              = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM              = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

# --- Character data up to the next tag
TEXT_PATTERN = /\A([^<]*)/um

# Entity constants
PUBIDCHAR     = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL  = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID    = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL     = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE   = "%#{NAME};"
ENTITYVALUE   = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF         = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF     = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL        = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL        = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL    = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

# An ampersand that does not start a named entity reference
EREFERENCE = /&(?!#{NAME};)/
|
103
|
+
|
104
|
+
# The predefined entities handled by normalize/unnormalize. Each value is
#   [ pattern matching the reference, reference text,
#     literal character,              pattern matching the literal ]
# normalize replaces slot 3 with slot 1; unnormalize replaces slot 0 with
# slot 2. The first two slots must therefore hold the *escaped* form.
DEFAULT_ENTITIES = {
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}
|
110
|
+
|
111
|
+
# Builds a parser reading from +source+. The input is handed to #stream=,
# and the parser starts with an empty list of event listeners.
def initialize(source)
  self.stream = source
  @listeners = []
end
|
115
|
+
|
116
|
+
# Registers +listener+; #pull passes every event it returns to the
# listener's +receive+ method. Returns the listener list.
def add_listener(listener)
  @listeners.push(listener)
end
|
119
|
+
|
120
|
+
attr_reader :source
|
121
|
+
|
122
|
+
# Points the parser at a new input and discards all state left over from
# any previous parse. +source+ is wrapped by SourceFactory.create_from.
def stream=(source)
  @source = SourceFactory.create_from(source)
  # No pending self-closing tag, and not yet past the document prolog
  @closed = @document_status = nil
  # Fresh, independent containers for open tags, queued events, entities
  # and namespace scopes
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end
|
131
|
+
|
132
|
+
# Current position as reported by the source, or 0 when the source does
# not expose a +position+ method (FIXME carried over from the original).
def position
  @source.respond_to?(:position) ? @source.position : 0
end
|
140
|
+
|
141
|
+
# Returns true if there are no more events: the source is exhausted and
# no events are waiting on the internal stack.
def empty?
  @source.empty? && @stack.empty?
end
|
145
|
+
|
146
|
+
# Returns true if there are more events. Synonymous with !empty?
def has_next?
  !@source.empty? || !@stack.empty?
end
|
150
|
+
|
151
|
+
# Push an event back on the head of the stream, so that it is the next
# event returned. This method has (theoretically) infinite depth.
def unshift(token)
  @stack.insert(0, token)
end
|
156
|
+
|
157
|
+
# Look ahead at the event +depth+ places from the head of the stream
# without consuming it; depth 0 is the next event. A +depth+ of -1 parses
# to the end of the input and returns the final event. Everything that
# has to be parsed to reach +depth+ is kept in memory, so peeking deep
# (or with -1) effectively pre-parses the document.
#
# Raises a RuntimeError for any +depth+ below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled.push(pull()) until empty?
  else
    wanted = depth + 1
    pulled.push(pull()) while @stack.size + pulled.size < wanted
  end
  # Park whatever was pulled so later calls to pull see it again
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
|
176
|
+
|
177
|
+
# Returns the next event, after handing it to every registered listener
# through the listener's +receive+ method.
def pull
  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
|
185
|
+
|
186
|
+
# Parses and returns the next raw event as an array whose first element is
# the event type (:start_element, :end_element, :text, :comment, :cdata,
# :xmldecl, :processing_instruction, :start_doctype, :end_doctype,
# :elementdecl, :entitydecl, :attlistdecl, :notationdecl, :externalentity,
# :end_document).
#
# Raises REXML::ParseException for malformed input, including the cases
# where an expected construct cannot be matched at all (unclosed comment,
# malformed XML declaration, unclosed DOCTYPE subset, malformed ENTITY
# declaration); those used to escape as NoMethodError on nil.
# Raises REXML::UndefinedNamespaceException for an undeclared prefix.
def pull_event
  # An empty-element tag reports its end on the call after its start.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  # Pushed-back / queued events are served before reading more input.
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  # --- Before the DOCTYPE has been seen or ruled out ----------------------
  if @document_status == nil
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      md = @source.match( COMMENT_PATTERN, true )
      raise REXML::ParseException.new("Unclosed comment", @source) if md.nil?
      return [ :comment, md[1] ]
    when XMLDECL_START
      md = @source.match( XMLDECL_PATTERN, true )
      if md.nil?
        raise REXML::ParseException.new("Malformed XML declaration", @source)
      end
      results = md[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      if need_source_encoding_update?(encoding)
        @source.encoding = encoding
      end
      # No declared encoding but a UTF-16 source: report plain "UTF-16"
      if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
        encoding = "UTF-16"
      end
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return process_instruction
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(Set.new)
      close = md[2]
      id_md = IDENTITY.match( md[1] )
      raise REXML::ParseException.new("DOCTYPE is missing a name") if id_md.nil?
      name = id_md[1]
      pub_sys = id_md[2] && id_md[2].strip
      long_name = id_md[4] && id_md[4].strip
      uri = id_md[6] && id_md[6].strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: left in the buffer and reported as text below
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
      if @source.encoding == "UTF-8"
        @source.buffer.force_encoding(::Encoding::UTF_8)
      end
    end
  end

  # --- Inside the DOCTYPE internal subset ---------------------------------
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    if md.nil?
      raise REXML::ParseException.new("Malformed DOCTYPE: unclosed declaration", @source)
    end
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      entity_md = @source.match( ENTITYDECL, true )
      if entity_md.nil?
        raise REXML::ParseException.new("Malformed entity declaration", @source)
      end
      match = entity_md.to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        match.delete_at(5) if match.size > 5 # Chop out NDATA decl
        # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when DOCTYPE_END
      @document_status = :after_doctype
      @source.match( DOCTYPE_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Document content ---------------------------------------------------
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        @nsstack.shift
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        if md.nil? or last_tag != md[1]
          message = "Missing end tag for '#{last_tag}'"
          message << " (got '#{md[1]}')" if md
          raise REXML::ParseException.new(message, @source)
        end
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          raise REXML::ParseException.new("Unclosed comment", @source) if md.nil?

          case md[1]
          when /--/, /-\z/
            raise REXML::ParseException.new("Malformed comment", @source)
          end

          return [ :comment, md[1] ]
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        return process_instruction
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        attributes, closed = parse_attributes(prefixes, curr_ns)
        # Verify that all of the prefixes have been defined
        prefixes.each do |prefix|
          unless @nsstack.any? { |declared| declared.member?(prefix) }
            raise UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if closed
          # Remember the name so the next call can emit :end_element
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue => error
    # Anything unexpected is reported as a parse failure that quotes the
    # underlying error.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  return [ :dummy ]
end
private :pull_event
|
408
|
+
|
409
|
+
# Resolves the entity named +reference+ to its replacement text.
# +entities+ (optional, looked up with []) is consulted first, then
# DEFAULT_ENTITIES. The value found is passed through #unnormalize before
# being returned; nil is returned for an unknown name.
def entity( reference, entities )
  replacement = entities && entities[ reference ]
  unless replacement
    builtin = DEFAULT_ENTITIES[ reference ]
    # Slot 2 of a DEFAULT_ENTITIES row is the literal character
    replacement = builtin[2] if builtin
  end
  return nil unless replacement
  unnormalize( replacement, entities )
end
|
418
|
+
|
419
|
+
# Escapes all possible entities.
#
# input::         the text to escape (not modified; a copy is returned)
# entities::      optional name => value table; each occurrence of a value
#                 is replaced by the reference "&name;"
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted
#
# Ampersands that do not already start a named reference become "&amp;",
# and the characters covered by DEFAULT_ENTITIES are replaced by their
# references.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: a frozen input must still yield a mutable copy
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    # The filter holds entity names, so test the current key
    next if entity_filter and entity_filter.include?(key)
    copy.gsub!( value, "&#{key};" )
  end if entities
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each do |key, value|
    # slot 3 matches the raw character, slot 1 is its reference
    copy.gsub!( value[3], value[1] )
  end
  copy
end
|
434
|
+
|
435
|
+
# Unescapes all possible entities.
#
# string::   the text to unescape (not modified; a copy is returned)
# entities:: optional name => value table consulted through #entity
# filter::   optional collection of entity names to leave untouched
#
# Line endings (\r\n and bare \r) become \n, numeric character references
# (decimal and hexadecimal) become the character they denote, named
# references are replaced by their value, and finally "&amp;" becomes "&".
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: a frozen input must still yield a mutable copy
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    m = $1
    # "x41" -> "0x41" so that Integer() reads it as hexadecimal
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # Keep only the names of named references (numeric ones captured nil)
  matches.collect!{|x|x[0]}.compact!
  if matches.size > 0
    matches.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      entity_value = entity( entity_reference, entities )
      if entity_value
        # Literal pattern + block: neither the entity name nor its value
        # is interpreted as regexp / replacement syntax.
        rv.gsub!( "&#{entity_reference};" ) { entity_value }
      else
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
|
464
|
+
|
465
|
+
private
|
466
|
+
# Decides whether the encoding named in the XML declaration should be
# applied to the source. False when no encoding was declared or when the
# declaration says exactly "UTF-16" (any letter case); true otherwise.
def need_source_encoding_update?(xml_declaration_encoding)
  if xml_declaration_encoding.nil?
    false
  else
    (/\AUTF-16\z/i =~ xml_declaration_encoding).nil?
  end
end
|
471
|
+
|
472
|
+
# Consumes a processing instruction from the source and returns
# [:processing_instruction, target, content]. Raises
# REXML::ParseException when the input does not match INSTRUCTION_PATTERN.
def process_instruction
  instruction = @source.match(INSTRUCTION_PATTERN, true)
  return [:processing_instruction, instruction[1], instruction[2]] if instruction

  raise REXML::ParseException.new("Invalid processing instruction node", @source)
end
|
480
|
+
|
481
|
+
# Consumes the remainder of a start tag (everything up to and including
# the closing ">") from the source and parses its attributes.
#
# prefixes:: collection that receives every namespace prefix used by an
#            attribute name (except "xml")
# curr_ns::  collection that receives every prefix declared through an
#            xmlns:prefix attribute on this tag
#
# Returns [attributes, closed]: a Hash of qualified name => raw value and
# whether the tag ended with "/>". Raises REXML::ParseException for an
# unterminated tag, malformed or duplicate attributes and illegal
# bindings of the "xml" / "xmlns" prefixes.
def parse_attributes(prefixes, curr_ns)
  attributes = {}
  closed = false
  tag_rest = @source.match(/^(.*?)(\/)?>/um, true)
  raise REXML::ParseException.new("Start tag isn't ended", @source) if tag_rest.nil?

  raw_attributes = tag_rest[1]
  closed = !tag_rest[2].nil?
  return attributes, closed if raw_attributes.nil? || raw_attributes.empty?

  scanner = StringScanner.new(raw_attributes)
  until scanner.eos?
    # Skip separating whitespace; trailing whitespace ends the list
    break if scanner.scan(/\s+/) && scanner.eos?

    attribute_start = scanner.pos
    loop do
      break if scanner.scan(ATTRIBUTE_PATTERN)

      # The full pattern failed: walk the pieces one by one to find out
      # which part is wrong.
      unless scanner.scan(QNAME)
        raise REXML::ParseException.new("Invalid attribute name: <#{scanner.rest}>", @source)
      end
      partial_name = scanner[0]
      unless scanner.scan(/\s*=\s*/um)
        raise REXML::ParseException.new("Missing attribute equal: <#{partial_name}>", @source)
      end
      quote = scanner.scan(/['"]/)
      unless quote
        raise REXML::ParseException.new(
          "Missing attribute value start quote: <#{partial_name}>", @source)
      end
      next if scanner.scan(/.*#{Regexp.escape(quote)}/um)

      # No closing quote yet: the ">" that ended the tag text may have
      # been inside the value, so pull in input up to the next ">" and
      # retry this attribute from its start.
      continuation = @source.match(/^(.*?)(\/)?>/um, true)
      unless continuation
        raise REXML::ParseException.new(
          "Missing attribute value end quote: <#{partial_name}>: <#{quote}>", @source)
      end
      scanner << "/" if closed
      scanner << ">"
      scanner << continuation[1]
      scanner.pos = attribute_start
      closed = !continuation[2].nil?
    end

    # Capture groups of ATTRIBUTE_PATTERN (group 4 is the quote character)
    name       = scanner[1]
    prefix     = scanner[2]
    local_part = scanner[3]
    value      = scanner[5]

    if prefix == "xmlns"
      case local_part
      when "xml"
        if value != "http://www.w3.org/XML/1998/namespace"
          msg = "The 'xml' prefix must not be bound to any other namespace "+
            "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
          raise REXML::ParseException.new( msg, @source, self )
        end
      when "xmlns"
        msg = "The 'xmlns' prefix must not be declared "+
          "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new( msg, @source, self)
      end
      curr_ns << local_part
    elsif prefix
      prefixes << prefix unless prefix == "xml"
    end

    if attributes.key?(name)
      raise REXML::ParseException.new("Duplicate attribute #{name.inspect}", @source, self)
    end

    attributes[name] = value
  end
  return attributes, closed
end
|
564
|
+
end
|
565
|
+
end
|
566
|
+
end
|
567
|
+
|
568
|
+
=begin
|
569
|
+
case event[0]
|
570
|
+
when :start_element
|
571
|
+
when :text
|
572
|
+
when :end_element
|
573
|
+
when :processing_instruction
|
574
|
+
when :cdata
|
575
|
+
when :comment
|
576
|
+
when :xmldecl
|
577
|
+
when :start_doctype
|
578
|
+
when :end_doctype
|
579
|
+
when :externalentity
|
580
|
+
when :elementdecl
|
581
|
+
when :entity
|
582
|
+
when :attlistdecl
|
583
|
+
when :notationdecl
|
584
|
+
when :end_doctype
|
585
|
+
end
|
586
|
+
=end
|