xmlscan 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +1276 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +31 -0
- data/README.rdoc +365 -0
- data/Rakefile +65 -0
- data/THANKS +11 -0
- data/VERSION +1 -0
- data/install.rb +41 -0
- data/lib/xmlscan/htmlscan.rb +290 -0
- data/lib/xmlscan/namespace.rb +353 -0
- data/lib/xmlscan/parser.rb +300 -0
- data/lib/xmlscan/scanner.rb +1123 -0
- data/lib/xmlscan/version.rb +23 -0
- data/lib/xmlscan/visitor.rb +162 -0
- data/lib/xmlscan/xmlchar.rb +248 -0
- data/test.rb +7 -0
- metadata +113 -0
@@ -0,0 +1,300 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/parser.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: parser.rb,v 1.10 2003/01/22 13:06:18 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'xmlscan/scanner'
|
11
|
+
|
12
|
+
|
13
|
+
module XMLScan
|
14
|
+
|
15
|
+
class XMLParser < XMLScanner
|
16
|
+
|
17
|
+
class AttributeChecker < Hash
  # AttributeChecker inherits Hash only for speed.

  # Records +name+ as seen. Returns true when +name+ had not been
  # recorded yet, and false when it was already present.
  def check_unique(name)
    !key?(name) && store(name, true)
  end

end
|
25
|
+
|
26
|
+
|
27
|
+
#PredefinedEntity = {
|
28
|
+
# 'lt' => '<',
|
29
|
+
# 'gt' => '>',
|
30
|
+
# 'amp' => '&',
|
31
|
+
# 'quot' => '"',
|
32
|
+
# 'apos' => "'",
|
33
|
+
#}
|
34
|
+
|
35
|
+
|
36
|
+
# Resets the per-document state (open-element stack, attribute checker
# and standalone flag), then hands the unchanged arguments to the
# inherited +parse+.
def parse(*)
  @elem = []
  @attr = AttributeChecker.new
  @standalone = nil
  super
end
|
42
|
+
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Issues a warning for any version other than '1.0', then forwards the
# version string to the visitor.
def on_xmldecl_version(str)
  warning "unsupported XML version `#{str}'" unless str == '1.0'
  @visitor.on_xmldecl_version str
end
|
52
|
+
|
53
|
+
|
54
|
+
# Records the standalone declaration in @standalone ('yes' => true,
# 'no' => false), reports a parse error for any other value, and always
# forwards the raw string to the visitor.
def on_xmldecl_standalone(str)
  if str == 'yes'
    @standalone = true
  elsif str == 'no'
    @standalone = false
  else
    parse_error "standalone declaration must be either `yes' or `no'"
  end
  @visitor.on_xmldecl_standalone str
end
|
64
|
+
|
65
|
+
|
66
|
+
# Reports a parse error when a public ID is given without a system ID,
# then forwards the doctype event to the visitor.
def on_doctype(name, pubid, sysid)
  if pubid && !sysid
    parse_error "public external ID must have both public ID and system ID"
  end
  @visitor.on_doctype name, pubid, sysid
end
|
72
|
+
|
73
|
+
|
74
|
+
# Whitespace in the prolog is deliberately dropped: nothing is forwarded
# to the visitor.
def on_prolog_space(s)
  # just ignore it.
end
|
77
|
+
|
78
|
+
|
79
|
+
# Reports a parse error when the PI target is `xml' in any letter case,
# then forwards the processing instruction to the visitor.
def on_pi(target, pi)
  if target.downcase == 'xml'
    parse_error "reserved PI target `#{target}'"
  end
  @visitor.on_pi target, pi
end
|
85
|
+
|
86
|
+
|
87
|
+
#def on_entityref(ref)
|
88
|
+
# rep = PredefinedEntity[ref]
|
89
|
+
# if rep then
|
90
|
+
# @visitor.on_chardata rep
|
91
|
+
# else
|
92
|
+
# @visitor.on_entityref ref
|
93
|
+
# end
|
94
|
+
#end
|
95
|
+
|
96
|
+
|
97
|
+
#def on_attr_entityref(ref)
|
98
|
+
# rep = PredefinedEntity[ref]
|
99
|
+
# if rep then
|
100
|
+
# @visitor.on_attr_value rep
|
101
|
+
# else
|
102
|
+
# @visitor.on_attr_entityref ref
|
103
|
+
# end
|
104
|
+
#end
|
105
|
+
|
106
|
+
|
107
|
+
#def on_charref_hex(code)
|
108
|
+
# on_charref code
|
109
|
+
#end
|
110
|
+
|
111
|
+
|
112
|
+
#def on_attr_charref_hex(code)
|
113
|
+
# on_attr_charref code
|
114
|
+
#end
|
115
|
+
|
116
|
+
|
117
|
+
# Pushes the element name on the open-element stack, forwards the start
# tag to the visitor, and empties the attribute checker for the new tag.
def on_stag(name)
  @elem.push name
  @visitor.on_stag name
  @attr.clear
end
|
122
|
+
|
123
|
+
# Reports a well-formedness error when the attribute name was already
# seen on the current start tag, then forwards the attribute to the
# visitor either way.
def on_attribute(name)
  unless @attr.check_unique(name)
    wellformed_error "doubled attribute `#{name}'"
  end
  @visitor.on_attribute name
end
|
129
|
+
|
130
|
+
# Normalizes tab, CR and LF in an attribute value to spaces and forwards
# the result to the visitor. The normalization is done on a copy so the
# caller's string is left untouched and frozen input is accepted.
def on_attr_value(str)
  @visitor.on_attr_value str.tr("\t\r\n", ' ')   # normalize
end
|
134
|
+
|
135
|
+
# Forwards the end of an empty-element tag to the visitor and removes
# the element from the open-element stack.
def on_stag_end_empty(name)
  # @visitor.on_stag_end name
  # @elem.pop
  # @visitor.on_etag name
  @visitor.on_stag_end_empty name
  @elem.pop
end
|
142
|
+
|
143
|
+
# Pops the innermost open element and compares it with the end tag.
# - match: the end tag is forwarded to the visitor;
# - mismatch: a well-formedness error is reported and the visitor is
#   told that the popped element ended;
# - nothing open: a parse error is reported and nothing is forwarded.
def on_etag(name)
  last = @elem.pop
  if last == name
    @visitor.on_etag name
  elsif last
    wellformed_error "element type `#{name}' is not matched"
    @visitor.on_etag last
  else
    parse_error "end tag `#{name}' appears alone"
  end
end
|
154
|
+
|
155
|
+
|
156
|
+
public
|
157
|
+
|
158
|
+
|
159
|
+
# Scans a whole document body, enforcing a single root element.
#
# The outer loop runs three phases and repeats them until the epilogue
# reaches the end of input without meeting further content:
#   1. skip everything up to and including the first start tag,
#      reporting character data / CDATA found before it;
#   2. scan contents while the open-element stack is non-empty, then
#      close any element still open at EOF;
#   3. consume trailing comments, PIs and whitespace; anything else
#      sends control back to phase 1.
#
# s:: the first fragment to examine; later fragments come from @src.get.
def scan_content(s)
  elem = @elem  # for speed
  src = @src    # for speed
  found_root_element = false

  begin

    # -- first start tag --
    elem.clear
    found_stag = false

    while s and not found_stag
      if (c = s[0]) == ?<
        if (c = s[1]) == ?/
          # should be a parse error
          scan_etag s
        elsif c == ?!
          if s[2] == ?- and s[3] == ?-
            scan_comment s
          elsif /\A<!\[CDATA\[/ =~ s
            parse_error "CDATA section is found outside of root element"
            scan_cdata $'
          else
            scan_bang_tag s
          end
        elsif c == ??
          scan_pi s
        else
          found_root_element = true
          found_stag = true
          scan_stag s
        end
      else
        parse_error "content of element is found outside of root element"
        scan_chardata s
      end
      s = src.get
    end

    if not found_root_element and not found_stag
      parse_error "no root element was found"
    end

    # -- contents --
    while s and not elem.empty?
      if (c = s[0]) == ?<
        if (c = s[1]) == ?/
          scan_etag s
        elsif c == ?!
          if s[2] == ?- and s[3] == ?-
            scan_comment s
          elsif /\A<!\[CDATA\[/ =~ s
            scan_cdata $'
          else
            scan_bang_tag s
          end
        elsif c == ??
          scan_pi s
        else
          scan_stag s
        end
      else
        scan_chardata s
      end
      s = src.get
    end

    # Input ended with elements still open: close them innermost first.
    unless elem.empty?
      while name = elem.pop
        parse_error "unclosed element `#{name}' meets EOF"
        @visitor.on_etag name
      end
    end

    # -- epilogue --
    finish = true

    while s
      if (c = s[0]) == ?<
        if (c = s[1]) == ?/
          finish = false    # content out of root element
          break
        elsif c == ?!
          if s[2] == ?- and s[3] == ?-
            scan_comment s
          else
            finish = false  # content out of root element
            break
          end
        elsif c == ??
          scan_pi s
        else
          parse_error "another root element is found"  # stag
          finish = false
          break
        end
      else
        if s.strip.empty?
          on_prolog_space s
        else
          finish = false    # content out of root element
          break
        end
      end
      s = src.get
    end

  end until finish

end
|
269
|
+
end
|
270
|
+
|
271
|
+
|
272
|
+
end
|
273
|
+
|
274
|
+
|
275
|
+
|
276
|
+
|
277
|
+
|
278
|
+
|
279
|
+
# Self-test driver: when this file is run directly, parse ARGF with a
# visitor that prints diagnostics to STDERR (only under $VERBOSE) and
# report the user CPU time spent parsing.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def warning(msg)
      STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLParser.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Time.times is gone in Ruby 1.9+; Process.times provides the same utime.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
|
@@ -0,0 +1,1123 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/scanner.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: scanner.rb,v 1.75.2.3 2003/05/01 15:43:23 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
#
|
11
|
+
# CONSIDERATIONS FOR CHARACTER ENCODINGS:
|
12
|
+
#
|
13
|
+
# There are the following common characteristics in character encodings
|
14
|
+
# which are supported by Ruby's $KCODE feature (ISO-8859-*, Shift_JIS,
|
15
|
+
# EUC, and UTF-8):
|
16
|
+
#
|
17
|
+
# - Stateless.
|
18
|
+
# - ASCII characters are encoded in the same manner as US-ASCII.
|
19
|
+
# - The octet sequences corresponding to non-ASCII characters begin
|
20
|
+
# with an octet greater than 0x80.
|
21
|
+
# - The following characters can be identified by just one octet.
|
22
|
+
# That is, every octet corresponding to the following characters in
|
23
|
+
# US-ASCII never appears as a part of an octet sequence representing a
|
24
|
+
# non-ASCII character.
|
25
|
+
#
|
26
|
+
# Whitespaces("\t", "\n", "\r", and " ") and
|
27
|
+
# ! \ " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
28
|
+
#
|
29
|
+
# Be careful that `[' and `]' are NOT included in the list!
|
30
|
+
#
|
31
|
+
# If we build a regular expression carefully in accordance with these
|
32
|
+
# characteristics, we can get the same match regardless of the value
|
33
|
+
# of $KCODE. Moreover, if it can be premised on them, we can detect
|
34
|
+
# several delimiters without regular expressions. XMLScanner uses this
|
35
|
+
# fact in order to share many regular expressions in all $KCODE modes,
|
36
|
+
# and in order to optimize parsing speed.
|
37
|
+
#
|
38
|
+
|
39
|
+
require 'xmlscan/visitor'
|
40
|
+
|
41
|
+
|
42
|
+
module XMLScan
|
43
|
+
|
44
|
+
# Adapter giving an arbitrary source the +gets+ / +lineno+ / +path+
# interface the scanner reads from.
class Input

  # Wraps +src+. Depending on what +src+ responds to, per-instance
  # (singleton) methods are installed that override the defaults below:
  # - no +gets+ but +to_ary+: elements are returned one per +gets+ call
  #   and +lineno+ counts the elements handed out;
  # - neither: +src+ itself is returned by the first +gets+, then nil;
  # - +lineno+ / +path+ present on +src+: those calls are delegated.
  def initialize(src)
    @src = src
    unless src.respond_to? :gets
      if src.respond_to? :to_ary
        @v = src.to_ary
        @n = -1
        def self.gets ; @v.at(@n += 1) ; end
        def self.lineno ; @n + 1 ; end
      else
        @v = @src
        def self.gets ; s = @v ; @v = nil ; s ; end
      end
    end
    if src.respond_to? :lineno
      def self.lineno ; @src.lineno ; end
    end
    if src.respond_to? :path
      def self.path ; @src.path ; end
    end
  end

  attr_reader :src

  # Defaults, used when +initialize+ installed no override.
  def gets ; @src.gets ; end
  def lineno ; 0 ; end
  def path ; '-' ; end

  # Returns +src+ unchanged when it already responds to +gets+, +lineno+
  # and +path+; otherwise returns a new Input around it.
  def self.wrap(src)
    unless src.respond_to?(:gets) && src.respond_to?(:lineno) &&
        src.respond_to?(:path)
      src = new(src)
    end
    src
  end

  # Inverse of +wrap+: returns the wrapped source of an Input, or +obj+
  # itself when it is not an Input.
  def self.unwrap(obj)
    if self === obj
      obj.src
    else
      obj
    end
  end

end
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
# An Array whose inherited instance methods (everything Array exposes
# beyond what Kernel provides) are made private, so they can only be
# called without an explicit receiver from inside a subclass.
class PrivateArray < Array
  m = superclass.instance_methods - Kernel.instance_methods
  private(*m)
end
|
97
|
+
|
98
|
+
|
99
|
+
# Buffered fragment reader. Each chunk read from the input is split
# around `<' and `>' and the pieces are handed out one at a time by +get+.
# @last holds the final piece of the most recent chunk, which may still
# be continued by the next chunk.
class Source < PrivateArray
  # Source inherits Array only for speed.

  def initialize(src)
    super()
    @src = Input.wrap(src)
    @eof = false
    @last = nil
  end

  # The object originally passed in, with any Input wrapper removed.
  def source
    Input.unwrap @src
  end


  # True once the input is exhausted and no buffered fragment remains.
  def eof?
    @eof and empty?
  end

  # Discards everything buffered and marks the source as finished.
  def abort
    @eof = true
    @last = nil
    clear
    self
  end


  # Returns the next fragment, or nil when none remain. When the buffer
  # is empty, chunks are read until at least one fragment is buffered
  # (or the input ends); the last fragment of each chunk is kept in @last
  # so that it can be joined with the start of the following chunk.
  def get
    pop or
      unless @eof then
        last = @last
        begin
          src = @src.gets
          unless src then
            @eof = true
            unshift last
            last = nil
            break
          end
          a = src.split(/(?=<|>[<>])|>/, -1)
          if last then
            unless /\A[<>]/ =~ a.first then
              a[0] = last << (a.first || '')
            else
              push last
            end
          end
          concat a
          last = pop
        end while empty?
        @last = last
        # The buffer is kept reversed so that +pop+ yields fragments in
        # document order.
        reverse!
        pop
      end
  end


  # Primes the buffer with the first fragment, re-attaching a leading
  # `>' when the first fragment came out empty.
  def prepare
    s = get
    s = get and s = '>' << s if s and s.empty?    # preserve first `>'
    s and push s
  end


  # True when the upcoming fragment exists and does not start with `<'.
  def tag_end?
    s = last || @last and s[0] != ?<
  end

  # True when the upcoming fragment exists and starts with `<'.
  def tag_start?
    s = last || @last and s[0] == ?<
  end

  def close_tag    # tag_end?, and remove a `>'.
    unless s = last || @last and s[0] != ?< then
      false
    else
      if s == '>' or s.empty? then
        s1 = get
        unless s = last || @last and s[0] == ?< then    # for speed up
          out = [ s1 ]
          out.push get while s = last || @last and s == '>' || s.empty?
          out.pop unless s and s[0] != ?<    # De Morgan
          concat out
        end
      end
      true
    end
  end


  def get_text    # get until tag_start?
    s = last || @last and s[0] != ?< and get
  end

  def get_tag     # get until tag_end?
    s = last || @last and s[0] == ?< and get
  end

  # Like +get+, but puts a `>' back in front of a fragment that starts
  # with neither `<' nor `>'.
  def get_plain
    s = get
    s = '>' << s unless not s or (c = s[0]) == ?< or c == ?>    # De Morgan
    s
  end

  def lineno
    @src.lineno
  end

  def path
    @src.path
  end


  # The following methods are for debug.

  def inspect
    a = []
    reverse_each { |i|
      a.push ">" unless /\A[<>]/ =~ i
      a.push i.inspect
    }
    last = []
    if @last then
      last.push ">" unless /\A[<>]/ =~ @last
      last.push @last.inspect
    end
    a.push '#eof' if @eof
    "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
  end

  def each
    prepare
    while s = get
      yield s
    end
    self
  end

  def test
    last or @last or (s = get and push s and s)
  end

end
|
242
|
+
|
243
|
+
|
244
|
+
|
245
|
+
class XMLScanner
|
246
|
+
|
247
|
+
class << self

  # Names of the options this class implements: every private instance
  # method called `apply_option_NAME' contributes NAME.
  def provided_options
    options = []
    private_instance_methods.each { |i|
      options.push $' if /\Aapply_option_/ =~ i
    }
    options
  end

  # Invokes `apply_option_<option>' on +instance+.
  def apply_option(instance, option)
    instance.__send__ "apply_option_#{option}"
  end

  # Applies each requested option through the first ancestor that
  # provides it. Raises ArgumentError when an option is provided by no
  # ancestor. Returns +instance+.
  def apply_options(instance, options)
    h = {}
    options.each { |i| h[i.to_s] = true }
    options = h
    ancestors.each { |klass|
      if klass.respond_to? :provided_options then
        klass.provided_options.each { |i|
          if options.include? i then
            options.delete i
            klass.apply_option instance, i
          end
        }
      end
    }
    unless options.empty? then
      raise ArgumentError, "undefined option `#{options.keys[0]}'"
    end
    instance
  end
  private :apply_options

  # Builds an instance for +visitor+ and applies the given options to it.
  def new(visitor, *options)
    instance = super(visitor)
    apply_options instance, options
  end

end
|
288
|
+
|
289
|
+
|
290
|
+
|
291
|
+
# Stores the visitor that will receive events; decoration, source and
# option key start out unset.
def initialize(visitor)
  @visitor = visitor
  @decoration = nil
  @src = nil
  @optkey = nil
end
|
297
|
+
|
298
|
+
attr_accessor :optkey
|
299
|
+
|
300
|
+
# Encoding registered in OptRegexp::RE_ENCODINGS for the current option
# key (nil when the key has no entry).
def opt_encoding
  OptRegexp::RE_ENCODINGS[optkey]
end
|
301
|
+
|
302
|
+
|
303
|
+
# Wraps the visitor in a Decoration on first use (the wrapper then takes
# the visitor's place) and expands it with +decoration+.
def decorate(decoration)
  unless @decoration
    @visitor = @decoration = Decoration.new(@visitor)
  end
  @decoration.expand decoration
end
|
309
|
+
private :decorate
|
310
|
+
|
311
|
+
|
312
|
+
# Line number reported by the current source; nil when no source is set.
def lineno
  @src && @src.lineno
end
|
315
|
+
|
316
|
+
# Path reported by the current source; nil when no source is set.
def path
  @src && @src.path
end
|
319
|
+
|
320
|
+
# The +source+ of the current @src object.
def source
  @src.source
end
|
323
|
+
|
324
|
+
|
325
|
+
private
|
326
|
+
|
327
|
+
# Reports a parse error to the visitor.
def parse_error(msg)
  @visitor.parse_error(msg)
end
|
330
|
+
|
331
|
+
# Reports a well-formedness error to the visitor.
def wellformed_error(msg)
  @visitor.wellformed_error(msg)
end
|
334
|
+
|
335
|
+
# Reports a validity error to the visitor.
def valid_error(msg)
  @visitor.valid_error(msg)
end
|
338
|
+
|
339
|
+
# Passes a warning message to the visitor.
def warning(msg)
  @visitor.warning(msg)
end
|
342
|
+
|
343
|
+
|
344
|
+
# Tells the visitor that an XML declaration begins.
def on_xmldecl
  @visitor.on_xmldecl
end
|
347
|
+
|
348
|
+
# Dispatches one `key="value"' pair of the XML declaration. When the
# visitor responds to `on_xmldecl_<key>', this object's handler of that
# name is called with the value; otherwise on_xmldecl_other receives
# both the key and the value.
def on_xmldecl_key(key, str)
  meth = "on_xmldecl_#{key}"
  if @visitor.respond_to? meth
    self.send meth, str
  else
    self.send :on_xmldecl_other, key, str
  end
end
|
356
|
+
|
357
|
+
# Forwards the version of the XML declaration to the visitor.
def on_xmldecl_version(str)
  @visitor.on_xmldecl_version(str)
end
|
360
|
+
|
361
|
+
# Forwards the encoding of the XML declaration to the visitor.
def on_xmldecl_encoding(str)
  @visitor.on_xmldecl_encoding(str)
end
|
364
|
+
|
365
|
+
# Forwards the standalone value of the XML declaration to the visitor.
def on_xmldecl_standalone(str)
  @visitor.on_xmldecl_standalone(str)
end
|
368
|
+
|
369
|
+
# Forwards a name/value pair of the XML declaration that has no
# dedicated handler to the visitor.
def on_xmldecl_other(name, value)
  @visitor.on_xmldecl_other(name, value)
end
|
372
|
+
|
373
|
+
# Tells the visitor that the XML declaration has ended.
def on_xmldecl_end
  @visitor.on_xmldecl_end
end
|
376
|
+
|
377
|
+
# Forwards a document type declaration (root name, public ID, system ID)
# to the visitor.
def on_doctype(root, pubid, sysid)
  @visitor.on_doctype(root, pubid, sysid)
end
|
380
|
+
|
381
|
+
# Forwards prolog whitespace to the visitor.
def on_prolog_space(str)
  @visitor.on_prolog_space(str)
end
|
384
|
+
|
385
|
+
# Forwards a comment's text to the visitor.
def on_comment(str)
  @visitor.on_comment(str)
end
|
388
|
+
|
389
|
+
# Forwards a processing instruction (target and content) to the visitor.
def on_pi(target, pi)
  @visitor.on_pi(target, pi)
end
|
392
|
+
|
393
|
+
# Forwards character data to the visitor.
def on_chardata(str)
  @visitor.on_chardata(str)
end
|
396
|
+
|
397
|
+
# Forwards the content of a CDATA section to the visitor.
def on_cdata(str)
  @visitor.on_cdata(str)
end
|
400
|
+
|
401
|
+
# Forwards an end tag to the visitor.
def on_etag(name)
  @visitor.on_etag(name)
end
|
404
|
+
|
405
|
+
# Forwards an entity reference found in content to the visitor.
def on_entityref(ref)
  @visitor.on_entityref(ref)
end
|
408
|
+
|
409
|
+
# Forwards a decimal character reference found in content to the visitor.
def on_charref(code)
  @visitor.on_charref(code)
end
|
412
|
+
|
413
|
+
# Forwards a hexadecimal character reference found in content to the
# visitor.
def on_charref_hex(code)
  @visitor.on_charref_hex(code)
end
|
416
|
+
|
417
|
+
# Tells the visitor that the document starts.
def on_start_document
  @visitor.on_start_document
end
|
420
|
+
|
421
|
+
# Tells the visitor that the document ends.
def on_end_document
  @visitor.on_end_document
end
|
424
|
+
|
425
|
+
|
426
|
+
# <hoge fuga="foo&bar;&&foo" />HOGE
|
427
|
+
# ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
|
428
|
+
# 1 2 3 4 5 6 7 8 9 A
|
429
|
+
#
|
430
|
+
# The following method will be called with the following arguments
|
431
|
+
# when the parser reaches the above point;
|
432
|
+
#
|
433
|
+
# 1: on_stag ('hoge')
|
434
|
+
# 2: on_attribute ('fuga')
|
435
|
+
# 3: on_attr_value ('foo')
|
436
|
+
# 4: on_attr_entityref ('bar')
|
437
|
+
# 5: on_attr_charref (38)
|
438
|
+
# 6: on_attr_charref_hex (38)
|
439
|
+
# 7: on_attr_value ('foo')
|
440
|
+
# 8: on_attribute_end ('fuga')
|
441
|
+
# 9: on_stag_end_empty ('hoge')
|
442
|
+
# or
|
443
|
+
# on_stag_end ('hoge')
|
444
|
+
#
|
445
|
+
# A: on_chardata ('HOGE')
|
446
|
+
|
447
|
+
# Forwards the name of a start tag to the visitor.
def on_stag(name)
  @visitor.on_stag(name)
end
|
450
|
+
|
451
|
+
# Forwards the name of an attribute to the visitor.
def on_attribute(name)
  @visitor.on_attribute(name)
end
|
454
|
+
|
455
|
+
# Forwards a piece of literal attribute value to the visitor.
def on_attr_value(str)
  @visitor.on_attr_value(str)
end
|
458
|
+
|
459
|
+
# Forwards an entity reference found in an attribute value to the visitor.
def on_attr_entityref(ref)
  @visitor.on_attr_entityref(ref)
end
|
462
|
+
|
463
|
+
# Forwards a decimal character reference found in an attribute value to
# the visitor.
def on_attr_charref(code)
  @visitor.on_attr_charref(code)
end
|
466
|
+
|
467
|
+
# Forwards a hexadecimal character reference found in an attribute value
# to the visitor.
def on_attr_charref_hex(code)
  @visitor.on_attr_charref_hex(code)
end
|
470
|
+
|
471
|
+
# Tells the visitor that the named attribute is complete.
def on_attribute_end(name)
  @visitor.on_attribute_end(name)
end
|
474
|
+
|
475
|
+
# Tells the visitor that an empty-element tag has ended.
def on_stag_end_empty(name)
  @visitor.on_stag_end_empty(name)
end
|
478
|
+
|
479
|
+
# Tells the visitor that a start tag has ended.
def on_stag_end(name)
  @visitor.on_stag_end(name)
end
|
482
|
+
|
483
|
+
|
484
|
+
|
485
|
+
private
|
486
|
+
|
487
|
+
# Builds per-encoding variants of a regular expression, keyed by the
# legacy regexp option letters :n, :e, :s and :u.
module OptRegexp
  # A non-ASCII sample; written as an escape so the literal cannot be
  # damaged by a transcoding mistake.
  UTFSTR = "\u00E9"
  S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
  E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')

  # Encoding Ruby assigns to a regexp literal carrying each flag.
  RE_ENCODINGS = {
    :n=>/e/n.encoding,
    :e=>/#{E_OPT_EXAMPLE}/e.encoding,
    :s=>/#{S_OPT_EXAMPLE}/s.encoding,
    :u=>/#{UTFSTR}/u.encoding
  }

  # Option bits Ruby assigns to a regexp literal carrying each flag.
  RE_ENCODING_OPTIONS = {
    :n=>/e/n.options,
    :e=>/#{E_OPT_EXAMPLE}/e.options,
    :s=>/#{S_OPT_EXAMPLE}/s.options,
    :u=>/#{UTFSTR}/u.options
  }

  private

  # Returns a Hash mapping :n/:e/:s/:u to +re+ (a pattern source string)
  # compiled for that encoding; any other key yields a plain
  # Regexp.new(re) through the hash default.
  def opt_regexp(re)
    h = {}
    RE_ENCODING_OPTIONS.each { |k,opt|
      h[k] = Regexp.new(re.encode(RE_ENCODINGS[k]), opt)
    }
    h.default = Regexp.new(re)
    h
  end
end
|
516
|
+
extend OptRegexp
|
517
|
+
|
518
|
+
|
519
|
+
InvalidEntityRef = opt_regexp('(?=[^#\d\w]|\z)')
|
520
|
+
|
521
|
+
# Scans character data, emitting on_chardata for literal text and
# on_entityref / on_charref / on_charref_hex for references. After each
# fragment it keeps pulling further text fragments from @src.get_text,
# putting the `>' back in front of them, until none is left.
def scan_chardata(s)
  while true
    unless /&/ =~ s then
      on_chardata s
    else
      s = $`
      on_chardata s unless s.empty?
      ref = nil
      # Each piece after a `&' should look like `name;rest'.
      $'.split('&', -1).each { |s|
        unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
          if InvalidEntityRef[@optkey] =~ s and not (ref = $`).strip.empty?
          then
            parse_error "reference to `#{ref}' doesn't end with `;'"
          else
            parse_error "`&' is not used for entity/character references"
            on_chardata('&' << s)
            next
          end
        end
        ref = $`
        s = $'
        if /\A[^#]/ =~ ref then
          on_entityref ref
        elsif /\A#(\d+)\z/ =~ ref then
          on_charref $1.to_i
        elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
          on_charref_hex $1.hex
        else
          parse_error "invalid character reference `#{ref}'"
        end
        on_chardata s unless s.empty?
      }
    end
    s = @src.get_text
    break unless s
    s = '>' << s unless s == '>'
  end
end
|
559
|
+
|
560
|
+
|
561
|
+
# Scans one piece of an attribute value, emitting on_attr_value for
# literal text and on_attr_entityref / on_attr_charref /
# on_attr_charref_hex for references.
def scan_attvalue(s)    # mostly copied and pasted from scan_chardata
  unless /&/ =~ s then
    on_attr_value s
  else
    s = $`
    on_attr_value s unless s.empty?
    ref = nil
    # Each piece after a `&' should look like `name;rest'.
    $'.split('&', -1).each { |s|
      unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
        if InvalidEntityRef[@optkey] =~ s and not (ref = $`).strip.empty?
        then
          parse_error "reference to `#{ref}' doesn't end with `;'"
        else
          parse_error "`&' is not used for entity/character references"
          on_attr_value('&' << s)
          next
        end
      end
      ref = $`
      s = $'
      if /\A[^#]/ =~ ref then
        on_attr_entityref ref
      elsif /\A#(\d+)\z/ =~ ref then
        on_attr_charref $1.to_i
      elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
        on_attr_charref_hex $1.hex
      else
        parse_error "invalid character reference `#{ref}'"
      end
      on_attr_value s unless s.empty?
    }
  end
end
|
594
|
+
|
595
|
+
|
596
|
+
# Scans a comment starting at `<!--' and emits on_comment with its text.
# Further fragments are pulled from @src.get_plain until `--' followed by
# a tag close is found. `--' inside the comment and a `--->' ending are
# reported as parse errors; EOF inside the comment is reported and the
# text collected so far is emitted.
def scan_comment(s)
  s[0,4] = ''  # remove `<!--'
  comm = ''
  until /--/ =~ s
    comm << s
    s = @src.get_plain
    unless s then
      parse_error "unterminated comment meets EOF"
      return on_comment(comm)
    end
  end
  comm << $`
  until (s = $').empty? and @src.close_tag
    if s == '-' and @src.close_tag then      # --->
      parse_error "comment ending in `--->' is not allowed"
      comm << s
      break
    end
    parse_error "comment includes `--'"
    comm << '--'
    until /--/ =~ s    # copy & paste for performance
      comm << s
      s = @src.get_plain
      unless s then
        parse_error "unterminated comment meets EOF"
        return on_comment(comm)
      end
    end
    comm << $`
  end
  on_comment comm
end
|
628
|
+
|
629
|
+
|
630
|
+
# Scans a processing instruction starting at `<?' and emits on_pi with
# the target and the content (without the closing `?'). A fragment that
# does not start with a valid target is reported and emitted as
# character data. EOF before `?>' is reported and the content collected
# so far is emitted.
def scan_pi(s)
  unless /\A<\?([^ \t\n\r?]+)(?:[ \t\n\r]+|(?=\?\z))/ =~ s then
    parse_error "parse error at `<?'"
    s << '>' if @src.close_tag
    on_chardata s
  else
    target = $1
    pi = $'
    until pi[-1] == ?? and @src.close_tag
      s = @src.get_plain
      unless s then
        parse_error "unterminated PI meets EOF"
        return on_pi(target, pi)
      end
      pi << s
    end
    pi.chop!  # remove last `?'
    on_pi target, pi
  end
end
|
650
|
+
|
651
|
+
|
652
|
+
CDATAPattern = opt_regexp('\]\]\z')
|
653
|
+
|
654
|
+
# Scans a CDATA section whose `<![CDATA[' prefix has already been
# removed, and emits on_cdata with its content (without the closing
# `]]'). Fragments are appended from @src.get_plain until the text ends
# with `]]' at a tag close; EOF before that is reported and the text
# collected so far is emitted.
def scan_cdata(s)
  cdata = s
  re = CDATAPattern[@optkey]
  until re =~ cdata and @src.close_tag
    s = @src.get_plain
    unless s then
      parse_error "unterminated CDATA section meets EOF"
      return on_cdata(cdata)
    end
    cdata << s
  end
  cdata.chop!.chop!  # remove ']]'
  on_cdata cdata
end
|
668
|
+
|
669
|
+
|
670
|
+
# Reports an end tag that is not closed by `>', distinguishing whether
# another tag or the end of input follows.
def found_unclosed_etag(name)
  if @src.tag_start?
    parse_error "unclosed end tag `#{name}' meets another tag"
  else
    parse_error "unclosed end tag `#{name}' meets EOF"
  end
end
|
677
|
+
|
678
|
+
# Handles `</>': reports a parse error and emits the text as character
# data.
def found_empty_etag
  parse_error "parse error at `</'"
  on_chardata '</>'
end
|
682
|
+
|
683
|
+
|
684
|
+
# Scans an end tag starting at `</' and emits on_etag with its name.
# Malformed forms (`</>', `</' followed by another tag or EOF, `</ name')
# are reported and emitted as character data instead. Whitespace followed
# by more text inside the tag is reported and the rest of the tag is
# skipped. A missing `>' is reported through found_unclosed_etag.
def scan_etag(s)
  s[0,2] = ''  # remove '</'
  if s.empty? then
    if @src.close_tag then   # </>
      return found_empty_etag
    else                     # </< or </[EOF]
      parse_error "parse error at `</'"
      s << '>' if @src.close_tag
      return on_chardata('</' << s)
    end
  elsif /[ \t\n\r]+/ =~ s then
    s1, s2 = $`, $'
    if s1.empty? then                 # </ tag
      parse_error "parse error at `</'"
      s << '>' if @src.close_tag
      return on_chardata('</' + s)
    elsif not s2.empty? then          # </ta g
      parse_error "illegal whitespace is found within end tag `#{s1}'"
      while @src.get_tag
      end
    end
    s = s1
  end
  found_unclosed_etag s unless @src.close_tag  # </tag< or </tag[EOF]
  on_etag s
end
|
710
|
+
|
711
|
+
|
712
|
+
# Handles `<>': reports a parse error and emits the text as character
# data.
def found_empty_stag
  parse_error "parse error at `<'"
  on_chardata '<>'
end
|
716
|
+
|
717
|
+
# Reports a start tag that is not closed by `>', distinguishing whether
# another tag or the end of input follows.
def found_unclosed_stag(name)
  if @src.tag_start?
    parse_error "unclosed start tag `#{name}' meets another tag"
  else
    parse_error "unclosed start tag `#{name}' meets EOF"
  end
end
|
724
|
+
|
725
|
+
# Reports an empty-element tag that is not closed by `>', distinguishing
# whether another tag or the end of input follows.
def found_unclosed_emptyelem(name)
  if @src.tag_start?
    parse_error "unclosed empty element tag `#{name}' meets another tag"
  else
    parse_error "unclosed empty element tag `#{name}' meets EOF"
  end
end
|
732
|
+
|
733
|
+
|
734
|
+
# Reports the offending token at the start of +s+ inside a start tag and
# returns the text that follows it, so that scanning can resume there.
# The token is a single `/', `=' or quote character when +s+ starts with
# one; otherwise everything up to the next whitespace, `/', `=' or quote.
# Returns nil when the token runs to the end of +s+.
def found_stag_error(s)
  if /\A[\/='"]/ =~ s then
    tok, s = $&, $'
  elsif /(?=[ \t\n\r\/='"])/ =~ s then
    tok, s = $`, $'
  else
    tok, s = s, nil
  end
  parse_error "parse error at `#{tok}'"
  s
end
|
745
|
+
|
746
|
+
|
747
|
+
# Scans a start tag or empty-element tag starting at `<'.
#
# Emits on_stag, then for each attribute on_attribute, the value pieces
# (through scan_attvalue) and on_attribute_end, and finally either
# on_stag_end or on_stag_end_empty. An attribute value that is not
# terminated within the current fragment is continued with fragments
# from @src.get until the closing quote is found. Malformed input is
# reported via parse_error / wellformed_error; `<>' and `<' followed by
# a delimiter are emitted as character data.
def scan_stag(s)
  unless /(?=[\/ \t\n\r='"])/ =~ s then
    # No delimiter at all: the fragment is just `<name'.
    name = s
    name[0,1] = ''        # remove `<'
    if name.empty? then
      if @src.close_tag then   # <>
        return found_empty_stag
      else                     # << or <[EOF]
        parse_error "parse error at `<'"
        return on_chardata('<')
      end
    end
    on_stag name
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  else
    name = $`
    s = $'
    name[0,1] = ''        # remove `<'
    if name.empty? then   # `< tag' or `<=`
      parse_error "parse error at `<'"
      s << '>' if @src.close_tag
      return on_chardata('<' << s)
    end
    on_stag name
    emptyelem = false
    key,val,error,qmark,c = nil
    begin
      continue = false
      s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
             ) { |key,val,error|
        if key then     # key="value"
          on_attribute key
          qmark = val.slice!(0,1)
          if val[-1] == qmark[0] then
            val.chop!
            scan_attvalue val unless val.empty?
          else
            # The closing quote is not in this fragment: keep reading.
            scan_attvalue val unless val.empty?
            begin
              s = @src.get
              unless s then
                parse_error "unterminated attribute `#{key}' meets EOF"
                break
              end
              c = s[0]
              val, s = s.split(qmark, 2)
              if c == ?< then
                wellformed_error "`<' is found in attribute `#{key}'"
              elsif c != ?> then
                scan_attvalue '>'
              end
              scan_attvalue val if c
            end until s
            continue = s      # if eof then continue is false, else true.
          end
          on_attribute_end key
        elsif error then
          continue = s = found_stag_error(error)
        else
          emptyelem = true
        end
      }
    end while continue
    unless @src.close_tag then
      if emptyelem then
        found_unclosed_emptyelem name
      else
        found_unclosed_stag name
      end
    end
    if emptyelem then
      on_stag_end_empty name
    else
      on_stag_end name
    end
  end
end
|
825
|
+
|
826
|
+
|
827
|
+
# Handles an unrecognized `<!...' construct: reports a parse error,
# re-appends the `>' when the tag is closed, and emits the text as
# character data.
def scan_bang_tag(s)
  parse_error "parse error at `<!'"
  s << '>' if @src.close_tag
  on_chardata s
end
|
832
|
+
|
833
|
+
|
834
|
+
# Dispatches each fragment to the matching scanner based on its first
# characters (`</' end tag, `<!--' comment, `<![CDATA[' section, other
# `<!' construct, `<?' PI, other `<' start tag, anything else character
# data), then continues with fragments from @src.get until none remain.
def scan_content(s)
  src = @src  # for speed
  while s
    if (c = s[0]) == ?<
      if (c = s[1]) == ?/
        scan_etag s
      elsif c == ?!
        if s[2] == ?- and s[3] == ?-
          scan_comment s
        elsif /\A<!\[CDATA\[/ =~ s
          scan_cdata $'
        else
          scan_bang_tag s
        end
      elsif c == ??
        scan_pi s
      else
        scan_stag s
      end
    else
      scan_chardata s
    end
    s = src.get
  end
end
|
859
|
+
|
860
|
+
|
861
|
+
# Appends fragments from @src.get to +str+ until one contains the quote
# character +qmark+. A `>' is put back in front of every fragment that
# starts with neither `<' nor `>'. Returns the text following the quote
# in the final fragment, or nil when the input ends first.
def get_until_qmark(str, qmark)
  begin
    #s = @src.get_plain
    s = @src.get
    break unless s
    c = s[0]
    v, s = s.split(qmark, 2)
    str << '>' unless c == ?< or c == ?>    # De Morgan
    str << v if c
  end until s
  s
end
|
873
|
+
|
874
|
+
|
875
|
+
XMLDeclPattern = opt_regexp(%q{[ \t\n\r]([\-_\d\w]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|(\?\z)|([\-_.\d\w]+|[^ \t\n\r])})
|
876
|
+
|
877
|
+
# Scans the body of an XML declaration and fires the on_xmldecl* callbacks.
#
# s - the text following `<?xml' (scan_prolog passes the regexp post-match).
#
# Every `name = "value"' pair is reported through on_xmldecl_key. A small
# state machine checks the order of the pseudo-attributes:
#   0 - nothing seen yet       (only `version' is valid)
#   1 - `version' seen         (`encoding' or `standalone' is valid)
#   2 - `encoding' seen        (only `standalone' is valid)
#   3 - `standalone' seen      (nothing further is valid)
# A misplaced or unknown key is reported with parse_error and the state is
# advanced anyway so scanning can go on.
def scan_xmldecl(s)
  endmark = nil
  state = 0
  on_xmldecl
  begin
    continue = false
    s.scan(XMLDeclPattern[@optkey]) { |key, val, endtok, error|
      if key then
        qmark = val.slice!(0, 1) # remove quotation marks
        if val[-1] == qmark[0] then
          val.chop!
        else
          # The value is not terminated in this chunk: keep reading until
          # the matching quotation mark shows up. A non-nil remainder is
          # scanned by the next pass of the outer loop.
          continue = s = get_until_qmark(val, qmark)
          unless s then
            parse_error "unterminated XML declaration meets EOF"
            endmark = true # suppresses the duplicate EOF error below
          end
        end
        newstate = case state
                   when 0 then key == 'version' ? 1 : 4
                   when 1 then key == 'encoding' ? 2 : (key == 'standalone' ? 3 : 4)
                   else key == 'standalone' ? 3 : 4
                   end
        if newstate == 4 then
          known = %w{version encoding standalone}.member?(key)
          message = if known
                      "#{key} declaration must not be here"
                    else
                      "unknown declaration `#{key}' in XML declaration"
                    end
          parse_error message
          state = state < 2 ? 2 : 3
        else
          state = newstate
        end
        on_xmldecl_key key, val
      elsif endtok then
        if @src.close_tag then
          endmark = true
        else
          # Report the token that was actually matched. The previous code
          # interpolated `endmark', which is always nil here.
          parse_error "unexpected `#{endtok}' found in XML declaration"
          endmark = nil
        end
        # endtok only matches at the end of the string, so this is the
        # last iteration of the scan.
      else
        parse_error "parse error at `#{error}'"
      end
    }
  end while !endmark and continue || s = @src.get_plain
  parse_error "unterminated XML declaration meets EOF" unless s or endmark
  parse_error "no declaration found in XML declaration" if state == 0
  on_xmldecl_end
end
|
925
|
+
|
926
|
+
|
927
|
+
# Pattern used by skip_internal_dtd to find the delimiters that matter while
# skipping an internal DTD subset. Group 1 captures a quote character, a
# `<!--' or `<?' at the start of the string, or a `--' or `?' at its end.
# The ungrouped alternative matches a `]' (plus optional whitespace) at the
# end of the string, so the block sees nil for it.
# NOTE(review): the trailing #' looks like a quote-balancing hint for editor
# syntax highlighting - confirm before removing.
SkipDTD = opt_regexp(%q{(['"]|\A<!--|\A<\?|--\z|\?\z)|\]\s*\z}) #'
|
928
|
+
|
929
|
+
# Skips an internal DTD subset without interpreting it, i.e. consumes
# chunks until the closing `]>' is found outside of any quoted literal,
# comment or processing instruction.
#
# s - the first chunk of the subset; further chunks come from @src.get.
#
# Reports a parse error when the input ends before the subset is closed.
def skip_internal_dtd(s)
  open_quote = nil   # delimiter that will close the current quoted region
  searching = true
  begin
    s.scan(SkipDTD[@optkey]) do |token,|
      if open_quote
        # One-character quotes close on the same character; `--' and `?'
        # close only when @src.tag_end? is truthy.
        if open_quote == token && (open_quote.size == 1 || @src.tag_end?)
          open_quote = nil
        end
      elsif token
        case token
        when '<!--' then open_quote = '--'
        when '<?' then open_quote = '?'
        when '"', "'" then open_quote = token
        end
      elsif @src.close_tag
        # matched the ungrouped `]' alternative and a `>' follows
        searching = false
      end
    end
  end while searching && (s = @src.get)
  parse_error("unterminated internal DTD subset meets EOF") unless s
end
|
951
|
+
|
952
|
+
|
953
|
+
# Handles an internal DTD subset: issues a warning that it is unsupported
# and then skips over it with skip_internal_dtd.
#
# s - the chunk at which the subset begins.
def scan_internal_dtd(s)
  warning("internal DTD subset is not supported")
  skip_internal_dtd(s)
end
|
957
|
+
|
958
|
+
|
959
|
+
# Error hook invoked by scan_doctype when the keyword following the root
# element name is neither `PUBLIC' nor `SYSTEM'.
#
# pubsys - the offending keyword (not used by this implementation).
#
# Reports a parse error and returns the keyword to use instead, 'SYSTEM'.
def found_invalid_pubsys(pubsys)
  parse_error("`PUBLIC' or `SYSTEM' should be here")
  fallback = 'SYSTEM'
  fallback
end
|
963
|
+
|
964
|
+
|
965
|
+
# Token pattern for the body of a DOCTYPE declaration, built by opt_regexp
# (defined outside this chunk) and used by scan_doctype. Capture groups:
#   1 - a whitespace-preceded bare word (root name, PUBLIC/SYSTEM keyword)
#   2 - a whitespace-preceded quoted literal (closing quote optional)
#   3 - any other token or single non-space character, e.g. `['
# NOTE(review): the trailing #" looks like a quote-balancing hint for editor
# syntax highlighting - confirm before removing.
DoctypePattern = opt_regexp(%q{[ \t\n\r](?:([^ \t\n\r\/'"=\[]+)|('[^']*'?|"[^"]*"?))|([\-_.\d\w]+|[^ \t\n\r])}) #"
|
966
|
+
|
967
|
+
# Scans the body of a DOCTYPE declaration and reports it via on_doctype.
#
# s - the text following `<!DOCTYPE' (scan_prolog passes the regexp
#     post-match).
#
# Collects the root element name, the PUBLIC/SYSTEM keyword and up to two
# quoted external ID literals, reporting a parse error for anything out of
# place. When a `[' is found, the remaining text is handed to
# scan_internal_dtd after on_doctype has been called.
def scan_doctype(s)
  root = syspub = sysid = pubid = nil
  internal_dtd = false
  # Look the pattern up with @optkey, the same key scan_xmldecl and
  # skip_internal_dtd use for their opt_regexp tables (this used to read
  # `@opt', which no other code in view assigns).
  re = DoctypePattern[@optkey]
  begin
    if re =~ s
      name, str, delim, s = $1, $2, $3, $'
      if name
        if !root
          root = name
        elsif !syspub
          unless name == 'PUBLIC' || name == 'SYSTEM'
            name = found_invalid_pubsys(name)
          end
          syspub = name
        else
          parse_error "parse error at `#{name}'"
        end
      elsif str
        qmark = str.slice!(0, 1) # remove quotation marks
        if !syspub
          # A literal before PUBLIC/SYSTEM: complain about the quote and
          # rescan what followed it as ordinary tokens.
          parse_error "parse error at `#{qmark}'"
          s = str << s
        else
          if str[-1] == qmark[0]
            str.chop!
          else
            # literal continues in later chunks; '' forces a refill below
            s = get_until_qmark(str, qmark) || ''
          end
          if !sysid
            sysid = str
          elsif !pubid && syspub == 'PUBLIC'
            # second literal after PUBLIC: the first one was the public ID
            pubid = sysid
            sysid = str
          else
            parse_error "too many external ID literals in DOCTYPE"
          end
        end
      elsif delim == '['
        internal_dtd = true
        break
      else
        parse_error "parse error at `#{delim}'"
      end
    else
      s = ''
    end
    if s.empty?
      break if @src.close_tag
      s = @src.get_plain
    end
  end while s
  parse_error "unterminated DOCTYPE declaration meets EOF" unless s
  parse_error "no root element is specified in DOCTYPE" unless root
  if syspub && !sysid
    parse_error "too few external ID literals in DOCTYPE"
  end
  if syspub == 'PUBLIC' && !pubid
    # PUBLIC with a single literal: treat that literal as the public ID
    pubid, sysid = sysid, nil
  end
  on_doctype root, pubid, sysid
  scan_internal_dtd s if internal_dtd
end
|
1032
|
+
|
1033
|
+
|
1034
|
+
# Scans the document prolog and then hands over to scan_content.
#
# s - the first chunk of the document (may be nil).
#
# An XML declaration is only recognised in the very first chunk. After it,
# comments, processing instructions, whitespace and at most one DOCTYPE
# declaration are consumed; the first chunk that is none of those ends the
# prolog and becomes the first chunk given to scan_content.
def scan_prolog(s)
  if /\A<\?xml(?=[ \t\n\r])/ =~ s
    scan_xmldecl($')
    s = @src.get
  end
  doctype_allowed = true
  source = @src # local alias, kept from the original "for speed"
  while s
    if s[0] == ?<
      case s[1]
      when ?!
        if s[2] == ?- && s[3] == ?-
          scan_comment(s)
        elsif /\A<!DOCTYPE(?=[ \t\n\r])/ =~ s && doctype_allowed
          doctype_allowed = false
          scan_doctype($')
        else
          break
        end
      when ??
        scan_pi(s)
      else
        break
      end
      s = source.get
    elsif /[^ \t\r\n]/ !~ s
      # whitespace-only (or empty) chunk between prolog items
      on_prolog_space(s) unless s.empty?
      s = source.get_plain
    else
      break
    end
  end
  scan_content(s || source.get)
end
|
1067
|
+
|
1068
|
+
|
1069
|
+
# Drives one complete scan: fires on_start_document, prepares @src, scans
# everything starting from the first chunk, then fires on_end_document.
def scan_document
  on_start_document
  @src.prepare
  first_chunk = @src.get
  scan_prolog(first_chunk)
  on_end_document
end
|
1075
|
+
|
1076
|
+
|
1077
|
+
# Factory hook: wraps the raw input in a Source object.
#
# src - the input object given to parse_document.
#
# Returns a new Source.
def make_source(src)
  Source.new(src)
end
|
1080
|
+
|
1081
|
+
|
1082
|
+
public
|
1083
|
+
|
1084
|
+
# Parses a whole document.
#
# src - the input; it is wrapped by make_source and kept in @src for the
#       duration of the scan.
#
# @src is reset to nil when scanning finishes, whether it completed or
# raised. Returns self.
def parse_document(src)
  @src = make_source(src)
  begin
    scan_document
    self
  ensure
    @src = nil
  end
end
|
1093
|
+
|
1094
|
+
# `parse' is an alternative name for parse_document.
alias parse parse_document
|
1095
|
+
|
1096
|
+
end
|
1097
|
+
|
1098
|
+
|
1099
|
+
end
|
1100
|
+
|
1101
|
+
|
1102
|
+
|
1103
|
+
|
1104
|
+
|
1105
|
+
# Ad-hoc benchmark, run only when this file is executed directly: scans the
# input named on the command line (or stdin, via ARGF) and prints the user
# CPU time spent. Parse and well-formedness errors are printed to STDERR
# only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  # $s gives TestVisitor access to the scanner for error locations.
  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Time.times no longer exists (removed in Ruby 1.9); Process.times
  # provides the same user CPU time figure.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
|