xmlscan 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +1276 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +31 -0
- data/README.rdoc +365 -0
- data/Rakefile +65 -0
- data/THANKS +11 -0
- data/VERSION +1 -0
- data/install.rb +41 -0
- data/lib/xmlscan/htmlscan.rb +290 -0
- data/lib/xmlscan/namespace.rb +353 -0
- data/lib/xmlscan/parser.rb +300 -0
- data/lib/xmlscan/scanner.rb +1123 -0
- data/lib/xmlscan/version.rb +23 -0
- data/lib/xmlscan/visitor.rb +162 -0
- data/lib/xmlscan/xmlchar.rb +248 -0
- data/test.rb +7 -0
- metadata +113 -0
@@ -0,0 +1,300 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/parser.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: parser.rb,v 1.10 2003/01/22 13:06:18 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'xmlscan/scanner'
|
11
|
+
|
12
|
+
|
13
|
+
module XMLScan
|
14
|
+
|
15
|
+
class XMLParser < XMLScanner
|
16
|
+
|
17
|
+
# Remembers which attribute names have already been registered.
# AttributeChecker inherits Hash only for speed.
class AttributeChecker < Hash

  # Registers +name+.
  #
  # Returns true when +name+ had not been registered before, and false
  # when it was already present (the stored entry is left untouched).
  def check_unique(name)
    return false if key?(name)
    store(name, true)
  end

end
|
25
|
+
|
26
|
+
|
27
|
+
#PredefinedEntity = {
|
28
|
+
# 'lt' => '<',
|
29
|
+
# 'gt' => '>',
|
30
|
+
# 'amp' => '&',
|
31
|
+
# 'quot' => '"',
|
32
|
+
# 'apos' => "'",
|
33
|
+
#}
|
34
|
+
|
35
|
+
|
36
|
+
def parse(*)
|
37
|
+
@elem = []
|
38
|
+
@attr = AttributeChecker.new
|
39
|
+
@standalone = nil
|
40
|
+
super
|
41
|
+
end
|
42
|
+
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Handles the `version' pseudo-attribute of the XML declaration.
#
# Emits a warning for any value other than '1.0', then always forwards
# the value to the visitor.
def on_xmldecl_version(str)
  warning "unsupported XML version `#{str}'" if str != '1.0'
  @visitor.on_xmldecl_version str
end
|
52
|
+
|
53
|
+
|
54
|
+
# Handles the `standalone' pseudo-attribute of the XML declaration.
#
# Records 'yes'/'no' in @standalone as true/false; any other value is
# reported through parse_error and leaves @standalone unchanged.
# The raw value is forwarded to the visitor in every case.
def on_xmldecl_standalone(str)
  case str
  when 'yes'
    @standalone = true
  when 'no'
    @standalone = false
  else
    parse_error "standalone declaration must be either `yes' or `no'"
  end
  @visitor.on_xmldecl_standalone str
end
|
64
|
+
|
65
|
+
|
66
|
+
# Checks a document type declaration before passing it on.
#
# A public ID given without a system ID is reported through
# parse_error; the declaration is forwarded to the visitor regardless.
def on_doctype(name, pubid, sysid)
  lacks_system_id = pubid && !sysid
  if lacks_system_id
    parse_error "public external ID must have both public ID and system ID"
  end
  @visitor.on_doctype name, pubid, sysid
end
|
72
|
+
|
73
|
+
|
74
|
+
def on_prolog_space(s)
|
75
|
+
# just ignore it.
|
76
|
+
end
|
77
|
+
|
78
|
+
|
79
|
+
# Checks a processing instruction before passing it on.
#
# A target spelling `xml' in any letter case is reported through
# parse_error; the instruction is forwarded to the visitor regardless.
def on_pi(target, pi)
  reserved = target.downcase == 'xml'
  parse_error "reserved PI target `#{target}'" if reserved
  @visitor.on_pi target, pi
end
|
85
|
+
|
86
|
+
|
87
|
+
#def on_entityref(ref)
|
88
|
+
# rep = PredefinedEntity[ref]
|
89
|
+
# if rep then
|
90
|
+
# @visitor.on_chardata rep
|
91
|
+
# else
|
92
|
+
# @visitor.on_entityref ref
|
93
|
+
# end
|
94
|
+
#end
|
95
|
+
|
96
|
+
|
97
|
+
#def on_attr_entityref(ref)
|
98
|
+
# rep = PredefinedEntity[ref]
|
99
|
+
# if rep then
|
100
|
+
# @visitor.on_attr_value rep
|
101
|
+
# else
|
102
|
+
# @visitor.on_attr_entityref ref
|
103
|
+
# end
|
104
|
+
#end
|
105
|
+
|
106
|
+
|
107
|
+
#def on_charref_hex(code)
|
108
|
+
# on_charref code
|
109
|
+
#end
|
110
|
+
|
111
|
+
|
112
|
+
#def on_attr_charref_hex(code)
|
113
|
+
# on_attr_charref code
|
114
|
+
#end
|
115
|
+
|
116
|
+
|
117
|
+
def on_stag(name)
|
118
|
+
@elem.push name
|
119
|
+
@visitor.on_stag name
|
120
|
+
@attr.clear
|
121
|
+
end
|
122
|
+
|
123
|
+
# Registers an attribute name with @attr and reports a well-formedness
# error when the name was already registered; the name is forwarded to
# the visitor either way.
def on_attribute(name)
  repeated = !@attr.check_unique(name)
  wellformed_error "doubled attribute `#{name}'" if repeated
  @visitor.on_attribute name
end
|
129
|
+
|
130
|
+
# Normalizes a piece of attribute value text and forwards it.
#
# Every tab, carriage return and line feed in +str+ is replaced by a
# single space. The replacement is done in place (tr!), so the string
# object passed by the caller is modified; the same object is then
# handed to the visitor.
def on_attr_value(str)
|
131
|
+
str.tr! "\t\r\n", ' ' # normalize
|
132
|
+
@visitor.on_attr_value str
|
133
|
+
end
|
134
|
+
|
135
|
+
def on_stag_end_empty(name)
|
136
|
+
# @visitor.on_stag_end name
|
137
|
+
# @elem.pop
|
138
|
+
# @visitor.on_etag name
|
139
|
+
@visitor.on_stag_end_empty name
|
140
|
+
@elem.pop
|
141
|
+
end
|
142
|
+
|
143
|
+
# Matches an end tag against the innermost open element.
#
# The innermost name is always popped from @elem. Then:
# * same name         -> the end tag is forwarded to the visitor;
# * different name    -> a well-formedness error is reported and the
#                        visitor receives an end tag for the popped name;
# * no open element   -> a parse error is reported and nothing is
#                        forwarded.
def on_etag(name)
  opened = @elem.pop
  return @visitor.on_etag(name) if opened == name
  return parse_error("end tag `#{name}' appears alone") unless opened

  wellformed_error "element type `#{name}' is not matched"
  @visitor.on_etag opened
end
|
154
|
+
|
155
|
+
|
156
|
+
public
|
157
|
+
|
158
|
+
|
159
|
+
# Scans the document body, enforcing a single root element.
#
# +s+ is the first chunk to examine; following chunks are pulled from
# @src with +get+. Each pass of the outer loop has three phases:
#
# 1. "first start tag": everything before a start tag is scanned but
#    character data and CDATA sections are reported as parse errors.
# 2. "contents": chunks are dispatched normally while @elem is
#    non-empty; elements still open when the input ends are reported
#    and closed through the visitor.
# 3. "epilogue": only comments, processing instructions and blank text
#    are consumed. Anything else restarts the outer loop with the
#    offending chunk (a start tag is first reported as another root
#    element).
def scan_content(s)
  elem = @elem    # for speed
  src = @src      # for speed
  found_root_element = false

  while true

    # -- first start tag --
    elem.clear
    found_stag = false

    while s && !found_stag
      if s[0] == ?<
        c = s[1]
        if c == ?/
          # should be a parse error
          scan_etag s
        elsif c == ?!
          if s[2] == ?- && s[3] == ?-
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            parse_error "CDATA section is found outside of root element"
            scan_cdata $'
          else
            scan_bang_tag s
          end
        elsif c == ??
          scan_pi s
        else
          found_root_element = true
          found_stag = true
          scan_stag s
        end
      else
        parse_error "content of element is found outside of root element"
        scan_chardata s
      end
      s = src.get
    end

    unless found_root_element || found_stag
      parse_error "no root element was found"
    end

    # -- contents --
    while s && !elem.empty?
      if s[0] == ?<
        c = s[1]
        if c == ?/
          scan_etag s
        elsif c == ?!
          if s[2] == ?- && s[3] == ?-
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            scan_cdata $'
          else
            scan_bang_tag s
          end
        elsif c == ??
          scan_pi s
        else
          scan_stag s
        end
      else
        scan_chardata s
      end
      s = src.get
    end

    # Input ended (or ran dry) with elements still open: close them,
    # innermost first.
    while (name = elem.pop)
      parse_error "unclosed element `#{name}' meets EOF"
      @visitor.on_etag name
    end

    # -- epilogue --
    finish = true

    while s
      if s[0] == ?<
        c = s[1]
        if c == ??
          scan_pi s
        elsif c == ?! && s[2] == ?- && s[3] == ?-
          scan_comment s
        else
          # An end tag or a non-comment `<!' construct is content out of
          # the root element; anything else here is a start tag.
          parse_error "another root element is found" unless c == ?/ || c == ?!
          finish = false
          break
        end
      elsif s.strip.empty?
        on_prolog_space s
      else
        finish = false # content out of root element
        break
      end
      s = src.get
    end

    break if finish
  end

end
|
269
|
+
end
|
270
|
+
|
271
|
+
|
272
|
+
end
|
273
|
+
|
274
|
+
|
275
|
+
|
276
|
+
|
277
|
+
|
278
|
+
|
279
|
+
# Self-test driver: when this file is executed directly it parses the
# input named on the command line (ARGF) and prints the user CPU time
# spent, reporting diagnostics only when $VERBOSE is set.
if $0 == __FILE__ then
  # Visitor that prints every diagnostic to STDERR, prefixed with the
  # current path and line number of the global scanner $s.
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def warning(msg)
      STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLParser.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Time.times no longer exists (removed in Ruby 1.9); Process.times
  # returns the same user-CPU figure.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
|
@@ -0,0 +1,1123 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/scanner.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: scanner.rb,v 1.75.2.3 2003/05/01 15:43:23 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
#
|
11
|
+
# CONSIDERATIONS FOR CHARACTER ENCODINGS:
|
12
|
+
#
|
13
|
+
# There are the following common characteristics in character encodings
|
14
|
+
# which are supported by Ruby's $KCODE feature (ISO-8859-*, Shift_JIS,
|
15
|
+
# EUC, and UTF-8):
|
16
|
+
#
|
17
|
+
# - Stateless.
|
18
|
+
# - ASCII characters are encoded in the same manner as US-ASCII.
|
19
|
+
# - The octet sequences corresponding to non-ASCII characters begin
|
20
|
+
# with an octet greater than 0x80.
|
21
|
+
# - The following characters can be identified by just one octet.
|
22
|
+
# That is, every octet corresponding to the following characters in
|
23
|
+
# US-ASCII never appears as a part of an octet sequence representing a
|
24
|
+
# non-ASCII character.
|
25
|
+
#
|
26
|
+
# Whitespaces("\t", "\n", "\r", and " ") and
|
27
|
+
# ! \ " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
28
|
+
#
|
29
|
+
# Be careful that `[' and `]' are NOT included in the list!
|
30
|
+
#
|
31
|
+
# If we build a regular expression carefully in accordance with these
|
32
|
+
# characteristics, we can get the same match regardless of the value
|
33
|
+
# of $KCODE. Moreover, if it can be premised on them, we can detect
|
34
|
+
# several delimiters without regular expressions. XMLScanner uses this
|
35
|
+
# fact in order to share many regular expressions in all $KCODE modes,
|
36
|
+
# and in order to optimize parsing speed.
|
37
|
+
#
|
38
|
+
|
39
|
+
require 'xmlscan/visitor'
|
40
|
+
|
41
|
+
|
42
|
+
module XMLScan
|
43
|
+
|
44
|
+
# Adapter giving any input object the +gets+ / +lineno+ / +path+
# interface.
#
# * An object that already responds to +gets+ is read through directly.
# * An object responding to +to_ary+ is read one element per +gets+,
#   and +lineno+ counts the elements handed out so far.
# * Anything else is returned whole by the first +gets+; later calls
#   return nil.
#
# +lineno+ and +path+ are delegated to the wrapped object when it
# provides them and otherwise fall back to 0 and '-'.
class Input

  def initialize(src)
    @src = src
    unless src.respond_to?(:gets)
      if src.respond_to?(:to_ary)
        @v = src.to_ary
        @n = -1
        define_singleton_method(:gets) { @v.at(@n += 1) }
        define_singleton_method(:lineno) { @n + 1 }
      else
        @v = @src
        define_singleton_method(:gets) do
          held = @v
          @v = nil
          held
        end
      end
    end
    # Defined last so the source's own line counter takes precedence.
    define_singleton_method(:lineno) { @src.lineno } if src.respond_to?(:lineno)
    define_singleton_method(:path) { @src.path } if src.respond_to?(:path)
  end

  attr_reader :src

  def gets
    @src.gets
  end

  def lineno
    0
  end

  def path
    '-'
  end

  # Returns +src+ itself when it already offers gets, lineno and path;
  # otherwise returns a new Input around it.
  def self.wrap(src)
    complete = [:gets, :lineno, :path].all? { |meth| src.respond_to?(meth) }
    complete ? src : new(src)
  end

  # Inverse of wrap: returns the wrapped object for an Input, and any
  # other object unchanged.
  def self.unwrap(obj)
    self === obj ? obj.src : obj
  end

end
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
# An Array whose inherited instance methods are made private, except
# for those also listed by Kernel.instance_methods.
class PrivateArray < Array
  private(*(superclass.instance_methods - Kernel.instance_methods))
end
|
97
|
+
|
98
|
+
|
99
|
+
# Chunk queue between the raw input and the scanner.
#
# Each string read from the input is split so that a chunk either
# starts with `<' (a tag opener and what follows it) or is the text
# that followed a `>' (the `>' itself is dropped by the split). Chunks
# are kept in reverse order so the next one is taken with +pop+; the
# final fragment of a read is held in @last until more input shows
# whether it continues.
class Source < PrivateArray
  # Source inherits Array only for speed.

  def initialize(src)
    super()
    @src = Input.wrap(src)
    @eof = false
    @last = nil
  end

  # The original input object (unwrapped from Input if necessary).
  def source
    Input.unwrap @src
  end


  # True once the input is exhausted and no chunk is queued.
  def eof?
    @eof && empty?
  end

  # Discards everything queued and marks the input as exhausted.
  def abort
    @eof = true
    @last = nil
    clear
    self
  end


  # Returns the next chunk, reading and splitting more input when the
  # queue is empty. Returns nil at end of input.
  def get
    queued = pop
    return queued if queued
    return nil if @eof

    last = @last
    while true
      src = @src.gets
      unless src
        # End of input: the held fragment becomes the final chunk.
        @eof = true
        unshift last
        last = nil
        break
      end
      a = src.split(/(?=<|>[<>])|>/, -1)
      if last
        if /\A[<>]/ =~ a.first
          push last
        else
          # The new input continues the fragment held from the last read.
          a[0] = last << (a.first || '')
        end
      end
      concat a
      last = pop
      break unless empty?
    end
    @last = last
    reverse!
    pop
  end


  # Primes the queue; an empty first chunk is merged into the next one
  # with a leading `>'.
  def prepare
    s = get
    if s && s.empty?
      s = get
      s = '>' << s if s # preserve first `>'
    end
    s && push(s)
  end


  # True when the upcoming chunk does not start with `<'.
  def tag_end?
    upcoming = last || @last
    upcoming && upcoming[0] != ?<
  end

  # True when the upcoming chunk starts with `<'.
  def tag_start?
    upcoming = last || @last
    upcoming && upcoming[0] == ?<
  end

  def close_tag # tag_end?, and remove a `>'.
    upcoming = last || @last
    return false unless upcoming && upcoming[0] != ?<

    if upcoming == '>' || upcoming.empty?
      first = get
      upcoming = last || @last
      unless upcoming && upcoming[0] == ?< # for speed up
        out = [first]
        while (upcoming = last || @last) && (upcoming == '>' || upcoming.empty?)
          out.push get
        end
        out.pop if !upcoming || upcoming[0] == ?<
        concat out
      end
    end
    true
  end


  def get_text # get until tag_start?
    upcoming = last || @last
    upcoming && upcoming[0] != ?< && get
  end

  def get_tag # get until tag_end?
    upcoming = last || @last
    upcoming && upcoming[0] == ?< && get
  end

  # Like get, but a chunk that starts with neither `<' nor `>' is
  # returned with a leading `>'.
  def get_plain
    chunk = get
    return chunk unless chunk
    lead = chunk[0]
    lead == ?< || lead == ?> ? chunk : '>' << chunk
  end

  def lineno
    @src.lineno
  end

  def path
    @src.path
  end


  # The following methods are for debug.

  def inspect
    a = []
    reverse_each { |i|
      a.push ">" unless /\A[<>]/ =~ i
      a.push i.inspect
    }
    held = []
    if @last
      held.push ">" unless /\A[<>]/ =~ @last
      held.push @last.inspect
    end
    a.push '#eof' if @eof
    "((#{a.join(' ')}) (#{held.join(' ')}) . #{source.inspect})"
  end

  def each
    prepare
    while (s = get)
      yield s
    end
    self
  end

  def test
    last || @last || ((s = get) && push(s) && s)
  end

end
|
242
|
+
|
243
|
+
|
244
|
+
|
245
|
+
class XMLScanner
|
246
|
+
|
247
|
+
class << self

  # Names of the options this class provides: the part of every
  # private instance method name that follows `apply_option_'.
  def provided_options
    private_instance_methods.each_with_object([]) { |meth, names|
      names.push $' if /\Aapply_option_/ =~ meth
    }
  end

  # Applies one option by calling the instance's apply_option_<option>.
  def apply_option(instance, option)
    instance.__send__ "apply_option_#{option}"
  end

  # Applies every requested option to +instance+, asking each ancestor
  # that provides options in turn. Raises ArgumentError when a
  # requested option is provided by no ancestor. Returns +instance+.
  def apply_options(instance, options)
    pending = {}
    options.each { |opt| pending[opt.to_s] = true }
    ancestors.each do |klass|
      next unless klass.respond_to? :provided_options
      klass.provided_options.each do |opt|
        next unless pending.include? opt
        pending.delete opt
        klass.apply_option instance, opt
      end
    end
    unless pending.empty?
      raise ArgumentError, "undefined option `#{pending.keys[0]}'"
    end
    instance
  end
  private :apply_options

  # Builds a scanner for +visitor+ and applies the given options to it.
  def new(visitor, *options)
    scanner = super(visitor)
    apply_options scanner, options
  end

end
|
288
|
+
|
289
|
+
|
290
|
+
|
291
|
+
def initialize(visitor)
|
292
|
+
@visitor = visitor
|
293
|
+
@decoration = nil
|
294
|
+
@src = nil
|
295
|
+
@optkey = nil
|
296
|
+
end
|
297
|
+
|
298
|
+
attr_accessor :optkey
|
299
|
+
|
300
|
+
def opt_encoding() OptRegexp::RE_ENCODINGS[optkey] end
|
301
|
+
|
302
|
+
|
303
|
+
def decorate(decoration)
|
304
|
+
unless @decoration then
|
305
|
+
@visitor = @decoration = Decoration.new(@visitor)
|
306
|
+
end
|
307
|
+
@decoration.expand decoration
|
308
|
+
end
|
309
|
+
private :decorate
|
310
|
+
|
311
|
+
|
312
|
+
def lineno
|
313
|
+
@src && @src.lineno
|
314
|
+
end
|
315
|
+
|
316
|
+
def path
|
317
|
+
@src && @src.path
|
318
|
+
end
|
319
|
+
|
320
|
+
# Returns the source object held by @src, or nil while @src is still
# nil (its value straight after initialize). The nil guard matches the
# sibling readers lineno and path, so calling this early no longer
# raises NoMethodError.
def source
  @src && @src.source
end
|
323
|
+
|
324
|
+
|
325
|
+
private
|
326
|
+
|
327
|
+
def parse_error(msg)
|
328
|
+
@visitor.parse_error msg
|
329
|
+
end
|
330
|
+
|
331
|
+
def wellformed_error(msg)
|
332
|
+
@visitor.wellformed_error msg
|
333
|
+
end
|
334
|
+
|
335
|
+
def valid_error(msg)
|
336
|
+
@visitor.valid_error msg
|
337
|
+
end
|
338
|
+
|
339
|
+
def warning(msg)
|
340
|
+
@visitor.warning msg
|
341
|
+
end
|
342
|
+
|
343
|
+
|
344
|
+
def on_xmldecl
|
345
|
+
@visitor.on_xmldecl
|
346
|
+
end
|
347
|
+
|
348
|
+
# Dispatches one `key = value' pair of the XML declaration.
#
# When both the visitor and this scanner have an on_xmldecl_<key>
# handler, and the scanner's handler can be called with exactly one
# argument, that handler receives +str+. Every other key goes to
# on_xmldecl_other together with its name.
#
# The extra checks on the scanner's own handler matter because +key+
# comes from the document: keys such as `end' or `other' name
# on_xmldecl_* methods with a different argument count, and sending
# them a single argument would raise ArgumentError.
def on_xmldecl_key(key, str)
  meth = "on_xmldecl_#{key}"
  if @visitor.respond_to?(meth) && respond_to?(meth, true) &&
     [1, -1, -2].include?(method(meth).arity)
    send meth, str
  else
    on_xmldecl_other key, str
  end
end
|
356
|
+
|
357
|
+
def on_xmldecl_version(str)
|
358
|
+
@visitor.on_xmldecl_version str
|
359
|
+
end
|
360
|
+
|
361
|
+
def on_xmldecl_encoding(str)
|
362
|
+
@visitor.on_xmldecl_encoding str
|
363
|
+
end
|
364
|
+
|
365
|
+
def on_xmldecl_standalone(str)
|
366
|
+
@visitor.on_xmldecl_standalone str
|
367
|
+
end
|
368
|
+
|
369
|
+
def on_xmldecl_other(name, value)
|
370
|
+
@visitor.on_xmldecl_other name, value
|
371
|
+
end
|
372
|
+
|
373
|
+
def on_xmldecl_end
|
374
|
+
@visitor.on_xmldecl_end
|
375
|
+
end
|
376
|
+
|
377
|
+
def on_doctype(root, pubid, sysid)
|
378
|
+
@visitor.on_doctype root, pubid, sysid
|
379
|
+
end
|
380
|
+
|
381
|
+
def on_prolog_space(str)
|
382
|
+
@visitor.on_prolog_space str
|
383
|
+
end
|
384
|
+
|
385
|
+
def on_comment(str)
|
386
|
+
@visitor.on_comment str
|
387
|
+
end
|
388
|
+
|
389
|
+
def on_pi(target, pi)
|
390
|
+
@visitor.on_pi target, pi
|
391
|
+
end
|
392
|
+
|
393
|
+
def on_chardata(str)
|
394
|
+
@visitor.on_chardata str
|
395
|
+
end
|
396
|
+
|
397
|
+
def on_cdata(str)
|
398
|
+
@visitor.on_cdata str
|
399
|
+
end
|
400
|
+
|
401
|
+
def on_etag(name)
|
402
|
+
@visitor.on_etag name
|
403
|
+
end
|
404
|
+
|
405
|
+
def on_entityref(ref)
|
406
|
+
@visitor.on_entityref ref
|
407
|
+
end
|
408
|
+
|
409
|
+
def on_charref(code)
|
410
|
+
@visitor.on_charref code
|
411
|
+
end
|
412
|
+
|
413
|
+
def on_charref_hex(code)
|
414
|
+
@visitor.on_charref_hex code
|
415
|
+
end
|
416
|
+
|
417
|
+
def on_start_document
|
418
|
+
@visitor.on_start_document
|
419
|
+
end
|
420
|
+
|
421
|
+
def on_end_document
|
422
|
+
@visitor.on_end_document
|
423
|
+
end
|
424
|
+
|
425
|
+
|
426
|
+
# <hoge fuga="foo&bar;&&foo" />HOGE
|
427
|
+
# ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
|
428
|
+
# 1 2 3 4 5 6 7 8 9 A
|
429
|
+
#
|
430
|
+
# The following method will be called with the following arguments
|
431
|
+
# when the parser reaches the above point;
|
432
|
+
#
|
433
|
+
# 1: on_stag ('hoge')
|
434
|
+
# 2: on_attribute ('fuga')
|
435
|
+
# 3: on_attr_value ('foo')
|
436
|
+
# 4: on_attr_entityref ('bar')
|
437
|
+
# 5: on_attr_charref (38)
|
438
|
+
# 6: on_attr_charref_hex (38)
|
439
|
+
# 7: on_attr_value ('foo')
|
440
|
+
# 8: on_attribute_end ('fuga')
|
441
|
+
# 9: on_stag_end_empty ('hoge')
|
442
|
+
# or
|
443
|
+
# on_stag_end ('hoge')
|
444
|
+
#
|
445
|
+
# A: on_chardata ('HOGE')
|
446
|
+
|
447
|
+
def on_stag(name)
|
448
|
+
@visitor.on_stag name
|
449
|
+
end
|
450
|
+
|
451
|
+
def on_attribute(name)
|
452
|
+
@visitor.on_attribute name
|
453
|
+
end
|
454
|
+
|
455
|
+
def on_attr_value(str)
|
456
|
+
@visitor.on_attr_value str
|
457
|
+
end
|
458
|
+
|
459
|
+
def on_attr_entityref(ref)
|
460
|
+
@visitor.on_attr_entityref ref
|
461
|
+
end
|
462
|
+
|
463
|
+
def on_attr_charref(code)
|
464
|
+
@visitor.on_attr_charref code
|
465
|
+
end
|
466
|
+
|
467
|
+
def on_attr_charref_hex(code)
|
468
|
+
@visitor.on_attr_charref_hex code
|
469
|
+
end
|
470
|
+
|
471
|
+
def on_attribute_end(name)
|
472
|
+
@visitor.on_attribute_end name
|
473
|
+
end
|
474
|
+
|
475
|
+
def on_stag_end_empty(name)
|
476
|
+
@visitor.on_stag_end_empty name
|
477
|
+
end
|
478
|
+
|
479
|
+
def on_stag_end(name)
|
480
|
+
@visitor.on_stag_end name
|
481
|
+
end
|
482
|
+
|
483
|
+
|
484
|
+
|
485
|
+
private
|
486
|
+
|
487
|
+
module OptRegexp
|
488
|
+
UTFSTR = "é"
|
489
|
+
S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
|
490
|
+
E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
|
491
|
+
|
492
|
+
RE_ENCODINGS = {
|
493
|
+
:n=>/e/n.encoding,
|
494
|
+
:e=>/#{E_OPT_EXAMPLE}/e.encoding,
|
495
|
+
:s=>/#{S_OPT_EXAMPLE}/s.encoding,
|
496
|
+
:u=>/#{UTFSTR}/u.encoding
|
497
|
+
}
|
498
|
+
|
499
|
+
RE_ENCODING_OPTIONS = {
|
500
|
+
:n=>/e/n.options,
|
501
|
+
:e=>/#{E_OPT_EXAMPLE}/e.options,
|
502
|
+
:s=>/#{S_OPT_EXAMPLE}/s.options,
|
503
|
+
:u=>/#{UTFSTR}/u.options
|
504
|
+
}
|
505
|
+
|
506
|
+
private
|
507
|
+
def opt_regexp(re)
|
508
|
+
h = {}
|
509
|
+
RE_ENCODING_OPTIONS.each { |k,opt|
|
510
|
+
h[k] = Regexp.new(re.encode(RE_ENCODINGS[k]), opt)
|
511
|
+
}
|
512
|
+
h.default = Regexp.new(re)
|
513
|
+
h
|
514
|
+
end
|
515
|
+
end
|
516
|
+
extend OptRegexp
|
517
|
+
|
518
|
+
|
519
|
+
InvalidEntityRef = opt_regexp('(?=[^#\d\w]|\z)')
|
520
|
+
|
521
|
+
def scan_chardata(s)
|
522
|
+
while true
|
523
|
+
unless /&/ =~ s then
|
524
|
+
on_chardata s
|
525
|
+
else
|
526
|
+
s = $`
|
527
|
+
on_chardata s unless s.empty?
|
528
|
+
ref = nil
|
529
|
+
$'.split('&', -1).each { |s|
|
530
|
+
unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
|
531
|
+
if InvalidEntityRef[@optkey] =~ s and not (ref = $`).strip.empty?
|
532
|
+
then
|
533
|
+
parse_error "reference to `#{ref}' doesn't end with `;'"
|
534
|
+
else
|
535
|
+
parse_error "`&' is not used for entity/character references"
|
536
|
+
on_chardata('&' << s)
|
537
|
+
next
|
538
|
+
end
|
539
|
+
end
|
540
|
+
ref = $`
|
541
|
+
s = $'
|
542
|
+
if /\A[^#]/ =~ ref then
|
543
|
+
on_entityref ref
|
544
|
+
elsif /\A#(\d+)\z/ =~ ref then
|
545
|
+
on_charref $1.to_i
|
546
|
+
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
547
|
+
on_charref_hex $1.hex
|
548
|
+
else
|
549
|
+
parse_error "invalid character reference `#{ref}'"
|
550
|
+
end
|
551
|
+
on_chardata s unless s.empty?
|
552
|
+
}
|
553
|
+
end
|
554
|
+
s = @src.get_text
|
555
|
+
break unless s
|
556
|
+
s = '>' << s unless s == '>'
|
557
|
+
end
|
558
|
+
end
|
559
|
+
|
560
|
+
|
561
|
+
def scan_attvalue(s) # almostly copy & paste from scan_chardata
|
562
|
+
unless /&/ =~ s then
|
563
|
+
on_attr_value s
|
564
|
+
else
|
565
|
+
s = $`
|
566
|
+
on_attr_value s unless s.empty?
|
567
|
+
ref = nil
|
568
|
+
$'.split('&', -1).each { |s|
|
569
|
+
unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
|
570
|
+
if InvalidEntityRef[@optkey] =~ s and not (ref = $`).strip.empty?
|
571
|
+
then
|
572
|
+
parse_error "reference to `#{ref}' doesn't end with `;'"
|
573
|
+
else
|
574
|
+
parse_error "`&' is not used for entity/character references"
|
575
|
+
on_attr_value('&' << s)
|
576
|
+
next
|
577
|
+
end
|
578
|
+
end
|
579
|
+
ref = $`
|
580
|
+
s = $'
|
581
|
+
if /\A[^#]/ =~ ref then
|
582
|
+
on_attr_entityref ref
|
583
|
+
elsif /\A#(\d+)\z/ =~ ref then
|
584
|
+
on_attr_charref $1.to_i
|
585
|
+
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
586
|
+
on_attr_charref_hex $1.hex
|
587
|
+
else
|
588
|
+
parse_error "invalid character reference `#{ref}'"
|
589
|
+
end
|
590
|
+
on_attr_value s unless s.empty?
|
591
|
+
}
|
592
|
+
end
|
593
|
+
end
|
594
|
+
|
595
|
+
|
596
|
+
def scan_comment(s)
|
597
|
+
s[0,4] = '' # remove `<!--'
|
598
|
+
comm = ''
|
599
|
+
until /--/ =~ s
|
600
|
+
comm << s
|
601
|
+
s = @src.get_plain
|
602
|
+
unless s then
|
603
|
+
parse_error "unterminated comment meets EOF"
|
604
|
+
return on_comment(comm)
|
605
|
+
end
|
606
|
+
end
|
607
|
+
comm << $`
|
608
|
+
until (s = $').empty? and @src.close_tag
|
609
|
+
if s == '-' and @src.close_tag then # --->
|
610
|
+
parse_error "comment ending in `--->' is not allowed"
|
611
|
+
comm << s
|
612
|
+
break
|
613
|
+
end
|
614
|
+
parse_error "comment includes `--'"
|
615
|
+
comm << '--'
|
616
|
+
until /--/ =~ s # copy & paste for performance
|
617
|
+
comm << s
|
618
|
+
s = @src.get_plain
|
619
|
+
unless s then
|
620
|
+
parse_error "unterminated comment meets EOF"
|
621
|
+
return on_comment(comm)
|
622
|
+
end
|
623
|
+
end
|
624
|
+
comm << $`
|
625
|
+
end
|
626
|
+
on_comment comm
|
627
|
+
end
|
628
|
+
|
629
|
+
|
630
|
+
def scan_pi(s)
|
631
|
+
unless /\A<\?([^ \t\n\r?]+)(?:[ \t\n\r]+|(?=\?\z))/ =~ s then
|
632
|
+
parse_error "parse error at `<?'"
|
633
|
+
s << '>' if @src.close_tag
|
634
|
+
on_chardata s
|
635
|
+
else
|
636
|
+
target = $1
|
637
|
+
pi = $'
|
638
|
+
until pi[-1] == ?? and @src.close_tag
|
639
|
+
s = @src.get_plain
|
640
|
+
unless s then
|
641
|
+
parse_error "unterminated PI meets EOF"
|
642
|
+
return on_pi(target, pi)
|
643
|
+
end
|
644
|
+
pi << s
|
645
|
+
end
|
646
|
+
pi.chop! # remove last `?'
|
647
|
+
on_pi target, pi
|
648
|
+
end
|
649
|
+
end
|
650
|
+
|
651
|
+
|
652
|
+
CDATAPattern = opt_regexp('\]\]\z')
|
653
|
+
|
654
|
+
def scan_cdata(s)
|
655
|
+
cdata = s
|
656
|
+
re = CDATAPattern[@optkey]
|
657
|
+
until re =~ cdata and @src.close_tag
|
658
|
+
s = @src.get_plain
|
659
|
+
unless s then
|
660
|
+
parse_error "unterminated CDATA section meets EOF"
|
661
|
+
return on_cdata(cdata)
|
662
|
+
end
|
663
|
+
cdata << s
|
664
|
+
end
|
665
|
+
cdata.chop!.chop! # remove ']]'
|
666
|
+
on_cdata cdata
|
667
|
+
end
|
668
|
+
|
669
|
+
|
670
|
+
# Reports an end tag that was not closed with `>', stating whether it
# ran into another tag or into the end of input.
def found_unclosed_etag(name)
  message = if @src.tag_start?
    "unclosed end tag `#{name}' meets another tag"
  else
    "unclosed end tag `#{name}' meets EOF"
  end
  parse_error message
end
|
677
|
+
|
678
|
+
def found_empty_etag
|
679
|
+
parse_error "parse error at `</'"
|
680
|
+
on_chardata '</>'
|
681
|
+
end
|
682
|
+
|
683
|
+
|
684
|
+
def scan_etag(s)
|
685
|
+
s[0,2] = '' # remove '</'
|
686
|
+
if s.empty? then
|
687
|
+
if @src.close_tag then # </>
|
688
|
+
return found_empty_etag
|
689
|
+
else # </< or </[EOF]
|
690
|
+
parse_error "parse error at `</'"
|
691
|
+
s << '>' if @src.close_tag
|
692
|
+
return on_chardata('</' << s)
|
693
|
+
end
|
694
|
+
elsif /[ \t\n\r]+/ =~ s then
|
695
|
+
s1, s2 = $`, $'
|
696
|
+
if s1.empty? then # </ tag
|
697
|
+
parse_error "parse error at `</'"
|
698
|
+
s << '>' if @src.close_tag
|
699
|
+
return on_chardata('</' + s)
|
700
|
+
elsif not s2.empty? then # </ta g
|
701
|
+
parse_error "illegal whitespace is found within end tag `#{s1}'"
|
702
|
+
while @src.get_tag
|
703
|
+
end
|
704
|
+
end
|
705
|
+
s = s1
|
706
|
+
end
|
707
|
+
found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
|
708
|
+
on_etag s
|
709
|
+
end
|
710
|
+
|
711
|
+
|
712
|
+
def found_empty_stag
|
713
|
+
parse_error "parse error at `<'"
|
714
|
+
on_chardata '<>'
|
715
|
+
end
|
716
|
+
|
717
|
+
# Reports a start tag that was not closed with `>', stating whether it
# ran into another tag or into the end of input.
def found_unclosed_stag(name)
  message = if @src.tag_start?
    "unclosed start tag `#{name}' meets another tag"
  else
    "unclosed start tag `#{name}' meets EOF"
  end
  parse_error message
end
|
724
|
+
|
725
|
+
# Reports an empty-element tag that was not closed with `>', stating
# whether it ran into another tag or into the end of input.
def found_unclosed_emptyelem(name)
  message = if @src.tag_start?
    "unclosed empty element tag `#{name}' meets another tag"
  else
    "unclosed empty element tag `#{name}' meets EOF"
  end
  parse_error message
end
|
732
|
+
|
733
|
+
|
734
|
+
# Reports the unparsable token at the start of +s+ (text inside a start
# tag) and returns what is left to scan.
#
# The token is the first character when that is one of / = ' "; else
# it is everything before the first whitespace or / = ' " character;
# else it is all of +s+, in which case nil is returned.
def found_stag_error(s)
  if (m = /\A[\/='"]/.match(s))
    tok, rest = m[0], m.post_match
  elsif (m = /(?=[ \t\n\r\/='"])/.match(s))
    tok, rest = m.pre_match, m.post_match
  else
    tok, rest = s, nil
  end
  parse_error "parse error at `#{tok}'"
  rest
end
|
745
|
+
|
746
|
+
|
747
|
+
def scan_stag(s)
|
748
|
+
unless /(?=[\/ \t\n\r='"])/ =~ s then
|
749
|
+
name = s
|
750
|
+
name[0,1] = '' # remove `<'
|
751
|
+
if name.empty? then
|
752
|
+
if @src.close_tag then # <>
|
753
|
+
return found_empty_stag
|
754
|
+
else # << or <[EOF]
|
755
|
+
parse_error "parse error at `<'"
|
756
|
+
return on_chardata('<')
|
757
|
+
end
|
758
|
+
end
|
759
|
+
on_stag name
|
760
|
+
found_unclosed_stag name unless @src.close_tag
|
761
|
+
on_stag_end name
|
762
|
+
else
|
763
|
+
name = $`
|
764
|
+
s = $'
|
765
|
+
name[0,1] = '' # remove `<'
|
766
|
+
if name.empty? then # `< tag' or `<=`
|
767
|
+
parse_error "parse error at `<'"
|
768
|
+
s << '>' if @src.close_tag
|
769
|
+
return on_chardata('<' << s)
|
770
|
+
end
|
771
|
+
on_stag name
|
772
|
+
emptyelem = false
|
773
|
+
key,val,error,qmark,c = nil
|
774
|
+
begin
|
775
|
+
continue = false
|
776
|
+
s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
|
777
|
+
) { |key,val,error|
|
778
|
+
if key then # key="value"
|
779
|
+
on_attribute key
|
780
|
+
qmark = val.slice!(0,1)
|
781
|
+
if val[-1] == qmark[0] then
|
782
|
+
val.chop!
|
783
|
+
scan_attvalue val unless val.empty?
|
784
|
+
else
|
785
|
+
scan_attvalue val unless val.empty?
|
786
|
+
begin
|
787
|
+
s = @src.get
|
788
|
+
unless s then
|
789
|
+
parse_error "unterminated attribute `#{key}' meets EOF"
|
790
|
+
break
|
791
|
+
end
|
792
|
+
c = s[0]
|
793
|
+
val, s = s.split(qmark, 2)
|
794
|
+
if c == ?< then
|
795
|
+
wellformed_error "`<' is found in attribute `#{key}'"
|
796
|
+
elsif c != ?> then
|
797
|
+
scan_attvalue '>'
|
798
|
+
end
|
799
|
+
scan_attvalue val if c
|
800
|
+
end until s
|
801
|
+
continue = s # if eof then continue is false, else true.
|
802
|
+
end
|
803
|
+
on_attribute_end key
|
804
|
+
elsif error then
|
805
|
+
continue = s = found_stag_error(error)
|
806
|
+
else
|
807
|
+
emptyelem = true
|
808
|
+
end
|
809
|
+
}
|
810
|
+
end while continue
|
811
|
+
unless @src.close_tag then
|
812
|
+
if emptyelem then
|
813
|
+
found_unclosed_emptyelem name
|
814
|
+
else
|
815
|
+
found_unclosed_stag name
|
816
|
+
end
|
817
|
+
end
|
818
|
+
if emptyelem then
|
819
|
+
on_stag_end_empty name
|
820
|
+
else
|
821
|
+
on_stag_end name
|
822
|
+
end
|
823
|
+
end
|
824
|
+
end
|
825
|
+
|
826
|
+
|
827
|
+
def scan_bang_tag(s)
|
828
|
+
parse_error "parse error at `<!'"
|
829
|
+
s << '>' if @src.close_tag
|
830
|
+
on_chardata s
|
831
|
+
end
|
832
|
+
|
833
|
+
|
834
|
+
# Dispatches chunks to the specialised scanners until the input is
# exhausted. +s+ is the first chunk; later ones come from @src.get.
#
# A chunk that does not start with `<' is character data. Otherwise
# its second character selects the scanner: `/' end tag, `!' comment,
# CDATA section or other `<!' construct, `?' processing instruction,
# anything else a start tag.
def scan_content(s)
  src = @src    # for speed
  while s
    if s[0] != ?<
      scan_chardata s
    else
      c = s[1]
      if c == ?/
        scan_etag s
      elsif c == ??
        scan_pi s
      elsif c != ?!
        scan_stag s
      elsif s[2] == ?- && s[3] == ?-
        scan_comment s
      elsif /\A<!\[CDATA\[/ =~ s
        scan_cdata $'
      else
        scan_bang_tag s
      end
    end
    s = src.get
  end
end
|
859
|
+
|
860
|
+
|
861
|
+
# Appends following chunks to +str+ until one contains +qmark+.
#
# For every chunk taken from @src.get, the part before the first
# +qmark+ is appended to +str+, preceded by `>' unless the chunk starts
# with `<' or `>'. Returns the text after +qmark+, or nil when the
# input ends first.
def get_until_qmark(str, qmark)
  while true
    chunk = @src.get
    return chunk unless chunk
    lead = chunk[0]
    value, rest = chunk.split(qmark, 2)
    str << '>' unless lead == ?< || lead == ?>
    str << value if lead
    return rest if rest
  end
end
|
873
|
+
|
874
|
+
|
875
|
+
XMLDeclPattern = opt_regexp(%q{[ \t\n\r]([\-_\d\w]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|(\?\z)|([\-_.\d\w]+|[^ \t\n\r])})
|
876
|
+
|
877
|
+
# Scans the body of an XML declaration, i.e. the text following `<?xml'.
#
# s:: the remainder of the chunk after `<?xml'.
#
# Emits on_xmldecl, then on_xmldecl_key(key, value) for every
# pseudo-attribute found, then on_xmldecl_end.  Problems are reported
# through parse_error; scanning continues afterwards.
#
# +state+ tracks which pseudo-attributes have been seen so that their
# order can be checked:
#   0 = nothing yet, 1 = after `version', 2 = after `encoding',
#   3 = after `standalone'.
# A key that is not acceptable in the current state is reported and the
# state is advanced to 2 (if it was below 2) or 3.
def scan_xmldecl(s)
  endmark = nil
  state = 0
  on_xmldecl
  begin
    continue = false
    s.scan(XMLDeclPattern[@optkey]) { |key,val,endtok,error|
      if key then
        qmark = val.slice!(0,1)    # remove quotation marks
        if val[-1] == qmark[0] then
          val.chop!
        else
          # The value runs past this chunk: keep reading until the
          # closing quote; the text after it is scanned on the next pass.
          continue = s = get_until_qmark(val, qmark)
          unless s then
            parse_error "unterminated XML declaration meets EOF"
            endmark = true   # suppresses the duplicate EOF error below
          end
        end
        newstate = case state
                   when 0; key == 'version' ? 1 : 4
                   when 1; key == 'encoding' ? 2 : key == 'standalone' ? 3 : 4
                   else    key == 'standalone' ? 3 : 4
                   end
        state = if newstate == 4
                  known = %w{version encoding standalone}.member?(key)
                  parse_error(known ? "#{key} declaration must not be here" :
                              "unknown declaration `#{key}' in XML declaration")
                  state < 2 ? 2 : 3
                else
                  newstate
                end
        on_xmldecl_key key, val
      elsif endtok then
        if @src.close_tag then
          endmark = true
        else
          # Report the token that was actually found.  The message used to
          # interpolate `endmark', which is nil here, so it printed nothing.
          parse_error "unexpected `#{endtok}' found in XML declaration"
          endmark = nil
        end
        # here always exit the loop.
      else
        parse_error "parse error at `#{error}'"
      end
    }
  end while !endmark and (continue || (s = @src.get_plain))
  parse_error "unterminated XML declaration meets EOF" unless s or endmark
  parse_error "no declaration found in XML declaration" if state == 0
  on_xmldecl_end
end
|
925
|
+
|
926
|
+
|
927
|
+
# Tokens of interest while skipping an internal DTD subset.  The table is
# built by opt_regexp (not shown in this part of the file) and is indexed
# with @optkey where it is used.
# Capture group 1 holds one of: a single or double quote, `<!--' or `<?'
# at the start of the chunk, `--' or `?' at the end of the chunk.
# The uncaptured alternative matches a `]' followed only by whitespace at
# the end of the chunk.  (The trailing #' on the line below is a comment.)
SkipDTD = opt_regexp(%q{(['"]|\A<!--|\A<\?|--\z|\?\z)|\]\s*\z})  #'
|
928
|
+
|
929
|
+
# Skips over an internal DTD subset without interpreting it, reading
# further chunks from the source until the closing `]>' is found.
#
# s:: the chunk text following the `[' that opened the subset.
#
# Quoted literals, comments and processing instructions are tracked so
# that a `]' inside one of them does not end the subset.  Reports a parse
# error when the source runs out first.
def skip_internal_dtd(s)
  closer = nil        # token that ends the literal/comment/PI we are inside
  in_subset = true
  loop do
    s.scan(SkipDTD[@optkey]) do |tok,|
      if closer
        # One-character closers end the region immediately; longer ones
        # additionally require the source to be at a tag end.
        closer = nil if closer == tok && (closer.size == 1 || @src.tag_end?)
      elsif tok
        case tok
        when '<!--'   then closer = '--'
        when '<?'     then closer = '?'
        when '"', "'" then closer = tok
        end
      elsif @src.close_tag
        # Matched `]' at the end of the chunk and the tag is closed.
        in_subset = false
      end
    end
    break unless in_subset
    s = @src.get
    break unless s
  end
  parse_error "unterminated internal DTD subset meets EOF" unless s
end
|
951
|
+
|
952
|
+
|
953
|
+
# Called when a DOCTYPE declaration contains an internal DTD subset.
# The subset is not interpreted: a warning is issued and the text is
# skipped.
#
# s:: the chunk text following the `[' that opened the subset.
def scan_internal_dtd(s)
  warning("internal DTD subset is not supported")
  skip_internal_dtd(s)
end
|
957
|
+
|
958
|
+
|
959
|
+
# Called by scan_doctype when the name following the root element name is
# neither `PUBLIC' nor `SYSTEM'.
#
# pubsys:: the name that was found (not used here).
#
# Reports a parse error and returns the keyword to assume instead,
# which is always 'SYSTEM'.
def found_invalid_pubsys(pubsys)
  parse_error("`PUBLIC' or `SYSTEM' should be here")
  'SYSTEM'
end
|
963
|
+
|
964
|
+
|
965
|
+
# Tokenizer for the inside of a DOCTYPE declaration, built by opt_regexp
# (not shown in this part of the file); scan_doctype looks up the regexp
# it uses from this table.
# Capture groups, as consumed by scan_doctype:
#   1. a name preceded by whitespace (root element name, PUBLIC, SYSTEM)
#   2. a quoted literal preceded by whitespace, quotes included; the
#      closing quote is optional, so a literal cut off by the end of the
#      chunk still matches
#   3. any other token; `[' opens the internal DTD subset, anything else
#      is reported as a parse error
# (The trailing #" on the line below is a comment.)
DoctypePattern = opt_regexp(%q{[ \t\n\r](?:([^ \t\n\r\/'"=\[]+)|('[^']*'?|"[^"]*"?))|([\-_.\d\w]+|[^ \t\n\r])})  #"
|
966
|
+
|
967
|
+
# Scans a DOCTYPE declaration.
#
# s:: the remainder of the chunk after `<!DOCTYPE'.
#
# Collects the root element name, the PUBLIC/SYSTEM keyword and up to two
# quoted external ID literals, then calls on_doctype(root, pubid, sysid).
# If a `[' is met, the rest is handed to scan_internal_dtd afterwards.
# Problems are reported through parse_error; scanning continues.
def scan_doctype(s)
  root = syspub = sysid = pubid = nil
  internal_dtd = false
  # Indexed with @optkey, like the other opt_regexp-built tables in this
  # file (XMLDeclPattern, SkipDTD); this lookup previously used @opt.
  re = DoctypePattern[@optkey]
  begin
    if re =~ s then
      name, str, delim, s = $1, $2, $3, $'
      if name then
        if not root then
          root = name
        elsif not syspub then
          unless name == 'PUBLIC' or name == 'SYSTEM' then
            name = found_invalid_pubsys(name)
          end
          syspub = name
        else
          parse_error "parse error at `#{name}'"
        end
      elsif str then
        qmark = str.slice!(0,1)    # remove quotation marks
        unless syspub then
          # A literal before PUBLIC/SYSTEM: report the quote and rescan
          # the rest of the literal as ordinary tokens.
          parse_error "parse error at `#{qmark}'"
          s = str << s
        else
          if str[-1] == qmark[0] then
            str.chop!
          else
            # Literal continues in later chunks; '' forces a refill below
            # when the source is exhausted.
            s = get_until_qmark(str, qmark) || ''
          end
          if not sysid then
            sysid = str
          elsif not pubid and syspub == 'PUBLIC' then
            # Second literal after PUBLIC: the first one was the public ID.
            pubid = sysid
            sysid = str
          else
            parse_error "too many external ID literals in DOCTYPE"
          end
        end
      elsif delim == '[' then
        internal_dtd = true
        break
      else
        parse_error "parse error at `#{delim}'"
      end
    else
      s = ''
    end
    if s.empty? then
      break if @src.close_tag
      s = @src.get_plain
    end
  end while s
  parse_error "unterminated DOCTYPE declaration meets EOF" unless s
  unless root then
    parse_error "no root element is specified in DOCTYPE"
  end
  if syspub and not sysid then
    parse_error "too few external ID literals in DOCTYPE"
  end
  if syspub == 'PUBLIC' and not pubid then
    # Only one literal followed PUBLIC: treat it as the public ID.
    pubid, sysid = sysid, nil
  end
  on_doctype root, pubid, sysid
  scan_internal_dtd s if internal_dtd
end
|
1032
|
+
|
1033
|
+
|
1034
|
+
# Scans the document prolog and then continues with the content.
#
# s:: first chunk of the document (may be nil).
#
# An XML declaration is accepted only as the very first chunk.  After it,
# comments, processing instructions, whitespace and at most one DOCTYPE
# declaration are consumed.  The first chunk that is none of these ends
# the prolog and is passed to scan_content.
def scan_prolog(s)
  if /\A<\?xml(?=[ \t\n\r])/ =~ s
    scan_xmldecl $~.post_match
    s = @src.get
  end
  source = @src             # kept in a local; the original notes "for speed"
  doctype_allowed = true    # cleared once a DOCTYPE has been scanned
  while s
    if s[0] == ?<
      case s[1]
      when ?!
        if s[2] == ?- && s[3] == ?-
          scan_comment s
        elsif /\A<!DOCTYPE(?=[ \t\n\r])/ =~ s && doctype_allowed
          doctype_allowed = false
          scan_doctype $~.post_match
        else
          break
        end
      when ??
        scan_pi s
      else
        break
      end
      s = source.get
    elsif /[^ \t\r\n]/ !~ s
      # Chunk consists of whitespace only (or is empty).
      on_prolog_space s unless s.empty?
      s = source.get_plain
    else
      break
    end
  end
  scan_content(s || source.get)
end
|
1067
|
+
|
1068
|
+
|
1069
|
+
# Scans one whole document from the current source: signals the start of
# the document, prepares the source, scans the prolog (which goes on to
# scan the content) and finally signals the end of the document.
def scan_document
  on_start_document
  @src.prepare
  first_chunk = @src.get
  scan_prolog(first_chunk)
  on_end_document
end
|
1075
|
+
|
1076
|
+
|
1077
|
+
# Wraps the caller-supplied input in a Source object for the scanner.
#
# src:: the input handed to parse_document.
def make_source(src)
  Source.new(src)
end
|
1080
|
+
|
1081
|
+
|
1082
|
+
public
|
1083
|
+
|
1084
|
+
# Parses +src+ as one complete document.
#
# src:: the input; it is wrapped by make_source before scanning.
#
# Returns self.  The source reference is dropped again when scanning
# finishes, whether it completed normally or raised.
def parse_document(src)
  @src = make_source(src)
  begin
    scan_document
    self
  ensure
    @src = nil
  end
end

# +parse+ is another name for parse_document.
alias :parse :parse_document
|
1095
|
+
|
1096
|
+
end
|
1097
|
+
|
1098
|
+
|
1099
|
+
end
|
1100
|
+
|
1101
|
+
|
1102
|
+
|
1103
|
+
|
1104
|
+
|
1105
|
+
# Stand-alone driver, run only when this file is executed directly.
# It scans the files named on the command line (or standard input) and
# prints the user CPU time spent to standard error.  Errors are printed
# only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    # $s is the scanner created below; it supplies the path and line number.
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end   # give ARGF a `path' method returning its filename
  # Process.times replaces Time.times, which was removed in Ruby 1.9.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
|