xmlscan 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/THANKS ADDED
@@ -0,0 +1,11 @@
1
+ $Id: THANKS,v 1.4.2.1 2003/02/15 13:22:01 katsu Exp $
2
+
3
+ Thanks to all of the following for their valuable hints, fixes,
4
+ discussions, and contributions:
5
+
6
+ Yoshida Masato <yoshidam@yoshidam.net>
7
+ TAKAHASHI Masayoshi <maki@inac.co.jp>
8
+ NAKAMURA, Hiroshi <nakahiro@sarion.co.jp>
9
+ James Britt <james@jamesbritt.com>
10
+ Takaaki Tateishi <ttate@kt.jaist.ac.jp>
11
+ Tanaka Akira <akr@m17n.org>
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.3
data/install.rb ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # install.rb
4
+ #
5
+ # $Id: install.rb,v 1.2 2002/12/26 21:09:38 katsu Exp $
6
+
7
require 'rbconfig'
begin
  require 'ftools'   # only exists on Ruby 1.8; absent from the standard library since 1.9
rescue LoadError
end
require 'fileutils'
require 'find'
require 'getoptlong'
11
+
12
# Default installation root: Ruby's site library directory.
# RbConfig is used because the old top-level Config constant has been
# removed from current Ruby versions.
DEFAULT_DESTDIR = RbConfig::CONFIG['sitelibdir'] || RbConfig::CONFIG['sitedir']
# Directory holding this script; source paths are resolved relative to it.
SRCDIR = File.dirname(__FILE__)
14
+
15
+
16
# Copies every regular file ending in ".rb" found under SRCDIR/+from+ into
# the directory +to+, keeping the relative directory layout.
#
# from:: source directory, relative to SRCDIR
# to::   destination root directory
def install_rb(from, to)
  from = SRCDIR + '/' + from
  Find.find(from) { |src|
    next unless File.file? src
    next unless /\.rb\z/ =~ src
    # Replace the leading source prefix with the destination root.
    dst = src.sub(/\A#{Regexp.escape(from)}/, to)
    # FileUtils stands in for File.makedirs / File.install, which came from
    # the 'ftools' library that current Ruby versions no longer ship.
    FileUtils.mkdir_p File.dirname(dst), :verbose => true
    FileUtils.install src, dst, :mode => 0644, :verbose => true
  }
end
26
+
27
+
28
# Command line handling: -d / --destdir DIR overrides the install root.
# Any option-parsing error terminates the script with exit status 2.
destdir = DEFAULT_DESTDIR
begin
  options = GetoptLong.new([ "-d", "--destdir", GetoptLong::REQUIRED_ARGUMENT ])
  options.each_option do |flag, value|
    destdir = value if flag == '-d'
  end
rescue
  exit 2
end

install_rb "lib", destdir
@@ -0,0 +1,290 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/htmlscan.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: htmlscan.rb,v 1.16.2.2 2003/05/01 15:43:23 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/scanner'
11
+
12
+
13
+ module XMLScan
14
+
15
+ class HTMLScanner < XMLScanner
16
+
17
+ private
18
+
19
# Well-formedness error hook, overridden here as a no-op so the message is
# simply discarded.
def wellformed_error(msg)
  # All well-formedness errors raised by XMLScanner are ignored.
  # XMLScanner only raises a well-formedness error in scan_stag, which is a
  # method completely overridden by HTMLScanner, so this method is
  # never called in fact.
end
25
+
26
# The handlers below belong to XML-only constructs (the XML declaration and
# empty-element tags). Each one raises a RuntimeError flagging an internal
# bug if it is ever invoked.

def on_xmldecl
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_version(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_encoding(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_standalone(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_other(name, value)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_end
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_stag_end_empty(name)
  raise RuntimeError, "[BUG] this method must be never called"
end
53
+
54
+
55
+ private
56
+
57
# Scans a comment declaration whose first chunk +s+ starts with `<!--' and
# reports the collected text through on_comment.
#
# A `--' pair does not necessarily finish the declaration: after it the
# scanner accepts optional whitespace followed by the end of the tag, or a
# further `--' that starts another comment part. Anything else is reported
# with parse_error and scanning carries on. Whenever the current chunk holds
# no `--', more text is fetched with @src.get_plain; a nil result is treated
# as EOF, which is reported before the partial comment is emitted.
#
# NOTE(review): @src.close_tag is presumably true when the current chunk was
# terminated by `>' -- confirm against the source class.
def scan_comment(s)
  s[0,4] = ''  # remove `<!--'
  comm = ''
  # Accumulate chunks until one of them contains `--'.
  until /--/n =~ s
    comm << s
    s = @src.get_plain
    unless s then
      parse_error "unterminated comment meets EOF"
      return on_comment(comm)
    end
  end
  comm << $`
  s = $'
  # Parsed as (s.empty? || s.strip.empty?) and @src.close_tag: the loop ends
  # once only whitespace follows `--' and the tag is closed.
  until s.empty? || s.strip.empty? and @src.close_tag   # --> or -- >
    comm << '--'
    if /\A\s*--/n =~ s then   # <!--hoge-- --
      comm << $&
      s = $'
      if s.empty? and @src.close_tag then   # <!--hoge-- -->
        parse_error "`-->' is found but comment must not end here"
        # Drop the `--' that was just appended.
        comm.chop!.chop!
        break
      end
    else  # <!--hoge-- fuga
      parse_error "only whitespace can appear between two comments"
    end
    if /\A-\s*\z/n =~ s and @src.close_tag then   # <!--hoge--->
      parse_error "`-->' is found but comment must not end here"
      # Drop one of the two `-' appended above.
      comm.chop!
      break
    end
    until /--/n =~ s    # copy & paste for performance
      comm << s
      s = @src.get_plain
      unless s then
        parse_error "unterminated comment meets EOF"
        return on_comment(comm)
      end
    end
    comm << $`
    s = $'
  end
  on_comment comm
end
101
+
102
+
103
# Keep the inherited scanner reachable before it is redefined below.
alias scan_xml_pi scan_pi    # PIO "<?" PIC "?>"  -- <? PI ?> --


# Scans an SGML-style processing instruction (`<?PI >'). The leading `<?' is
# removed from +s+, further chunks are appended until @src.close_tag is
# true, and the result is reported with an empty target via on_pi.
# A missing chunk (EOF) is reported with parse_error and ends the scan.
def scan_pi(s)
  s[0, 2] = ''                     # strip the leading `<?'
  content = s
  until @src.close_tag
    if (piece = @src.get_plain)
      content << piece
    else
      parse_error "unterminated PI meets EOF"
      break
    end
  end
  on_pi '', content
end
119
+
120
+
121
# Scans a start tag. +s+ is the chunk beginning with `<'. Attribute events
# (on_attribute, on_attr_value / scan_attvalue, on_attribute_end) are
# emitted between on_stag and on_stag_end.
#
# Malformed input is reported through parse_error and then handled
# leniently: a bare `<' is emitted as character data, unquoted and
# value-only attributes are accepted, and a tag for which @src.close_tag is
# false is handed to found_unclosed_stag.
def scan_stag(s)
  # No `/', whitespace, `=' or quote in the chunk: it holds a tag name only.
  unless /(?=[\/\s='"])/n =~ s then
    name = s
    name[0,1] = ''        # remove `<'
    if name.empty? then   # <> or <<
      if @src.close_tag then
        return found_empty_stag
      else
        parse_error "parse error at `<'"
        return on_chardata('<')
      end
    end
    on_stag name
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  else
    # name is the text before the first delimiter; s keeps the rest.
    name = $`
    s = $'
    name[0,1] = ''        # remove `<'
    if name.empty? then   # `< tag' or `<=`
      parse_error "parse error at `<'"
      if @src.close_tag then
        s << '>'
      end
      return on_chardata('<' << s)
    end
    on_stag name
    # The attribute scan is repeated while `continue' holds text: that is
    # what remains after a quoted value that spilled over into later chunks.
    begin
      continue = false
      # Captures: key, quoted value (closing quote optional), unquoted
      # value, or a single stray non-space character.
      s.scan(
        /([^\s=\/'"]+)(?:\s*=\s*(?:('[^']*'?|"[^"]*"?)|([^\s='"]+)))?|(\S)/n
      ) { |key,val,val2,error|
        if key then
          if val then   # key="value"
            on_attribute key
            # Strip and remember the opening quote character.
            qmark = val.slice!(0,1)
            if val[-1] == qmark[0] then
              val.chop!
              scan_attvalue val unless val.empty?
            else
              # The closing quote is not in this chunk: keep reading chunks
              # from @src until one of them contains it.
              scan_attvalue val unless val.empty?
              begin
                s = @src.get
                unless s then
                  parse_error "unterminated attribute `#{key}' meets EOF"
                  break
                end
                c = s[0]
                val, s = s.split(qmark, 2)
                # NOTE(review): presumably restores the `>' that ended the
                # previous chunk inside the value -- confirm.
                scan_attvalue '>' unless c == ?< or c == ?>
                scan_attvalue val if c
              end until s
              continue = s
            end
            on_attribute_end key
          elsif val2 then   # key=value
            on_attribute key
            on_attr_value val2
            on_attribute_end key
          else   # value
            on_attribute nil
            on_attr_value key
            on_attribute_end nil
          end
        else
          parse_error "parse error at `#{error}'"
        end
      }
    end while continue
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  end
end
194
+
195
+
196
+ # This method should be called only from on_stag_end.
197
# Returns the raw text up to (not including) the next chunk that starts
# with `</', or up to the end of input. Returns an empty string when the
# very next chunk already starts with `</' or there is no more input.
def get_cdata_content
  # True when input is exhausted or the upcoming chunk begins with `</'.
  at_boundary = lambda {
    upcoming = @src.test
    !upcoming || (upcoming[0] == ?< && upcoming[1] == ?/)
  }
  return '' if at_boundary.call
  content = @src.get
  content << @src.get_plain until at_boundary.call
  content
end
public :get_cdata_content
209
+
210
+
211
# Handles a `<!' construct that is neither a comment nor a doctype.
# `<!>' is reported as an empty comment; anything else is a parse error
# and is skipped chunk by chunk until the tag is closed or input ends.
def scan_bang_tag(s)
  return on_comment('') if s == '<!' && @src.close_tag    # <!>
  parse_error "parse error at `<!'"
  s = @src.get_plain while s && !@src.close_tag           # skip entire
end
221
+
222
+
223
# An internal DTD subset is not allowed here: report it, then skip it.
def scan_internal_dtd(s)
  parse_error("DTD subset is found but it is not permitted in HTML")
  skip_internal_dtd(s)
end
227
+
228
+
229
# Accepts PUBLIC / SYSTEM keywords written in any letter case and returns
# the upper-cased keyword; every other value is passed on to the
# superclass implementation.
def found_invalid_pubsys(pubsys)
  keyword = pubsys.upcase
  case keyword
  when 'PUBLIC', 'SYSTEM' then keyword
  else super
  end
end
234
+
235
+
236
# Scans the document prolog starting with chunk +s+: comments, doctype
# declarations (matched case-insensitively), processing instructions and
# whitespace are consumed; the first chunk of any other kind ends the
# prolog and is passed to scan_content. A second doctype declaration is
# reported with parse_error but still scanned.
def scan_prolog(s)
  doctypes_seen = 0
  while s
    if s[0] == ?<
      case s[1]
      when ?!
        if s[2] == ?- && s[3] == ?-
          scan_comment s
        elsif /\A<!doctype(?=\s)/in =~ s
          after_keyword = $'
          doctypes_seen += 1
          parse_error "another document type declaration is found" if doctypes_seen > 1
          scan_doctype after_keyword
        else
          break
        end
      when ??
        scan_pi s
      else
        break
      end
    elsif s.strip.empty?
      on_prolog_space s
    else
      break
    end
    s = @src.get
  end
  # When the loop ran out of chunks, fetch once more for scan_content.
  scan_content(s || @src.get)
end
266
+
267
+ end
268
+
269
+ end
270
+
271
+
272
+
273
+
274
+
275
# Stand-alone driver: parses ARGF with HTMLScanner and prints the consumed
# user CPU time to STDERR. Parse errors are printed only in verbose mode.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::HTMLScanner.new(TestVisitor.new)
  src = ARGF
  # ARGF has no #path; the error printer above expects one.
  def src.path; filename; end
  # Process.times replaces Time.times, which current Ruby no longer provides.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
@@ -0,0 +1,353 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/namespace.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: namespace.rb,v 1.13 2003/01/22 13:06:18 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/parser'
11
+
12
+
13
+ module XMLScan
14
+
15
# Namespace-level counterpart of ParseError.
class NSParseError < ParseError
end

# Namespace-level counterpart of NotWellFormedError.
class NSNotWellFormedError < NotWellFormedError
end

# Namespace-level counterpart of NotValidError.
class NSNotValidError < NotValidError
end
18
+
19
+
20
# Visitor interface extended with namespace-aware callbacks and
# namespace-specific error hooks. The error hooks raise by default; the
# callbacks do nothing by default.
module NSVisitor

  include Visitor

  # Raises NSParseError with +msg+.
  def ns_parse_error(msg)
    raise NSParseError, msg
  end

  # Raises NSNotWellFormedError with +msg+.
  def ns_wellformed_error(msg)
    raise NSNotWellFormedError, msg
  end

  # Raises NSNotValidError with +msg+.
  def ns_valid_error(msg)
    raise NSNotValidError, msg
  end

  #
  #   <foo:bar hoge:fuga=''  hoge=''  >
  #   <foo     hoge:fuga=''  hoge=''  >
  #   ^        ^            ^ ^      ^ ^
  #   1        2            3 4      5 6
  #
  #  When the parser reaches each of the points marked above, the
  #  following method is called with the following arguments:
  #
  #    1: on_stag_ns ('foo:bar', 'foo', 'bar')
  #         or
  #       on_stag_ns ('foo', '', 'foo')
  #    2: on_attribute_ns ('hoge:fuga', 'hoge', 'fuga')
  #    3: on_attribute_end ('hoge:fuga')
  #    4: on_attribute_ns ('hoge', nil, 'hoge')
  #    5: on_attribute_end ('hoge')
  #    6: on_stag_end_ns ('foo:bar', { 'foo' => '', ... })
  #         or
  #       on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
  #

  # Start tag name, split into prefix and local part.
  def on_stag_ns(qname, prefix, localpart)
  end

  # Attribute name, split into prefix and local part.
  def on_attribute_ns(qname, prefix, localpart)
  end

  # End of a start tag, together with the namespace table.
  def on_stag_end_ns(qname, namespaces)
  end

  # End of an empty-element tag, together with the namespace table.
  def on_stag_end_empty_ns(qname, namespaces)
  end

end
70
+
71
+
72
+
73
+
74
+ class XMLNamespaceDecoration < Decoration
75
+
76
# Load-time check that Hash#key? still reports a key after its value has
# been set to nil.
lambda {
  probe = { 'foo' => true }
  probe['foo'] = nil
  raise "requires Ruby-1.6 or above" unless probe.key?('foo')
}.call

# Prefixes with a fixed namespace name.
PredefinedNamespace = {
  'xml'   => 'http://www.w3.org/XML/1998/namespace',
  'xmlns' => 'http://www.w3.org/2000/xmlns/',
}

# Reverse lookup: namespace name => the prefix it is fixed to.
ReservedNamespace = PredefinedNamespace.invert
87
+
88
+
89
# The three namespace error hooks always report to the visitor saved in
# @orig_visitor, whatever @visitor currently points at.

def ns_parse_error(msg)
  @orig_visitor.ns_parse_error(msg)
end

def ns_wellformed_error(msg)
  @orig_visitor.ns_wellformed_error(msg)
end

def ns_valid_error(msg)
  @orig_visitor.ns_valid_error(msg)
end
100
+
101
+
102
# Resets all per-document namespace state, remembers the wrapped visitor
# and forwards the event to it.
def on_start_document
  @orig_visitor  = @visitor
  @xmlns         = NamespaceDeclaration.new(self)
  @namespace     = {}   # prefix => namespace name (was: PredefinedNamespace.dup)
  @ns_hist       = []   # one entry per open element, used to restore bindings
  @ns_undeclared = {}   # prefixes used but not declared so far
  @prev_prefix   = {}   # localpart => prefix, for the doubled-attribute check
  @dont_same     = []   # [prefix, prefix, localpart] triples for the same check
  @visitor.on_start_document
end
112
+
113
+
114
# Handles a start tag name: opens a new slot in the namespace history and
# forwards the name, split at its first `:', to the visitor's on_stag_ns.
def on_stag(name)
  @ns_hist.push nil
  colon = /:/n.match(name)
  # Unprefixed name: the prefix is reported as an empty string.
  return @visitor.on_stag_ns(name, '', name) unless colon

  prefix = colon.pre_match
  localpart = colon.post_match
  ns_parse_error "localpart `#{localpart}' includes `:'" if localpart.include? ?:
  if prefix == 'xmlns'
    ns_wellformed_error \
      "prefix `xmlns' is not used for namespace prefix declaration"
  end
  unless @namespace.key? prefix
    predefined = PredefinedNamespace[prefix]
    if predefined
      @namespace[prefix] = predefined
    else
      # Unknown so far; fix_namespace reports it unless it gets declared.
      @ns_undeclared[prefix] = true
    end
  end
  @visitor.on_stag_ns name, prefix, localpart
end
137
+
138
+
139
# Handles an attribute name. `xmlns' and `xmlns:*' attributes redirect the
# following value events to the NamespaceDeclaration helper; every other
# attribute is forwarded to the visitor's on_attribute_ns, split at its
# first `:' (the prefix is nil when there is none).
def on_attribute(name)
  colon = /:/n.match(name)
  unless colon
    if name == 'xmlns'
      # Default namespace declaration.
      @visitor = @xmlns
      return @xmlns.on_xmlns_start('')
    end
    return @visitor.on_attribute_ns(name, nil, name)
  end

  prefix = colon.pre_match
  localpart = colon.post_match
  ns_parse_error "localpart `#{localpart}' includes `:'" if localpart.include? ?:
  unless @namespace.key? prefix
    predefined = PredefinedNamespace[prefix]
    if predefined
      @namespace[prefix] = predefined
    else
      @ns_undeclared[prefix] = true
    end
  end
  if prefix == 'xmlns'
    @visitor = @xmlns
    @xmlns.on_xmlns_start localpart
  else
    # Remember attributes sharing a local part; they are compared once all
    # namespace bindings of the tag are known.
    earlier = @prev_prefix[localpart]
    @dont_same.push [ earlier, prefix, localpart ] if earlier
    @prev_prefix[localpart] = prefix
    @visitor.on_attribute_ns name, prefix, localpart
  end
end
169
+
170
+
171
# Temporary visitor that collects the value of one xmlns attribute and
# hands the finished declaration back to its parent decoration.
class NamespaceDeclaration

  include XMLScan::Visitor

  # parent:: object receiving on_xmlns_end and error reports
  def initialize(parent)
    @parent = parent
  end

  # Begins collecting the namespace name declared for +prefix+.
  def on_xmlns_start(prefix)
    @prefix = prefix
    @nsdecl = ''
  end

  # Literal part of the attribute value.
  def on_attr_value(str)
    @nsdecl.concat(str)
  end

  # Entity references inside an xmlns value are reported as an error.
  def on_attr_entityref(ref)
    @parent.ns_wellformed_error("xmlns includes undeclared entity reference")
  end

  # Character reference: appended as the UTF-8 encoding of +code+.
  def on_attr_charref(code)
    @nsdecl.concat([code].pack('U'))
  end

  # Hexadecimal character references are handled exactly the same way.
  alias on_attr_charref_hex on_attr_charref

  # End of the attribute: deliver prefix and collected namespace name.
  def on_attribute_end(name)
    @parent.on_xmlns_end(@prefix, @nsdecl)
  end

end
206
+
207
+
208
# Called when an xmlns attribute is complete. Restores the real visitor,
# checks the declaration against the predefined / reserved namespaces and
# records the new binding, saving the previous one in the history slot of
# the current element.
def on_xmlns_end(prefix, uri)
  @visitor = @orig_visitor

  if PredefinedNamespace.key? prefix
    fixed = PredefinedNamespace[prefix]
    if prefix == 'xmlns'
      ns_wellformed_error \
        "prefix `xmlns' can't be bound to any namespace explicitly"
    elsif fixed != uri
      ns_wellformed_error \
        "prefix `#{prefix}' can't be bound to any namespace except `#{fixed}'"
    end
  end

  if uri.empty?
    if prefix.empty?
      uri = nil            # xmlns='' : the default namespace is bound to nil
    else
      ns_parse_error "`#{prefix}' is bound to empty namespace name"
    end
  elsif ReservedNamespace.key? uri
    owner = ReservedNamespace[uri]
    unless owner == prefix
      ns_wellformed_error \
        "namespace `#{uri}' is reserved for prefix `#{owner}'"
    end
  end

  # Create the history hash for this element on first use.
  saved = @ns_hist.last || (@ns_hist[-1] = {})
  saved[prefix] = @namespace[prefix]
  @namespace[prefix] = uri
  @ns_undeclared.delete prefix
end
235
+
236
+
237
# Runs the checks that need the complete set of namespace declarations of
# a tag, then clears the per-tag bookkeeping:
#
# * every prefix still listed in @ns_undeclared is reported as undeclared;
# * every pair of attributes with the same local part whose prefixes map
#   to the same namespace is reported as doubled.
#
# Both reports go through ns_wellformed_error, the same path every other
# namespace error in this class uses, rather than through @visitor directly.
def fix_namespace
  unless @ns_undeclared.empty? then
    @ns_undeclared.each_key { |prefix|
      ns_wellformed_error "prefix `#{prefix}' is not declared"
    }
    @ns_undeclared.clear
  end
  unless @dont_same.empty? then
    @dont_same.each { |n1,n2,l|
      if @namespace[n1] == @namespace[n2] then
        ns_wellformed_error \
          "doubled localpart `#{l}' in the same namespace"
      end
    }
    @dont_same.clear
  end
  @prev_prefix.clear
end
255
+
256
+
257
# End of a start tag: run the deferred namespace checks, then forward the
# event together with the current namespace table.
def on_stag_end(name)
  fix_namespace
  @visitor.on_stag_end_ns(name, @namespace)
end


# End tag: restore the bindings that the matching start tag replaced,
# if it declared any, then forward the event.
def on_etag(name)
  replaced = @ns_hist.pop
  @namespace.update(replaced) if replaced
  @visitor.on_etag(name)
end


# End of an empty-element tag: same as on_stag_end, followed at once by
# the binding restoration that on_etag performs.
def on_stag_end_empty(name)
  fix_namespace
  @visitor.on_stag_end_empty_ns(name, @namespace)
  replaced = @ns_hist.pop
  @namespace.update(replaced) if replaced
end
274
+
275
+
276
# Reports a root name containing more than one `:', then forwards the
# doctype event unchanged.
def on_doctype(root, pubid, sysid)
  ns_parse_error "qualified name `#{root}' includes `:'" if root.count(':') > 1
  @visitor.on_doctype(root, pubid, sysid)
end
282
+
283
+
284
# Reports a PI target containing `:', then forwards the event unchanged.
def on_pi(target, pi)
  ns_parse_error "PI target `#{target}' includes `:'" if target.include? ?:
  @visitor.on_pi(target, pi)
end
290
+
291
+
292
# Reports an entity name containing `:', then forwards the event unchanged.
def on_entityref(ref)
  ns_parse_error "entity reference `#{ref}' includes `:'" if ref.include? ?:
  @visitor.on_entityref(ref)
end
298
+
299
+
300
# Same check as on_entityref, for entity references inside attribute values.
def on_attr_entityref(ref)
  ns_parse_error "entity reference `#{ref}' includes `:'" if ref.include? ?:
  @visitor.on_attr_entityref(ref)
end
306
+
307
+ end
308
+
309
+
310
+
311
# XMLParser whose visitor is wrapped in an XMLNamespaceDecoration, so that
# the visitor receives the namespace-aware events.
class XMLParserNS < XMLParser

  # Accepts the same arguments as XMLParser#initialize.
  def initialize(*)
    super
    @decoration = XMLNamespaceDecoration.new(@visitor)
    @visitor = @decoration
  end

end
319
+
320
+ end
321
+
322
+
323
+
324
+
325
+
326
+ if $0 == __FILE__ then
327
+ class TestVisitor
328
+ include XMLScan::NSVisitor
329
+ def parse_error(msg)
330
+ STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
331
+ end
332
+ def wellformed_error(msg)
333
+ STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
334
+ end
335
+ def warning(msg)
336
+ STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
337
+ end
338
+ def ns_parse_error(msg)
339
+ STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
340
+ end
341
+ def ns_wellformed_error(msg)
342
+ STDERR.printf("%s:%d: NSC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
343
+ end
344
+ end
345
+
346
+ $s = scan = XMLScan::XMLParserNS.new(TestVisitor.new)
347
+ src = ARGF
348
+ def src.path; filename; end
349
+ t1 = Time.times.utime
350
+ scan.parse src
351
+ t2 = Time.times.utime
352
+ STDERR.printf "%2.3f sec\n", t2 - t1
353
+ end