xmlscan 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/THANKS ADDED
@@ -0,0 +1,11 @@
1
+ $Id: THANKS,v 1.4.2.1 2003/02/15 13:22:01 katsu Exp $
2
+
3
+ Thanks to all of the following for their valuable hints, fixes,
4
+ discussions, and contributions:
5
+
6
+ Yoshida Masato <yoshidam@yoshidam.net>
7
+ TAKAHASHI Masayoshi <maki@inac.co.jp>
8
+ NAKAMURA, Hiroshi <nakahiro@sarion.co.jp>
9
+ James Britt <james@jamesbritt.com>
10
+ Takaaki Tateishi <ttate@kt.jaist.ac.jp>
11
+ Tanaka Akira <akr@m17n.org>
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.3
data/install.rb ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # install.rb
4
+ #
5
+ # $Id: install.rb,v 1.2 2002/12/26 21:09:38 katsu Exp $
6
+
7
require 'rbconfig'
begin
  require 'ftools'    # Ruby 1.8 only; the library was dropped in Ruby 1.9
rescue LoadError
  # Newer interpreters use fileutils (required below) instead.
end
require 'fileutils'
require 'find'
require 'getoptlong'
11
+
12
# Default installation target: the interpreter's site library directory,
# falling back to the bare site directory when 'sitelibdir' is not set.
# RbConfig is the module that 'rbconfig' defines; the legacy `Config' alias
# is no longer available on current Ruby versions.
DEFAULT_DESTDIR = RbConfig::CONFIG['sitelibdir'] || RbConfig::CONFIG['sitedir']

# Directory that contains this installer; sources are located relative to it.
SRCDIR = File.dirname(__FILE__)
14
+
15
+
16
# Copies every `.rb' file found below SRCDIR/<from> into the directory +to+,
# recreating the sub-directory layout and installing files with mode 0644.
#
# from - source directory, relative to SRCDIR.
# to   - destination directory.
#
# FileUtils replaces the File.makedirs / File.install helpers of the old
# `ftools' library.  The destination path is built by plain concatenation
# rather than String#sub with a replacement string, because sub would treat
# backslashes in +to+ (e.g. a Windows path) as back-references.
def install_rb(from, to)
  src_root = File.join(SRCDIR, from)
  Find.find(src_root) do |src|
    next unless File.file?(src)
    next unless /\.rb\z/ =~ src
    # Every path yielded by Find.find starts with src_root.
    dst = to + src[src_root.length..-1]
    FileUtils.mkdir_p File.dirname(dst), verbose: true
    FileUtils.install src, dst, mode: 0o644, verbose: true
  end
end
26
+
27
+
28
# Command-line entry point: `-d DIR' / `--destdir DIR' overrides the default
# destination; any failure while reading the options ends the script with
# exit status 2.
destdir = DEFAULT_DESTDIR
begin
  options = GetoptLong.new([ "-d", "--destdir", GetoptLong::REQUIRED_ARGUMENT ])
  options.each_option do |opt, arg|
    destdir = arg if opt == '-d'
  end
rescue
  exit 2
end

install_rb "lib", destdir
@@ -0,0 +1,290 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/htmlscan.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: htmlscan.rb,v 1.16.2.2 2003/05/01 15:43:23 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/scanner'
11
+
12
+
13
+ module XMLScan
14
+
15
+ class HTMLScanner < XMLScanner
16
+
17
+ private
18
+
19
# Intentionally does nothing: well-formedness errors reported by the base
# XMLScanner are ignored here.  Per the original author's note, XMLScanner
# raises such errors only from scan_stag, which HTMLScanner overrides
# completely, so this hook is not expected to be reached.
def wellformed_error(msg)
  nil
end
25
+
26
# The hooks below must never fire in this class; each one raises a
# RuntimeError so that an unexpected call is reported as an internal bug.

def on_xmldecl
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_version(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_encoding(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_standalone(str)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_other(name, value)
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_xmldecl_end
  raise RuntimeError, "[BUG] this method must be never called"
end

def on_stag_end_empty(name)
  raise RuntimeError, "[BUG] this method must be never called"
end
53
+
54
+
55
+ private
56
+
57
# Scans an SGML-style comment declaration and reports its text through
# on_comment.
#
# s - the chunk that starts with `<!--'; it is modified in place.
#
# Further chunks are pulled from @src.get_plain until a `--' is seen.  A
# declaration may hold several `-- ... --' comments separated by whitespace;
# they are all collected into one string.  Malformed input is reported via
# parse_error and scanning goes on where possible.  Reaching the end of
# input reports an error and emits what has been collected so far.
def scan_comment(s)
  s[0, 4] = ''                        # remove `<!--'
  comm = ''
  until /--/n =~ s
    comm << s
    s = @src.get_plain
    unless s
      parse_error "unterminated comment meets EOF"
      return on_comment(comm)
    end
  end
  comm << $`
  s = $'
  # Loop until only whitespace is left and the tag closes: `-->' or `-- >'.
  until (s.empty? || s.strip.empty?) && @src.close_tag
    comm << '--'
    if /\A\s*--/n =~ s                # <!--hoge-- --
      comm << $&
      s = $'
      if s.empty? && @src.close_tag   # <!--hoge-- -->
        parse_error "`-->' is found but comment must not end here"
        comm.chop!.chop!              # drop the `--' that opened nothing
        break
      end
    else                              # <!--hoge-- fuga
      parse_error "only whitespace can appear between two comments"
    end
    if (/\A-\s*\z/n =~ s) && @src.close_tag   # <!--hoge--->
      parse_error "`-->' is found but comment must not end here"
      comm.chop!
      break
    end
    # Same search as at the top; the original author duplicated it for speed.
    until /--/n =~ s
      comm << s
      s = @src.get_plain
      unless s
        parse_error "unterminated comment meets EOF"
        return on_comment(comm)
      end
    end
    comm << $`
    s = $'
  end
  on_comment comm
end
101
+
102
+
103
+ alias scan_xml_pi scan_pi # PIO "<?" PIC "?>" -- <? PI ?> --
104
+
105
+
106
# Scans an SGML-style processing instruction (`<?PI >') and reports it via
# on_pi with an empty target and the raw text as content.
#
# s - the chunk that starts with `<?'; it is modified in place and further
#     chunks from @src.get_plain are appended to it until the tag closes.
#
# Reaching the end of input first is reported through parse_error; whatever
# was collected is still passed to on_pi.
def scan_pi(s)
  s[0, 2] = ''                # remove `<?'
  pi = s
  loop do
    break if @src.close_tag
    s = @src.get_plain
    if s
      pi << s
    else
      parse_error "unterminated PI meets EOF"
      break
    end
  end
  on_pi '', pi
end
119
+
120
+
121
# Scans a start tag leniently and emits on_stag, the attribute events and
# on_stag_end.
#
# s - the chunk that starts with `<'.
#
# Attributes may be written as key="value", key='value', key=value or as a
# bare value (reported with a nil attribute name).  A quoted value that is
# not closed inside the current chunk is continued with chunks read from
# @src.get.  Anything that cannot be interpreted is reported through
# parse_error.
def scan_stag(s)
  if /(?=[\/\s='"])/n =~ s
    # A delimiter follows the name, so there may be attributes.
    name = $`
    s = $'
    name[0, 1] = ''                   # remove `<'
    if name.empty?                    # `< tag' or `<='
      parse_error "parse error at `<'"
      s << '>' if @src.close_tag
      return on_chardata('<' << s)
    end
    on_stag name
    loop do
      continue = false
      s.scan(
        /([^\s=\/'"]+)(?:\s*=\s*(?:('[^']*'?|"[^"]*"?)|([^\s='"]+)))?|(\S)/n
      ) do |key, val, val2, error|
        if key
          if val                      # key="value"
            on_attribute key
            qmark = val.slice!(0, 1)
            if val[-1] == qmark[0]
              val.chop!
              scan_attvalue val unless val.empty?
            else
              # The closing quote is missing in this chunk: keep reading
              # until a chunk contains it.
              scan_attvalue val unless val.empty?
              loop do
                s = @src.get
                unless s
                  parse_error "unterminated attribute `#{key}' meets EOF"
                  break
                end
                c = s[0]
                val, s = s.split(qmark, 2)
                scan_attvalue '>' unless c == ?< or c == ?>
                scan_attvalue val if c
                break if s
              end
              # Text after the closing quote is scanned in the next round.
              continue = s
            end
            on_attribute_end key
          elsif val2                  # key=value
            on_attribute key
            on_attr_value val2
            on_attribute_end key
          else                        # value
            on_attribute nil
            on_attr_value key
            on_attribute_end nil
          end
        else
          parse_error "parse error at `#{error}'"
        end
      end
      break unless continue
    end
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  else
    # Nothing but a name in this chunk.
    name = s
    name[0, 1] = ''                   # remove `<'
    if name.empty?                    # <> or <<
      return found_empty_stag if @src.close_tag
      parse_error "parse error at `<'"
      return on_chardata('<')
    end
    on_stag name
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  end
end
194
+
195
+
196
# Collects raw text up to (not including) the next chunk that begins with
# `</' and returns it; returns '' when the input is exhausted or such a
# chunk comes first.
#
# This method should be called only from on_stag_end.
def get_cdata_content
  peek = @src.test
  return '' if !peek || (peek[0] == ?< && peek[1] == ?/)
  content = @src.get
  loop do
    peek = @src.test
    break if !peek || (peek[0] == ?< && peek[1] == ?/)
    content << @src.get_plain
  end
  content
end
public :get_cdata_content
209
+
210
+
211
# Handles a `<!...' construct that was not recognised as anything else.
#
# s - the chunk that starts with `<!'.
#
# An empty declaration (`<!>') is reported as an empty comment.  Anything
# else is a parse error, and input is skipped chunk by chunk until the tag
# closes or the input ends.
def scan_bang_tag(s)
  if s == '<!' && @src.close_tag      # <!>
    on_comment ''
  else
    parse_error "parse error at `<!'"
    s = @src.get_plain while s && !@src.close_tag   # skip entire declaration
  end
end
221
+
222
+
223
# Reports an internal DTD subset as a parse error (it is not permitted in
# HTML) and then hands the text to skip_internal_dtd.
#
# s - the chunk at which the subset was found.
def scan_internal_dtd(s)
  parse_error("DTD subset is found but it is not permitted in HTML")
  skip_internal_dtd(s)
end
227
+
228
+
229
# Accepts the PUBLIC / SYSTEM keywords in any letter case and returns the
# upper-cased keyword; every other value is passed on to the inherited
# implementation.
#
# pubsys - the keyword as it appeared in the document.
def found_invalid_pubsys(pubsys)
  keyword = pubsys.upcase
  if keyword == 'PUBLIC' || keyword == 'SYSTEM'
    keyword
  else
    super
  end
end
234
+
235
+
236
# Scans the document prolog: comments, document type declarations,
# processing instructions and whitespace.  The first chunk that is none of
# these ends the prolog and is passed to scan_content.
#
# s - the first chunk of the document; later chunks come from @src.get.
#
# More than one doctype declaration is reported through parse_error but
# still scanned.  The doctype keyword is matched case-insensitively.
def scan_prolog(s)
  doctype_count = 0
  while s
    if s[0] != ?<
      break unless s.strip.empty?
      on_prolog_space s
    elsif s[1] == ?!
      if s[2] == ?- && s[3] == ?-
        scan_comment s
      elsif /\A<!doctype(?=\s)/in =~ s
        doctype_count += 1
        if doctype_count > 1
          parse_error "another document type declaration is found"
        end
        scan_doctype $'
      else
        break
      end
    elsif s[1] == ??
      scan_pi s
    else
      break
    end
    s = @src.get
  end
  # When the input ran out, scan_content receives one more read from @src.
  scan_content(s || @src.get)
end
266
+
267
+ end
268
+
269
+ end
270
+
271
+
272
+
273
+
274
+
275
# Self-test driver, executed only when this file is run directly: scans the
# files named on the command line (or standard input) as HTML and prints the
# consumed user CPU time.  Parse errors are printed only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::HTMLScanner.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Process.times replaces Time.times, which no longer exists since Ruby 1.9.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
@@ -0,0 +1,353 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/namespace.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: namespace.rb,v 1.13 2003/01/22 13:06:18 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/parser'
11
+
12
+
13
+ module XMLScan
14
+
15
# Namespace-level counterparts of the basic error classes; NSVisitor's
# default ns_*_error handlers raise them.

class NSParseError < ParseError
end

class NSNotWellFormedError < NotWellFormedError
end

class NSNotValidError < NotValidError
end
18
+
19
+
20
# Visitor interface for namespace-aware parsing.  It adds namespace error
# hooks, which raise by default, and namespace-aware element and attribute
# callbacks, which do nothing by default.
module NSVisitor

  include Visitor

  # Called for a namespace-related parse error.
  def ns_parse_error(msg)
    raise NSParseError, msg
  end

  # Called for a namespace well-formedness violation.
  def ns_wellformed_error(msg)
    raise NSNotWellFormedError, msg
  end

  # Called for a namespace validity violation.
  def ns_valid_error(msg)
    raise NSNotValidError, msg
  end

  #
  #   <foo:bar hoge:fuga='' hoge='' >
  #   <foo     hoge:fuga='' hoge='' >
  #   ^        ^          ^ ^     ^ ^
  #   1        2          3 4     5 6
  #
  # The following methods are called with the following arguments when
  # the parser reaches the points marked above:
  #
  #   1: on_stag_ns ('foo:bar', 'foo', 'bar')
  #        or
  #      on_stag_ns ('foo', '', 'foo')
  #   2: on_attribute_ns ('hoge:fuga', 'hoge', 'fuga')
  #   3: on_attribute_end ('hoge:fuga')
  #   4: on_attribute_ns ('hoge', nil, 'hoge')
  #   5: on_attribute_end ('hoge')
  #   6: on_stag_end_ns ('foo:bar', { 'foo' => '', ... })
  #        or
  #      on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
  #

  def on_stag_ns(qname, prefix, localpart); end

  def on_attribute_ns(qname, prefix, localpart); end

  def on_stag_end_ns(qname, namespaces); end

  def on_stag_end_empty_ns(qname, namespaces); end

end
70
+
71
+
72
+
73
+
74
+ class XMLNamespaceDecoration < Decoration
75
+
76
# Interpreter check: storing nil under a key must keep the key in the hash.
# The error message states that this requires Ruby 1.6 or above.
lambda {
  probe = { 'foo' => true }
  probe['foo'] = nil
  raise "requires Ruby-1.6 or above" unless probe.key?('foo')
}.call

# Prefixes with a fixed namespace name.
PredefinedNamespace = {
  'xml'   => 'http://www.w3.org/XML/1998/namespace',
  'xmlns' => 'http://www.w3.org/2000/xmlns/'
}

# Reverse lookup: namespace name => the only prefix it may be bound to.
ReservedNamespace = PredefinedNamespace.invert
87
+
88
+
89
# Namespace error reports are forwarded to the visitor that was current
# when the document started (@orig_visitor).

def ns_parse_error(msg)
  @orig_visitor.ns_parse_error(msg)
end

def ns_wellformed_error(msg)
  @orig_visitor.ns_wellformed_error(msg)
end

def ns_valid_error(msg)
  @orig_visitor.ns_valid_error(msg)
end
100
+
101
+
102
# Resets all namespace bookkeeping for a new document and forwards the
# event to the visitor.
def on_start_document
  @orig_visitor  = @visitor                        # target for ns_*_error reports
  @xmlns         = NamespaceDeclaration.new(self)  # collects xmlns attribute values
  @namespace     = {}                              # prefix => namespace name in scope
  @ns_hist       = []                              # per open element: bindings to restore
  @ns_undeclared = {}                              # for checking undeclared namespace prefixes
  @prev_prefix   = {}                              # for checking doubled attributes
  @dont_same     = []                              # ditto
  @visitor.on_start_document
end
112
+
113
+
114
# Handles a start-tag name: opens a new slot on the scope history, splits a
# qualified name into prefix and local part, records a prefix that is not
# declared yet, and forwards the result to the visitor's on_stag_ns.
#
# name - the element name as written ("foo" or "prefix:foo").
#
# A prefix counts as declared only while it maps to a non-nil namespace
# name.  Closing an element writes the saved outer bindings back into
# @namespace, which leaves a nil entry for a prefix that was unknown outside
# that element.  Checking the value rather than mere key presence keeps such
# a prefix from looking declared to elements that follow.
def on_stag(name)
  @ns_hist.push nil
  if /:/n =~ name
    prefix, localpart = $`, $'
    if localpart.include? ?:
      ns_parse_error "localpart `#{localpart}' includes `:'"
    end
    if prefix == 'xmlns'
      ns_wellformed_error \
        "prefix `xmlns' is not used for namespace prefix declaration"
    end
    unless @namespace[prefix]
      if (uri = PredefinedNamespace[prefix])
        @namespace[prefix] = uri
      else
        # Checked in fix_namespace, once all attributes have been seen.
        @ns_undeclared[prefix] = true
      end
    end
    @visitor.on_stag_ns name, prefix, localpart
  else
    @visitor.on_stag_ns name, '', name
  end
end
137
+
138
+
139
# Handles an attribute name.  Namespace declarations (`xmlns' and
# `xmlns:prefix') switch the current visitor to the declaration collector;
# every other attribute is forwarded to the visitor's on_attribute_ns.
#
# name - the attribute name as written.
#
# Prefixed attributes are remembered by local part so that fix_namespace
# can detect the same local part used twice in one namespace.
#
# A prefix counts as declared only while it maps to a non-nil namespace
# name.  Closing an element writes the saved outer bindings back into
# @namespace, which leaves a nil entry for a prefix that was unknown outside
# that element.  Checking the value rather than mere key presence keeps such
# a prefix from looking declared afterwards.
def on_attribute(name)
  if /:/n =~ name
    prefix, localpart = $`, $'
    if localpart.include? ?:
      ns_parse_error "localpart `#{localpart}' includes `:'"
    end
    unless @namespace[prefix]
      if (uri = PredefinedNamespace[prefix])
        @namespace[prefix] = uri
      else
        # Checked in fix_namespace, once all attributes have been seen.
        @ns_undeclared[prefix] = true
      end
    end
    if prefix == 'xmlns'
      @visitor = @xmlns
      @xmlns.on_xmlns_start localpart
    else
      if (prev = @prev_prefix[localpart])
        @dont_same.push [ prev, prefix, localpart ]
      end
      @prev_prefix[localpart] = prefix
      @visitor.on_attribute_ns name, prefix, localpart
    end
  elsif name == 'xmlns'
    @visitor = @xmlns
    @xmlns.on_xmlns_start ''
  else
    @visitor.on_attribute_ns name, nil, name
  end
end
169
+
170
+
171
# Temporary visitor that collects the value of one xmlns attribute and
# hands the finished declaration back to its parent decoration.
class NamespaceDeclaration

  include XMLScan::Visitor

  # parent - the object that receives on_xmlns_end and error reports.
  def initialize(parent)
    @parent = parent
  end

  # Starts collecting a declaration for +prefix+ ('' for the default one).
  def on_xmlns_start(prefix)
    @nsdecl = ''
    @prefix = prefix
  end

  # Literal text of the attribute value.
  def on_attr_value(str)
    @nsdecl << str
  end

  # Entity references inside a declaration are reported as an error.
  def on_attr_entityref(ref)
    @parent.ns_wellformed_error(
      "xmlns includes undeclared entity reference")
  end

  # A character reference is appended as the UTF-8 form of +code+.
  def on_attr_charref(code)
    @nsdecl << [code].pack('U')
  end

  # Same handling as on_attr_charref.
  def on_attr_charref_hex(code)
    @nsdecl << [code].pack('U')
  end

  # The attribute is complete: pass prefix and collected name to the parent.
  def on_attribute_end(name)
    @parent.on_xmlns_end(@prefix, @nsdecl)
  end

end
206
+
207
+
208
# Completes a namespace declaration: restores the regular visitor, checks
# the binding against the reserved prefixes and namespace names, and
# records it for the current element.
#
# prefix - declared prefix ('' for the default namespace).
# uri    - namespace name; an empty name for the default namespace is
#          stored as nil.
#
# The previous binding of the prefix is saved in the current element's slot
# of @ns_hist so that it can be restored when the element ends.
def on_xmlns_end(prefix, uri)
  @visitor = @orig_visitor
  if PredefinedNamespace.key?(prefix)
    fixed = PredefinedNamespace[prefix]
    if prefix == 'xmlns'
      ns_wellformed_error \
        "prefix `xmlns' can't be bound to any namespace explicitly"
    elsif fixed != uri
      ns_wellformed_error \
        "prefix `#{prefix}' can't be bound to any namespace except `#{fixed}'"
    end
  end
  if uri.empty?
    if prefix.empty?
      uri = nil
    else
      ns_parse_error "`#{prefix}' is bound to empty namespace name"
    end
  elsif ReservedNamespace.key?(uri)
    owner = ReservedNamespace[uri]
    unless owner == prefix
      ns_wellformed_error \
        "namespace `#{uri}' is reserved for prefix `#{owner}'"
    end
  end
  # The slot is created lazily on the first declaration of an element.
  saved = @ns_hist.last || (@ns_hist[-1] = {})
  saved[prefix] = @namespace[prefix]
  @namespace[prefix] = uri
  @ns_undeclared.delete prefix
end
235
+
236
+
237
# Runs the checks that are only possible once all attributes of a start
# tag have been seen, then clears the per-tag bookkeeping:
#
# * every prefix still listed in @ns_undeclared is reported as undeclared;
# * attribute pairs with the same local part whose prefixes map to the
#   same namespace name are reported as doubled.
#
# Both reports go through this object's own ns_wellformed_error, like every
# other namespace error raised here, instead of whichever object @visitor
# currently points to.
def fix_namespace
  @ns_undeclared.each_key do |prefix|
    ns_wellformed_error "prefix `#{prefix}' is not declared"
  end
  @ns_undeclared.clear
  @dont_same.each do |first, second, localpart|
    if @namespace[first] == @namespace[second]
      ns_wellformed_error \
        "doubled localpart `#{localpart}' in the same namespace"
    end
  end
  @dont_same.clear
  @prev_prefix.clear
end
255
+
256
+
257
# End of a start tag: runs the deferred namespace checks, then reports the
# element to the visitor together with the namespace bindings in scope.
def on_stag_end(name)
  fix_namespace
  @visitor.on_stag_end_ns(name, @namespace)
end
261
+
262
+
263
# End tag: restores the bindings that the closing element had replaced (if
# it declared any) and forwards the event to the visitor.
def on_etag(name)
  saved = @ns_hist.pop
  @namespace.update(saved) if saved
  @visitor.on_etag(name)
end
267
+
268
+
269
# End of an empty-element tag: runs the deferred namespace checks, reports
# the element with the bindings in scope, and then closes the element's
# scope by restoring the bindings it had replaced.
def on_stag_end_empty(name)
  fix_namespace
  @visitor.on_stag_end_empty_ns(name, @namespace)
  saved = @ns_hist.pop
  @namespace.update(saved) if saved
end
274
+
275
+
276
# Document type declaration: a root name containing more than one `:' is
# reported as a namespace parse error; the event is forwarded either way.
def on_doctype(root, pubid, sysid)
  colons = root.count(':')
  ns_parse_error "qualified name `#{root}' includes `:'" if colons > 1
  @visitor.on_doctype(root, pubid, sysid)
end
282
+
283
+
284
# Processing instruction: a target containing `:' is reported as a
# namespace parse error; the event is forwarded either way.
def on_pi(target, pi)
  ns_parse_error "PI target `#{target}' includes `:'" if target.include?(':')
  @visitor.on_pi(target, pi)
end
290
+
291
+
292
# Entity reference in content: a name containing `:' is reported as a
# namespace parse error; the event is forwarded either way.
def on_entityref(ref)
  ns_parse_error "entity reference `#{ref}' includes `:'" if ref.include?(':')
  @visitor.on_entityref(ref)
end
298
+
299
+
300
# Entity reference inside an attribute value: a name containing `:' is
# reported as a namespace parse error; the event is forwarded either way.
def on_attr_entityref(ref)
  ns_parse_error "entity reference `#{ref}' includes `:'" if ref.include?(':')
  @visitor.on_attr_entityref(ref)
end
306
+
307
+ end
308
+
309
+
310
+
311
# XMLParser variant that wraps its visitor in an XMLNamespaceDecoration so
# that events reach the visitor through the namespace processing.
class XMLParserNS < XMLParser

  # Takes the same arguments as XMLParser#initialize.
  def initialize(*)
    super
    @decoration = XMLNamespaceDecoration.new(@visitor)
    @visitor = @decoration
  end

end
319
+
320
+ end
321
+
322
+
323
+
324
+
325
+
326
# Self-test driver, executed only when this file is run directly: parses the
# files named on the command line (or standard input) with namespace
# processing and prints the consumed user CPU time.  Diagnostics are printed
# only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::NSVisitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def warning(msg)
      STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
    end
    def ns_parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def ns_wellformed_error(msg)
      STDERR.printf("%s:%d: NSC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLParserNS.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Process.times replaces Time.times, which no longer exists since Ruby 1.9.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end