xmlscan 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,300 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/parser.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: parser.rb,v 1.10 2003/01/22 13:06:18 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/scanner'
11
+
12
+
13
+ module XMLScan
14
+
15
+ class XMLParser < XMLScanner
16
+
17
# Remembers the attribute names seen on the current start tag so that
# duplicates can be reported.  It subclasses Hash purely for speed.
class AttributeChecker < Hash

  # Records +name+ as seen.
  # Returns true the first time a name is offered and false when the
  # name has already been recorded.
  def check_unique(name)
    return false if key?(name)
    store(name, true)
  end

end
25
+
26
+
27
+ #PredefinedEntity = {
28
+ # 'lt' => '<',
29
+ # 'gt' => '>',
30
+ # 'amp' => '&',
31
+ # 'quot' => '"',
32
+ # 'apos' => "'",
33
+ #}
34
+
35
+
36
# Resets the per-document state (open-element stack, attribute
# uniqueness checker and standalone flag) and then hands the original
# arguments on to the inherited +parse+.
def parse(*)
  @standalone = nil
  @attr = AttributeChecker.new
  @elem = []
  super
end
42
+
43
+
44
+ private
45
+
46
# Emits a warning for any declared version other than 1.0, then forwards
# the version string to the visitor.
def on_xmldecl_version(str)
  warning "unsupported XML version `#{str}'" unless str == '1.0'
  @visitor.on_xmldecl_version(str)
end
52
+
53
+
54
# Records the standalone declaration in @standalone (true for `yes',
# false for `no'), reports a parse error for any other value, and
# forwards the raw string to the visitor in every case.
def on_xmldecl_standalone(str)
  case str
  when 'yes'
    @standalone = true
  when 'no'
    @standalone = false
  else
    parse_error "standalone declaration must be either `yes' or `no'"
  end
  @visitor.on_xmldecl_standalone(str)
end
64
+
65
+
66
# Reports a parse error when a public ID is given without a system ID,
# then forwards the DOCTYPE information to the visitor.
def on_doctype(name, pubid, sysid)
  if pubid && !sysid
    parse_error "public external ID must have both public ID and system ID"
  end
  @visitor.on_doctype(name, pubid, sysid)
end
72
+
73
+
74
# Whitespace outside the root element is deliberately dropped: nothing
# is forwarded to the visitor.
def on_prolog_space(s)
  nil
end
77
+
78
+
79
# Reports a parse error when the PI target is `xml' in any letter case
# (a reserved name), then forwards the processing instruction to the
# visitor regardless.
def on_pi(target, pi)
  reserved = target.downcase == 'xml'
  parse_error "reserved PI target `#{target}'" if reserved
  @visitor.on_pi(target, pi)
end
85
+
86
+
87
+ #def on_entityref(ref)
88
+ # rep = PredefinedEntity[ref]
89
+ # if rep then
90
+ # @visitor.on_chardata rep
91
+ # else
92
+ # @visitor.on_entityref ref
93
+ # end
94
+ #end
95
+
96
+
97
+ #def on_attr_entityref(ref)
98
+ # rep = PredefinedEntity[ref]
99
+ # if rep then
100
+ # @visitor.on_attr_value rep
101
+ # else
102
+ # @visitor.on_attr_entityref ref
103
+ # end
104
+ #end
105
+
106
+
107
+ #def on_charref_hex(code)
108
+ # on_charref code
109
+ #end
110
+
111
+
112
+ #def on_attr_charref_hex(code)
113
+ # on_attr_charref code
114
+ #end
115
+
116
+
117
# Pushes the element name onto the open-element stack, notifies the
# visitor, and clears the attribute-uniqueness table for the new tag.
def on_stag(name)
  @elem << name
  @visitor.on_stag(name)
  @attr.clear
end
122
+
123
# Reports a well-formedness error when the attribute name was already
# seen on the current tag; the attribute is forwarded to the visitor
# either way.
def on_attribute(name)
  wellformed_error "doubled attribute `#{name}'" unless @attr.check_unique(name)
  @visitor.on_attribute(name)
end
129
+
130
# Normalizes the attribute value in place (each tab, CR and LF becomes a
# single space) and forwards it to the visitor.  Note that the caller's
# string object is modified.
def on_attr_value(str)
  str.tr!("\t\r\n", ' ')
  @visitor.on_attr_value(str)
end
134
+
135
# Handles the end of an empty-element tag: the visitor receives
# on_stag_end_empty (not an on_stag_end / on_etag pair) and the element
# is removed from the open-element stack.
def on_stag_end_empty(name)
  @visitor.on_stag_end_empty(name)
  @elem.pop
end
142
+
143
# Matches an end tag against the innermost open element.
# - matching name: forwarded to the visitor as is;
# - different open element: well-formedness error, and the visitor is
#   told that the element actually open has ended;
# - no open element: parse error, nothing is forwarded.
def on_etag(name)
  opened = @elem.pop
  return @visitor.on_etag(name) if opened == name
  return parse_error("end tag `#{name}' appears alone") unless opened
  wellformed_error "element type `#{name}' is not matched"
  @visitor.on_etag(opened)
end
154
+
155
+
156
+ public
157
+
158
+
159
# Scans the document body while enforcing document structure.  Each
# pass of the outer loop has three phases:
#   1. everything up to and including the first start tag (character
#      data and CDATA found here are reported as errors),
#   2. the contents of the root element, until the open-element stack
#      becomes empty or the input ends,
#   3. the epilogue, where only comments, PIs and whitespace are allowed.
# If the epilogue meets anything else, the whole cycle starts again with
# that token, so trailing content is still scanned.
def scan_content(s)
  open_elems = @elem  # cached in locals for speed
  input = @src
  root_seen = false

  while true
    # -- first start tag --
    open_elems.clear
    stag_seen = false

    while s and not stag_seen
      if s[0] == '<'
        case s[1]
        when '/'
          scan_etag s  # a stray end tag; should be a parse error
        when '!'
          if s[2] == '-' and s[3] == '-'
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            parse_error "CDATA section is found outside of root element"
            scan_cdata $'
          else
            scan_bang_tag s
          end
        when '?'
          scan_pi s
        else
          root_seen = true
          stag_seen = true
          scan_stag s
        end
      else
        parse_error "content of element is found outside of root element"
        scan_chardata s
      end
      s = input.get
    end

    parse_error "no root element was found" unless root_seen || stag_seen

    # -- contents --
    while s and not open_elems.empty?
      if s[0] == '<'
        case s[1]
        when '/'
          scan_etag s
        when '!'
          if s[2] == '-' and s[3] == '-'
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            scan_cdata $'
          else
            scan_bang_tag s
          end
        when '?'
          scan_pi s
        else
          scan_stag s
        end
      else
        scan_chardata s
      end
      s = input.get
    end

    # Input ended with elements still open: close them innermost first.
    while (name = open_elems.pop)
      parse_error "unclosed element `#{name}' meets EOF"
      @visitor.on_etag name
    end

    # -- epilogue --
    finish = true

    while s
      if s[0] == '<'
        case s[1]
        when '/'
          finish = false  # content out of root element
          break
        when '!'
          if s[2] == '-' and s[3] == '-'
            scan_comment s
          else
            finish = false  # content out of root element
            break
          end
        when '?'
          scan_pi s
        else
          parse_error "another root element is found"  # stag
          finish = false
          break
        end
      elsif s.strip.empty?
        on_prolog_space s
      else
        finish = false  # content out of root element
        break
      end
      s = input.get
    end

    break if finish
  end
end
269
+ end
270
+
271
+
272
+ end
273
+
274
+
275
+
276
+
277
+
278
+
279
# Command-line driver: parses the files named on the command line (or
# standard input) and prints the user CPU time spent.  Diagnostics are
# written to STDERR only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def warning(msg)
      STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLParser.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Time.times is gone from the Ruby versions this file targets;
  # Process.times provides the same utime figure.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
@@ -0,0 +1,1123 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/scanner.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: scanner.rb,v 1.75.2.3 2003/05/01 15:43:23 katsu Exp $
8
+ #
9
+
10
+ #
11
+ # CONSIDERATIONS FOR CHARACTER ENCODINGS:
12
+ #
13
+ # There are the following common characteristics in character encodings
14
+ # which are supported by Ruby's $KCODE feature (ISO-8859-*, Shift_JIS,
15
+ # EUC, and UTF-8):
16
+ #
17
+ # - Stateless.
18
+ # - ASCII characters are encoded in the same manner as US-ASCII.
19
+ # - The octet sequences corresponding to non-ASCII characters begin
20
+ # with an octet greater than 0x80.
21
+ # - The following characters can be identified by just one octet.
22
+ # That is, every octet corresponding to the following characters in
23
+ # US-ASCII never appear as a part of an octet sequence representing a
24
+ # non-ASCII character.
25
+ #
26
+ # Whitespaces("\t", "\n", "\r", and " ") and
27
+ # ! \ " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
28
+ #
29
+ # Be careful that `[' and `]' are NOT included in the list!
30
+ #
31
+ # If we build a regular expression carefully in accordance with these
32
+ # characteristics, we can get the same match regardless of the value
33
+ # of $KCODE. Moreover, if it can be premised on them, we can detect
34
+ # several delimiters without regular expressions. XMLScanner uses this
35
+ # fact in order to share many regular expressions in all $KCODE modes,
36
+ # and in order to optimize parsing speed.
37
+ #
38
+
39
+ require 'xmlscan/visitor'
40
+
41
+
42
+ module XMLScan
43
+
44
# Adapter that gives any source the small reader interface the scanner
# needs: +gets+, +lineno+ and +path+.
#
# * An object that already has +gets+ is read through directly.
# * An array-like object (responds to +to_ary+) is served one element per
#   +gets+ call, with +lineno+ counting the elements handed out.
# * Anything else is returned whole by the first +gets+ and followed by nil.
#
# +lineno+ and +path+ are delegated to the source when it provides them;
# otherwise the defaults 0 and '-' apply.
class Input

  attr_reader :src

  def initialize(src)
    @src = src
    unless src.respond_to?(:gets)
      if src.respond_to?(:to_ary)
        @v = src.to_ary
        @n = -1
        def self.gets
          @v.at(@n += 1)
        end
        def self.lineno
          @n + 1
        end
      else
        @v = @src
        def self.gets
          whole = @v
          @v = nil
          whole
        end
      end
    end
    if src.respond_to?(:lineno)
      def self.lineno
        @src.lineno
      end
    end
    if src.respond_to?(:path)
      def self.path
        @src.path
      end
    end
  end

  # Default reader methods; initialize overrides them per instance as
  # described above.
  def gets
    @src.gets
  end

  def lineno
    0
  end

  def path
    '-'
  end

  # Returns +src+ itself when it already offers the full reader
  # interface, otherwise a new Input wrapping it.
  def self.wrap(src)
    complete = [:gets, :lineno, :path].all? { |meth| src.respond_to?(meth) }
    complete ? src : new(src)
  end

  # Inverse of +wrap+: returns the wrapped source of an Input, or +obj+
  # unchanged when it is not an Input.
  def self.unwrap(obj)
    self === obj ? obj.src : obj
  end

end
90
+
91
+
92
+
93
# An Array whose Array-specific instance methods are all private, so
# subclasses can use the fast built-in storage internally without
# exposing the Array API.  Methods also provided by Kernel stay public.
class PrivateArray < Array
  private(*(superclass.instance_methods - Kernel.instance_methods))
end
97
+
98
+
99
+ class Source < PrivateArray
100
+ # Source inherits Array only for speed.
101
+
102
# Starts with an empty token buffer, wraps +src+ so that it offers the
# reader interface, and resets the end-of-input flag and the carried
# over (possibly incomplete) last token.
def initialize(src)
  super()
  @src = Input.wrap(src)
  @eof, @last = false, nil
end
108
+
109
# Returns the object originally handed to the constructor, undoing any
# Input wrapping.
def source
  Input.unwrap(@src)
end
112
+
113
+
114
# True once the underlying reader is exhausted and no buffered token is
# left.
def eof?
  @eof && empty?
end
117
+
118
# Discards all buffered input and marks the source as finished, so that
# subsequent reads report end of input.  Returns self.
def abort
  @eof, @last = true, nil
  clear
  self
end
124
+
125
+
126
# Returns the next token, or nil at end of input.
#
# Tokens are kept in reverse order so the next one can be taken with
# +pop+.  When the buffer is empty, lines are read from the underlying
# reader and split at tag boundaries: a token starting with `<' is a
# tag without its closing `>', and other tokens are the text that
# follows a `>'.  The final piece of each line may continue on the next
# line, so it is held back in @last and either glued to the first piece
# of the next line or queued on its own when that piece starts with `<'
# or `>'.
def get
  queued = pop
  return queued if queued
  return nil if @eof

  pending = @last
  while true
    line = @src.gets
    unless line
      # Reader exhausted: flush the held-back piece as the final token.
      @eof = true
      unshift pending
      pending = nil
      break
    end
    pieces = line.split(/(?=<|>[<>])|>/, -1)
    if pending
      if /\A[<>]/ =~ pieces.first
        push pending
      else
        pieces[0] = pending << (pieces.first || '')
      end
    end
    concat pieces
    pending = pop
    break unless empty?
  end
  @last = pending
  reverse!
  pop
end
154
+
155
+
156
# Primes the buffer with the first token.  When the very first token is
# empty, the following token gets a `>' prefixed before being pushed
# back; the original comment says this preserves a leading `>'.
def prepare
  tok = get
  if tok && tok.empty?
    tok = get
    tok = '>' << tok if tok
  end
  push tok if tok
end
161
+
162
+
163
# Looks at the upcoming token (buffered, or the held-back @last) without
# consuming it.  Truthy when it does NOT start with `<', i.e. the
# current tag has been closed.  Returns nil when no token is available.
def tag_end?
  upcoming = last || @last
  upcoming && upcoming[0] != '<'
end

# Counterpart of tag_end?: truthy when the upcoming token starts with
# `<', i.e. another tag begins before the current one was closed.
def tag_start?
  upcoming = last || @last
  upcoming && upcoming[0] == '<'
end
170
+
171
# Reports whether the current tag is closed, i.e. whether the upcoming
# token does not start with `<', and removes the `>' when it is a token
# of its own.  Returns false when another tag (or nothing) follows.
#
# When the upcoming token is a bare `>' or empty it is consumed.  If the
# token after it is not a tag, further bare tokens are pulled and the
# collected run is put back into the buffer; the last one is dropped
# first when the token that stopped the run is a tag or missing.
def close_tag
  head = last || @last
  return false unless head && head[0] != '<'

  if head == '>' || head.empty?
    first = get
    head = last || @last
    unless head && head[0] == '<'  # for speed up
      held = [first]
      while (head = last || @last) && (head == '>' || head.empty?)
        held.push get
      end
      held.pop unless head && head[0] != '<'
      concat held
    end
  end
  true
end
187
+
188
+
189
# Consumes and returns the next token only when it is text (does not
# start with `<').  Returns false when a tag is next and nil when no
# token is available.
def get_text  # get until tag_start?
  upcoming = last || @last
  upcoming && upcoming[0] != '<' && get
end

# Consumes and returns the next token only when it is a tag (starts
# with `<').  Returns false when text is next and nil when no token is
# available.
def get_tag  # get until tag_end?
  upcoming = last || @last
  upcoming && upcoming[0] == '<' && get
end
196
+
197
# Returns the next token as literal text: unless it already starts with
# `<' or `>', the `>' that the tokenizer removed in front of it is put
# back.  Returns nil at end of input.
def get_plain
  tok = get
  if tok
    lead = tok[0]
    tok = '>' << tok if lead != '<' && lead != '>'
  end
  tok
end
202
+
203
# Current line number as reported by the underlying reader.
def lineno
  @src.lineno
end

# Path (name) of the underlying reader.
def path
  @src.path
end
210
+
211
+
212
+ # The following methods are for debug.
213
+
214
# Debugging aid.  Renders the buffered tokens in reading order, then the
# held-back token, then the underlying source.  A `>' marker is shown in
# front of every token that does not itself start with `<' or `>', and
# `#eof' is appended to the buffered list once the reader is exhausted.
def inspect
  render = lambda { |tok|
    /\A[<>]/ =~ tok ? [tok.inspect] : ['>', tok.inspect]
  }
  queued = []
  reverse_each { |tok| queued.concat render.call(tok) }
  pending = @last ? render.call(@last) : []
  queued.push '#eof' if @eof
  "((#{queued.join(' ')}) (#{pending.join(' ')}) . #{source.inspect})"
end
228
+
229
# Debugging aid: primes the buffer, then yields every remaining token in
# order.  Returns self.
def each
  prepare
  while (tok = get)
    yield(tok)
  end
  self
end
236
+
237
# Debugging aid: peeks at the upcoming token.  When nothing is buffered
# or held back, one token is read and pushed back so that it is not
# lost.  Returns nil at end of input.
def test
  peeked = last || @last
  return peeked if peeked
  tok = get
  tok && push(tok) && tok
end
240
+
241
+ end
242
+
243
+
244
+
245
+ class XMLScanner
246
+
247
# Class-level option handling.  An option named +foo+ is implemented by
# a private instance method +apply_option_foo+; XMLScanner.new accepts
# option names after the visitor and applies each one to the new
# instance.
class << self

  # Names of the options this class can apply, derived from its private
  # instance methods called apply_option_*.
  def provided_options
    private_instance_methods.each_with_object([]) do |meth, found|
      found.push $' if /\Aapply_option_/ =~ meth
    end
  end

  # Applies a single option by invoking the instance's apply_option_*
  # method.
  def apply_option(instance, option)
    instance.__send__ "apply_option_#{option}"
  end

  # Applies every requested option, letting each ancestor that provides
  # options claim the ones it knows.  Raises ArgumentError for an option
  # nobody provides.  Returns the instance.
  def apply_options(instance, options)
    pending = {}
    options.each { |opt| pending[opt.to_s] = true }
    ancestors.each do |klass|
      next unless klass.respond_to?(:provided_options)
      klass.provided_options.each do |opt|
        next unless pending.include?(opt)
        pending.delete opt
        klass.apply_option instance, opt
      end
    end
    unless pending.empty?
      raise ArgumentError, "undefined option `#{pending.keys[0]}'"
    end
    instance
  end
  private :apply_options

  # Builds a scanner for +visitor+ and applies the given options to it.
  def new(visitor, *options)
    scanner = super(visitor)
    apply_options scanner, options
  end

end
288
+
289
+
290
+
291
# Stores the visitor that receives all scanner events.  No decoration,
# input source or encoding option key is set initially.
def initialize(visitor)
  @visitor = visitor
  @decoration = @src = @optkey = nil
end
297
+
298
+ attr_accessor :optkey
299
+
300
# Encoding associated with the current option key in
# OptRegexp::RE_ENCODINGS (nil when the key is not listed there).
def opt_encoding
  OptRegexp::RE_ENCODINGS[optkey]
end
301
+
302
+
303
# Wraps the visitor in a Decoration the first time it is called, and
# makes that wrapper the visitor; every call then expands the wrapper
# with +decoration+.
def decorate(decoration)
  @visitor = @decoration = Decoration.new(@visitor) unless @decoration
  @decoration.expand(decoration)
end
309
+ private :decorate
310
+
311
+
312
# Line number of the source being parsed; the falsy @src itself when no
# parse is in progress.
def lineno
  @src ? @src.lineno : @src
end

# Path of the source being parsed; the falsy @src itself when no parse
# is in progress.
def path
  @src ? @src.path : @src
end

# The object handed to parse_document.  Only usable while a parse is in
# progress, because @src is nil otherwise.
def source
  @src.source
end
323
+
324
+
325
+ private
326
+
327
# Diagnostics are not handled by the scanner itself; each kind is passed
# to the visitor's method of the same name.

def parse_error(msg)
  @visitor.parse_error(msg)
end

def wellformed_error(msg)
  @visitor.wellformed_error(msg)
end

def valid_error(msg)
  @visitor.valid_error(msg)
end

def warning(msg)
  @visitor.warning(msg)
end
342
+
343
+
344
# Tells the visitor that an XML declaration starts.
def on_xmldecl
  @visitor.on_xmldecl()
end
347
+
348
# Dispatches one `key="value"' pair of the XML declaration.
#
# The pair goes to this scanner's on_xmldecl_<key> handler when the
# visitor supports that event and the scanner has such a handler taking
# a single argument.  Every other key is reported through
# on_xmldecl_other.
#
# +key+ comes straight from the document, so the handler name is
# attacker-controlled.  Checking only the visitor (as before) let a
# declaration such as `end="x"' call on_xmldecl_end with an argument and
# raise ArgumentError, and let a key implemented only by the visitor
# raise NoMethodError.  Such keys are now treated as unknown.
def on_xmldecl_key(key, str)
  meth = "on_xmldecl_#{key}"
  if @visitor.respond_to?(meth) && respond_to?(meth, true)
    arity = method(meth).arity
    # Accept exactly-one-argument handlers and variadic ones.
    return send(meth, str) if arity == 1 || arity == -1 || arity == -2
  end
  on_xmldecl_other key, str
end
356
+
357
# Default event handlers.  Each one passes the event to the visitor
# unchanged; subclasses override them to add checks.

def on_xmldecl_version(str)
  @visitor.on_xmldecl_version(str)
end

def on_xmldecl_encoding(str)
  @visitor.on_xmldecl_encoding(str)
end

def on_xmldecl_standalone(str)
  @visitor.on_xmldecl_standalone(str)
end

def on_xmldecl_other(name, value)
  @visitor.on_xmldecl_other(name, value)
end

def on_xmldecl_end
  @visitor.on_xmldecl_end()
end

def on_doctype(root, pubid, sysid)
  @visitor.on_doctype(root, pubid, sysid)
end

def on_prolog_space(str)
  @visitor.on_prolog_space(str)
end

def on_comment(str)
  @visitor.on_comment(str)
end

def on_pi(target, pi)
  @visitor.on_pi(target, pi)
end

def on_chardata(str)
  @visitor.on_chardata(str)
end

def on_cdata(str)
  @visitor.on_cdata(str)
end

def on_etag(name)
  @visitor.on_etag(name)
end

def on_entityref(ref)
  @visitor.on_entityref(ref)
end

def on_charref(code)
  @visitor.on_charref(code)
end

def on_charref_hex(code)
  @visitor.on_charref_hex(code)
end

def on_start_document
  @visitor.on_start_document()
end

def on_end_document
  @visitor.on_end_document()
end
424
+
425
+
426
+ # <hoge fuga="foo&bar;&#38;&#x26;foo" />HOGE
427
+ # ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
428
+ # 1 2 3 4 5 6 7 8 9 A
429
+ #
430
+ # The following method will be called with the following arguments
431
+ # when the parser reaches the above point;
432
+ #
433
+ # 1: on_stag ('hoge')
434
+ # 2: on_attribute ('fuga')
435
+ # 3: on_attr_value ('foo')
436
+ # 4: on_attr_entityref ('bar')
437
+ # 5: on_attr_charref (38)
438
+ # 6: on_attr_charref_hex (38)
439
+ # 7: on_attr_value ('foo')
440
+ # 8: on_attribute_end ('fuga')
441
+ # 9: on_stag_end_empty ('hoge')
442
+ # or
443
+ # on_stag_end ('hoge')
444
+ #
445
+ # A: on_chardata ('HOGE')
446
+
447
# Default start-tag and attribute event handlers; each passes the event
# to the visitor unchanged.

def on_stag(name)
  @visitor.on_stag(name)
end

def on_attribute(name)
  @visitor.on_attribute(name)
end

def on_attr_value(str)
  @visitor.on_attr_value(str)
end

def on_attr_entityref(ref)
  @visitor.on_attr_entityref(ref)
end

def on_attr_charref(code)
  @visitor.on_attr_charref(code)
end

def on_attr_charref_hex(code)
  @visitor.on_attr_charref_hex(code)
end

def on_attribute_end(name)
  @visitor.on_attribute_end(name)
end

def on_stag_end_empty(name)
  @visitor.on_stag_end_empty(name)
end

def on_stag_end(name)
  @visitor.on_stag_end(name)
end
482
+
483
+
484
+
485
+ private
486
+
487
# Helpers for building one regular expression per encoding option key
# (:n, :e, :s, :u).
module OptRegexp
  # Sample strings used only to obtain regexps of the wanted encodings.
  UTFSTR = "é"
  S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
  E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')

  # Encoding of a regexp literal written with each encoding flag.
  RE_ENCODINGS = {
    :n=>/e/n.encoding,
    :e=>/#{E_OPT_EXAMPLE}/e.encoding,
    :s=>/#{S_OPT_EXAMPLE}/s.encoding,
    :u=>/#{UTFSTR}/u.encoding
  }

  # Option bits of a regexp literal written with each encoding flag.
  RE_ENCODING_OPTIONS = {
    :n=>/e/n.options,
    :e=>/#{E_OPT_EXAMPLE}/e.options,
    :s=>/#{S_OPT_EXAMPLE}/s.options,
    :u=>/#{UTFSTR}/u.options
  }

  private

  # Compiles the pattern source +re+ once per option key and returns a
  # Hash from key to Regexp.  Looking up any other key (including nil)
  # yields a regexp compiled from +re+ without encoding options.
  def opt_regexp(re)
    table = {}
    RE_ENCODING_OPTIONS.each do |key, bits|
      table[key] = Regexp.new(re.encode(RE_ENCODINGS[key]), bits)
    end
    table.default = Regexp.new(re)
    table
  end
end
516
+ extend OptRegexp
517
+
518
+
519
+ InvalidEntityRef = opt_regexp('(?=[^#\d\w]|\z)')
520
+
521
# Scans a run of character data, reporting plain text through
# on_chardata and `&...;' references through on_entityref, on_charref
# and on_charref_hex.  After each token, further text tokens are pulled
# from the source (with their leading `>' restored) until a tag or the
# end of input follows.
#
# A `&' that does not start a terminated reference is reported as a
# parse error: if a name follows it is still treated as a reference,
# otherwise the `&' and the text after it are passed on as character
# data.
#
# Statement order matters here: $` and $' always refer to the most
# recent successful match in this method.
def scan_chardata(s)
  while true
    if /&/ =~ s
      s = $`
      on_chardata s unless s.empty?
      ref = nil
      $'.split('&', -1).each { |piece|
        unless /(?!\A);|(?=[ \t\r\n])/ =~ piece and not $&.empty?
          if InvalidEntityRef[@optkey] =~ piece and not (ref = $`).strip.empty?
            parse_error "reference to `#{ref}' doesn't end with `;'"
          else
            parse_error "`&' is not used for entity/character references"
            on_chardata('&' << piece)
            next
          end
        end
        ref = $`
        rest = $'
        case ref
        when /\A[^#]/
          on_entityref ref
        when /\A#(\d+)\z/
          on_charref $1.to_i
        when /\A#x([\dA-Fa-f]+)\z/
          on_charref_hex $1.hex
        else
          parse_error "invalid character reference `#{ref}'"
        end
        on_chardata rest unless rest.empty?
      }
    else
      on_chardata s
    end
    s = @src.get_text
    break unless s
    s = '>' << s unless s == '>'
  end
end
559
+
560
+
561
# Scans one piece of an attribute value.  Plain text is reported through
# on_attr_value, and `&...;' references through on_attr_entityref,
# on_attr_charref and on_attr_charref_hex.  The logic mirrors
# scan_chardata, without pulling further tokens from the source.
#
# Statement order matters here: $` and $' always refer to the most
# recent successful match in this method.
def scan_attvalue(s)
  if /&/ =~ s
    s = $`
    on_attr_value s unless s.empty?
    ref = nil
    $'.split('&', -1).each { |piece|
      unless /(?!\A);|(?=[ \t\r\n])/ =~ piece and not $&.empty?
        if InvalidEntityRef[@optkey] =~ piece and not (ref = $`).strip.empty?
          parse_error "reference to `#{ref}' doesn't end with `;'"
        else
          parse_error "`&' is not used for entity/character references"
          on_attr_value('&' << piece)
          next
        end
      end
      ref = $`
      rest = $'
      case ref
      when /\A[^#]/
        on_attr_entityref ref
      when /\A#(\d+)\z/
        on_attr_charref $1.to_i
      when /\A#x([\dA-Fa-f]+)\z/
        on_attr_charref_hex $1.hex
      else
        parse_error "invalid character reference `#{ref}'"
      end
      on_attr_value rest unless rest.empty?
    }
  else
    on_attr_value s
  end
end
594
+
595
+
596
# Scans a comment whose first token +s+ starts with `<!--' and reports
# its text through on_comment.  Because tokens are split at `>', the
# comment may span several tokens; they are joined until a `--' is
# directly followed by the closing `>'.
#
# Errors reported: a `--' inside the comment, a comment ending in
# `--->', and end of input before the terminator (the text collected so
# far is still reported).
#
# The argument string is modified in place.
def scan_comment(s)
  s.slice!(0, 4)  # remove `<!--'
  comm = ''
  until /--/ =~ s
    comm << s
    s = @src.get_plain
    unless s
      parse_error "unterminated comment meets EOF"
      return on_comment(comm)
    end
  end
  comm << $`
  while true
    s = $'
    break if s.empty? && @src.close_tag
    if s == '-' && @src.close_tag  # --->
      parse_error "comment ending in `--->' is not allowed"
      comm << s
      break
    end
    parse_error "comment includes `--'"
    comm << '--'
    until /--/ =~ s  # same loop as above, repeated inline for performance
      comm << s
      s = @src.get_plain
      unless s
        parse_error "unterminated comment meets EOF"
        return on_comment(comm)
      end
    end
    comm << $`
  end
  on_comment comm
end
628
+
629
+
630
# Scans a processing instruction whose first token +s+ starts with `<?'
# and reports it through on_pi(target, content).  Further tokens are
# appended until the content ends with `?' directly followed by the
# closing `>'.
#
# A `<?' without a usable target is a parse error and the token is
# passed on as character data.  End of input before `?>' is a parse
# error, and what was collected is still reported.
def scan_pi(s)
  if /\A<\?([^ \t\n\r?]+)(?:[ \t\n\r]+|(?=\?\z))/ =~ s
    target = $1
    pi = $'
    until pi[-1] == '?' and @src.close_tag
      s = @src.get_plain
      unless s
        parse_error "unterminated PI meets EOF"
        return on_pi(target, pi)
      end
      pi << s
    end
    pi.chop!  # remove last `?'
    on_pi target, pi
  else
    parse_error "parse error at `<?'"
    s << '>' if @src.close_tag
    on_chardata s
  end
end
650
+
651
+
652
+ CDATAPattern = opt_regexp('\]\]\z')
653
+
654
# Scans a CDATA section.  +s+ is the text following `<![CDATA['.
# Tokens are appended to it until it ends with `]]' directly followed by
# the closing `>', and the content (without `]]') is reported through
# on_cdata.  End of input before the terminator is a parse error, and
# what was collected is still reported.  The argument string is
# extended in place.
def scan_cdata(s)
  cdata = s
  terminator = CDATAPattern[@optkey]
  until terminator =~ cdata and @src.close_tag
    chunk = @src.get_plain
    unless chunk
      parse_error "unterminated CDATA section meets EOF"
      return on_cdata(cdata)
    end
    cdata << chunk
  end
  2.times { cdata.chop! }  # remove ']]'
  on_cdata cdata
end
668
+
669
+
670
# Reports an end tag that lacks its closing `>', distinguishing whether
# another tag or the end of input follows.
def found_unclosed_etag(name)
  message =
    if @src.tag_start?
      "unclosed end tag `#{name}' meets another tag"
    else
      "unclosed end tag `#{name}' meets EOF"
    end
  parse_error message
end

# Reports `</>' as a parse error and passes it on as character data.
def found_empty_etag
  parse_error "parse error at `</'"
  on_chardata('</>')
end
682
+
683
+
684
# Scans an end tag token (+s+ starts with `</') and reports the element
# name through on_etag.
#
# Malformed cases:
# - `</>' is handled by found_empty_etag;
# - `</' followed by another tag, end of input or whitespace is a parse
#   error and the text is passed on as character data;
# - whitespace inside the name (`</ta g') is a parse error; the part
#   before the whitespace is used as the name and the remaining tag
#   tokens are skipped;
# - a missing closing `>' is reported through found_unclosed_etag.
#
# The argument string is modified in place.
def scan_etag(s)
  s[0, 2] = ''  # remove '</'
  if s.empty?
    return found_empty_etag if @src.close_tag  # </>
    # </< or </[EOF]
    parse_error "parse error at `</'"
    s << '>' if @src.close_tag
    return on_chardata('</' << s)
  end
  if /[ \t\n\r]+/ =~ s
    head, tail = $`, $'
    if head.empty?  # </ tag
      parse_error "parse error at `</'"
      s << '>' if @src.close_tag
      return on_chardata('</' + s)
    end
    unless tail.empty?  # </ta g
      parse_error "illegal whitespace is found within end tag `#{head}'"
      while @src.get_tag
      end
    end
    s = head
  end
  found_unclosed_etag s unless @src.close_tag  # </tag< or </tag[EOF]
  on_etag s
end
710
+
711
+
712
# Reports `<>' as a parse error and passes it on as character data.
def found_empty_stag
  parse_error "parse error at `<'"
  on_chardata('<>')
end

# Reports a start tag that lacks its closing `>', distinguishing whether
# another tag or the end of input follows.
def found_unclosed_stag(name)
  message =
    if @src.tag_start?
      "unclosed start tag `#{name}' meets another tag"
    else
      "unclosed start tag `#{name}' meets EOF"
    end
  parse_error message
end

# Same as found_unclosed_stag, for an empty-element tag.
def found_unclosed_emptyelem(name)
  message =
    if @src.tag_start?
      "unclosed empty element tag `#{name}' meets another tag"
    else
      "unclosed empty element tag `#{name}' meets EOF"
    end
  parse_error message
end
732
+
733
+
734
# Reports a parse error for the unexpected token at the start of +s+
# inside a start tag and returns the text after it, so scanning can
# resume there.  Returns nil when nothing is left.
#
# The offending token is a single `/', `=' or quote character, or
# otherwise everything up to the next whitespace, `/', `=' or quote.
def found_stag_error(s)
  if /\A[\/='"]/ =~ s
    tok = $&
    rest = $'
  elsif /(?=[ \t\n\r\/='"])/ =~ s
    tok = $`
    rest = $'
  else
    tok = s
    rest = nil
  end
  parse_error "parse error at `#{tok}'"
  rest
end
745
+
746
+
747
# Scans a start tag or empty-element tag token (+s+ starts with `<').
#
# Events, in order: on_stag(name); for each attribute on_attribute(key),
# the value pieces via scan_attvalue, then on_attribute_end(key); and
# finally on_stag_end(name) or on_stag_end_empty(name).
#
# An attribute value whose closing quote is not in this token (it
# contained `>' or `<') is continued across the following tokens.  A
# `<' found there is a well-formedness error, and for any token not
# starting with `<' or `>' the removed `>' is fed back into the value.
#
# Malformed input is reported through parse_error and the found_*
# helpers; scanning resumes after the offending token where possible.
#
# The argument string is modified in place.
def scan_stag(s)
  if /(?=[\/ \t\n\r='"])/ =~ s
    name = $`
    s = $'
    name[0, 1] = ''  # remove `<'
    if name.empty?  # `< tag' or `<=`
      parse_error "parse error at `<'"
      s << '>' if @src.close_tag
      return on_chardata('<' << s)
    end
    on_stag name
    emptyelem = false
    qmark = c = nil
    attr_re = /[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
    begin
      continue = false
      s.scan(attr_re) { |key, val, error|
        if key  # key="value"
          on_attribute key
          qmark = val.slice!(0, 1)
          if val[-1] == qmark[0]
            val.chop!
            scan_attvalue val unless val.empty?
          else
            # The closing quote is in a later token.
            scan_attvalue val unless val.empty?
            begin
              s = @src.get
              unless s
                parse_error "unterminated attribute `#{key}' meets EOF"
                break
              end
              c = s[0]
              val, s = s.split(qmark, 2)
              if c == '<'
                wellformed_error "`<' is found in attribute `#{key}'"
              elsif c != '>'
                scan_attvalue '>'
              end
              scan_attvalue val if c
            end until s
            continue = s  # if eof then continue is false, else true.
          end
          on_attribute_end key
        elsif error
          continue = s = found_stag_error(error)
        else
          emptyelem = true
        end
      }
    end while continue
    unless @src.close_tag
      if emptyelem
        found_unclosed_emptyelem name
      else
        found_unclosed_stag name
      end
    end
    if emptyelem
      on_stag_end_empty name
    else
      on_stag_end name
    end
  else
    # No delimiter at all: the token is just `<name'.
    name = s
    name[0, 1] = ''  # remove `<'
    if name.empty?
      return found_empty_stag if @src.close_tag  # <>
      # << or <[EOF]
      parse_error "parse error at `<'"
      return on_chardata('<')
    end
    on_stag name
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  end
end
825
+
826
+
827
# Handles a `<!...' token that is neither a comment nor a CDATA section
# at this point: it is a parse error and the text is passed on as
# character data, with the closing `>' restored when present.
def scan_bang_tag(s)
  parse_error "parse error at `<!'"
  s << '>' if @src.close_tag
  on_chardata(s)
end
832
+
833
+
834
# Main content loop: dispatches every token, starting with +s+, to the
# scanner for its kind (end tag, comment, CDATA section, other `<!'
# construct, PI, start tag or character data) until the source is
# exhausted.
def scan_content(s)
  input = @src  # for speed
  while s
    if s[0] == '<'
      case s[1]
      when '/'
        scan_etag s
      when '!'
        if s[2] == '-' and s[3] == '-'
          scan_comment s
        elsif /\A<!\[CDATA\[/ =~ s
          scan_cdata $'
        else
          scan_bang_tag s
        end
      when '?'
        scan_pi s
      else
        scan_stag s
      end
    else
      scan_chardata s
    end
    s = input.get
  end
end
859
+
860
+
861
# Continues a quoted literal that was cut off at a token boundary.
# Tokens are appended to +str+ (in place) until one contains +qmark+.
# For each token not starting with `<' or `>' the removed `>' is put
# back first.  Returns the text after the closing quote, or nil when the
# input ends before the quote is found.
def get_until_qmark(str, qmark)
  rest = nil
  while true
    rest = @src.get
    break unless rest
    lead = rest[0]
    head, rest = rest.split(qmark, 2)
    str << '>' unless lead == '<' || lead == '>'
    str << head if lead
    break if rest
  end
  rest
end
873
+
874
+
875
+ XMLDeclPattern = opt_regexp(%q{[ \t\n\r]([\-_\d\w]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|(\?\z)|([\-_.\d\w]+|[^ \t\n\r])})
876
+
877
# Scans the XML declaration.  +s+ is the text following `<?xml'.
#
# Events: on_xmldecl, then on_xmldecl_key(key, value) for every
# `key="value"' pair, then on_xmldecl_end.
#
# The pairs are checked against the required order (version, then
# optionally encoding, then optionally standalone) with a small state
# machine: 0 = nothing seen, 1 = version seen, 2 = encoding seen (or an
# error before it), 3 = standalone seen (or an error after encoding).
# A misplaced or unknown key is a parse error but is still reported.
#
# A `?' at the end of a token that is not followed by the closing `>' is
# reported as unexpected, naming the offending `?' in the message.
def scan_xmldecl(s)
  endmark = nil
  state = 0
  on_xmldecl
  begin
    continue = false
    s.scan(XMLDeclPattern[@optkey]) { |key, val, endtok, error|
      if key
        qmark = val.slice!(0, 1)  # remove quotation marks
        if val[-1] == qmark[0]
          val.chop!
        else
          # The value continues in later tokens.
          continue = s = get_until_qmark(val, qmark)
          unless s
            parse_error "unterminated XML declaration meets EOF"
            endmark = true
          end
        end
        newstate = case state
                   when 0 then key == 'version' ? 1 : 4
                   when 1 then key == 'encoding' ? 2 : (key == 'standalone' ? 3 : 4)
                   else key == 'standalone' ? 3 : 4
                   end
        if newstate == 4
          known = %w{version encoding standalone}.member?(key)
          parse_error(known ? "#{key} declaration must not be here" :
                              "unknown declaration `#{key}' in XML declaration")
          state = state < 2 ? 2 : 3
        else
          state = newstate
        end
        on_xmldecl_key key, val
      elsif endtok
        if @src.close_tag
          endmark = true
        else
          parse_error "unexpected `#{endtok}' found in XML declaration"
          endmark = nil
        end
        # endtok only matches at the end of the token, so the scan ends here.
      else
        parse_error "parse error at `#{error}'"
      end
    }
  end while !endmark and (continue || (s = @src.get_plain))
  parse_error "unterminated XML declaration meets EOF" unless s or endmark
  parse_error "no declaration found in XML declaration" if state == 0
  on_xmldecl_end
end
925
+
926
+
927
+ SkipDTD = opt_regexp(%q{(['"]|\A<!--|\A<\?|--\z|\?\z)|\]\s*\z}) #'
928
+
929
# Skips an internal DTD subset without interpreting it, consuming tokens
# until a `]' (optionally followed by whitespace) is directly followed
# by the closing `>'.  While inside a quoted literal, a comment or a PI,
# a `]' is ignored.  Reaching the end of input first is a parse error.
def skip_internal_dtd(s)
  quote = nil
  continue = true
  begin  # skip until `]>'
    s.scan(SkipDTD[@optkey]) { |q,|
      if quote
        # Leave the quoted region: a matching quote character ends it at
        # once; a comment or PI terminator only when the tag closes here.
        quote = nil if quote == q and (quote.size == 1 || @src.tag_end?)
      elsif q
        case q
        when '<!--' then quote = '--'
        when '<?' then quote = '?'
        when '"', "'" then quote = q
        end
      elsif @src.close_tag
        continue = false
      end
    }
  end while continue and s = @src.get
  parse_error "unterminated internal DTD subset meets EOF" unless s
end
951
+
952
+
953
# Internal DTD subsets are not interpreted: a warning is issued and the
# subset is skipped.
def scan_internal_dtd(s)
  warning "internal DTD subset is not supported"
  skip_internal_dtd(s)
end

# Reports a DOCTYPE keyword that is neither `PUBLIC' nor `SYSTEM' and
# returns 'SYSTEM', which the caller uses in its place.
def found_invalid_pubsys(pubsys)
  parse_error "`PUBLIC' or `SYSTEM' should be here"
  'SYSTEM'
end
963
+
964
+
965
+ DoctypePattern = opt_regexp(%q{[ \t\n\r](?:([^ \t\n\r\/'"=\[]+)|('[^']*'?|"[^"]*"?))|([\-_.\d\w]+|[^ \t\n\r])}) #"
966
+
967
# Scans a document type declaration.  +s+ is the text following
# `<!DOCTYPE'.  Reports on_doctype(root, pubid, sysid) and then, when an
# internal subset (`[') was found, hands the rest to scan_internal_dtd.
#
# The pattern is selected with @optkey, the same key every other scanner
# method uses.  It used to be looked up with the never-assigned @opt, so
# the encoding-specific pattern was never chosen.
#
# Parse errors are reported for a keyword other than PUBLIC/SYSTEM, a
# literal before the keyword, too many or too few literals, a missing
# root element name, stray tokens, and end of input before the closing
# `>'.
def scan_doctype(s)
  root = syspub = sysid = pubid = nil
  internal_dtd = false
  re = DoctypePattern[@optkey]
  begin
    if re =~ s then
      name, str, delim, s = $1, $2, $3, $'
      if name then
        if not root then
          root = name
        elsif not syspub then
          unless name == 'PUBLIC' or name == 'SYSTEM' then
            name = found_invalid_pubsys(name)
          end
          syspub = name
        else
          parse_error "parse error at `#{name}'"
        end
      elsif str then
        qmark = str.slice!(0,1)  # remove quotation marks
        unless syspub then
          parse_error "parse error at `#{qmark}'"
          s = str << s
        else
          if str[-1] == qmark[0] then
            str.chop!
          else
            # The literal continues in later tokens.
            s = get_until_qmark(str, qmark) || ''
          end
          if not sysid then
            sysid = str
          elsif not pubid and syspub == 'PUBLIC' then
            # Second literal after PUBLIC: the first was the public ID.
            pubid = sysid
            sysid = str
          else
            parse_error "too many external ID literals in DOCTYPE"
          end
        end
      elsif delim == '[' then
        internal_dtd = true
        break
      else
        parse_error "parse error at `#{delim}'"
      end
    else
      s = ''
    end
    if s.empty? then
      break if @src.close_tag
      s = @src.get_plain
    end
  end while s
  parse_error "unterminated DOCTYPE declaration meets EOF" unless s
  unless root then
    parse_error "no root element is specified in DOCTYPE"
  end
  if syspub and not sysid then
    parse_error "too few external ID literals in DOCTYPE"
  end
  if syspub == 'PUBLIC' and not pubid then
    # Only one literal after PUBLIC: it is the public ID.
    pubid, sysid = sysid, nil
  end
  on_doctype root, pubid, sysid
  scan_internal_dtd s if internal_dtd
end
1032
+
1033
+
1034
# Scans the prolog: an optional XML declaration followed by any mix of
# comments, PIs, whitespace and at most one DOCTYPE declaration.  The
# first token that belongs to none of these starts the content and is
# handed to scan_content.
def scan_prolog(s)
  if /\A<\?xml(?=[ \t\n\r])/ =~ s
    scan_xmldecl $'
    s = @src.get
  end
  doctype_allowed = true
  input = @src  # for speed
  while s
    if s[0] == '<'
      case s[1]
      when '!'
        if s[2] == '-' and s[3] == '-'
          scan_comment s
        elsif /\A<!DOCTYPE(?=[ \t\n\r])/ =~ s and doctype_allowed
          doctype_allowed = false
          scan_doctype $'
        else
          break
        end
      when '?'
        scan_pi s
      else
        break
      end
      s = input.get
    elsif /[^ \t\r\n]/ !~ s
      # Whitespace only.
      on_prolog_space s unless s.empty?
      s = input.get_plain
    else
      break
    end
  end
  scan_content(s || input.get)
end
1067
+
1068
+
1069
# Scans a whole document: on_start_document, the prolog and everything
# after it, then on_end_document.
def scan_document
  on_start_document
  @src.prepare
  first = @src.get
  scan_prolog(first)
  on_end_document
end

# Builds the token source for +src+.  Kept as a separate method so that
# subclasses can substitute another source class.
def make_source(src)
  Source.new(src)
end
1080
+
1081
+
1082
+ public
1083
+
1084
# Parses +src+ as a complete document, sending events to the visitor.
# The source is held in @src only for the duration of the parse and is
# released even when scanning raises.  Returns self.
def parse_document(src)
  @src = make_source(src)
  begin
    scan_document
  ensure
    @src = nil  # never keep a reference to the input after parsing
  end
  self
end
1093
+
1094
+ alias parse parse_document
1095
+
1096
+ end
1097
+
1098
+
1099
+ end
1100
+
1101
+
1102
+
1103
+
1104
+
1105
# Command-line driver: scans the files named on the command line (or
# standard input) and prints the user CPU time spent.  Diagnostics are
# written to STDERR only when $VERBOSE is set.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Time.times is gone from the Ruby versions this file targets;
  # Process.times provides the same utime figure.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end