xmlscan 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/parser.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: parser.rb,v 1.10 2003/01/22 13:06:18 katsu Exp $
8
+ #
9
+
10
+ require 'xmlscan/scanner'
11
+
12
+
13
+ module XMLScan
14
+
15
+ class XMLParser < XMLScanner
16
+
17
# Remembers which attribute names have already been seen in the current
# start tag.  It is a Hash subclass (rather than a wrapper) only for speed.
class AttributeChecker < Hash

  # Registers +name+.
  # Returns true the first time a name is registered and false for every
  # repeat, until the receiver is cleared.
  def check_unique(name)
    return false if key?(name)
    store(name, true)
  end

end
25
+
26
+
27
+ #PredefinedEntity = {
28
+ # 'lt' => '<',
29
+ # 'gt' => '>',
30
+ # 'amp' => '&',
31
+ # 'quot' => '"',
32
+ # 'apos' => "'",
33
+ #}
34
+
35
+
36
# Resets the per-document state -- the open-element stack, the attribute
# uniqueness checker and the standalone flag -- and then passes the
# unchanged argument list on to the inherited parse.
def parse(*)
  @standalone = nil
  @attr = AttributeChecker.new
  @elem = []
  super
end
42
+
43
+
44
+ private
45
+
46
# Issues a warning for any XML version other than '1.0', then forwards the
# version string to the visitor.
def on_xmldecl_version(str)
  warning "unsupported XML version `#{str}'" if str != '1.0'
  @visitor.on_xmldecl_version(str)
end
52
+
53
+
54
# Records the standalone declaration in @standalone ('yes' => true,
# 'no' => false).  Any other value is reported as a parse error and leaves
# @standalone untouched.  The raw string is forwarded to the visitor in
# every case.
def on_xmldecl_standalone(str)
  case str
  when 'yes'
    @standalone = true
  when 'no'
    @standalone = false
  else
    parse_error "standalone declaration must be either `yes' or `no'"
  end
  @visitor.on_xmldecl_standalone(str)
end
64
+
65
+
66
# Reports a parse error when a public ID is present without a system ID,
# then forwards the DOCTYPE information to the visitor unchanged.
def on_doctype(name, pubid, sysid)
  if pubid && !sysid
    parse_error "public external ID must have both public ID and system ID"
  end
  @visitor.on_doctype(name, pubid, sysid)
end
72
+
73
+
74
# Whitespace reported through this hook is dropped: nothing is forwarded
# to the visitor.
def on_prolog_space(s)
end
77
+
78
+
79
# Reports a parse error when the processing-instruction target is "xml" in
# any letter case, then forwards the PI to the visitor regardless.
def on_pi(target, pi)
  parse_error "reserved PI target `#{target}'" if target.downcase == 'xml'
  @visitor.on_pi(target, pi)
end
85
+
86
+
87
+ #def on_entityref(ref)
88
+ # rep = PredefinedEntity[ref]
89
+ # if rep then
90
+ # @visitor.on_chardata rep
91
+ # else
92
+ # @visitor.on_entityref ref
93
+ # end
94
+ #end
95
+
96
+
97
+ #def on_attr_entityref(ref)
98
+ # rep = PredefinedEntity[ref]
99
+ # if rep then
100
+ # @visitor.on_attr_value rep
101
+ # else
102
+ # @visitor.on_attr_entityref ref
103
+ # end
104
+ #end
105
+
106
+
107
+ #def on_charref_hex(code)
108
+ # on_charref code
109
+ #end
110
+
111
+
112
+ #def on_attr_charref_hex(code)
113
+ # on_attr_charref code
114
+ #end
115
+
116
+
117
# Pushes +name+ onto the open-element stack, notifies the visitor, and
# empties the attribute checker so the new tag starts with no seen names.
def on_stag(name)
  @elem << name
  @visitor.on_stag(name)
  @attr.clear
end
122
+
123
# Reports a well-formedness error if +name+ was already registered in the
# attribute checker, then forwards the attribute name to the visitor.
def on_attribute(name)
  wellformed_error "doubled attribute `#{name}'" unless @attr.check_unique(name)
  @visitor.on_attribute(name)
end
129
+
130
# Normalizes an attribute-value fragment by turning every tab, carriage
# return and newline into a single space, then forwards it to the visitor.
#
# str:: attribute-value text; it is not modified.
#
# The normalized text is a new string.  The earlier in-place +tr!+ altered
# the caller's object and raised when handed a frozen string.
def on_attr_value(str)
  @visitor.on_attr_value str.tr("\t\r\n", ' ')
end
134
+
135
# Handles the end of an empty-element tag: the visitor receives
# on_stag_end_empty (not an on_stag_end / on_etag pair), after which the
# element is removed from the open-element stack.
def on_stag_end_empty(name)
  @visitor.on_stag_end_empty(name)
  @elem.pop
end
142
+
143
# Matches an end tag against the innermost open element.
#
# * names agree       -> the visitor gets on_etag(name)
# * names differ      -> well-formedness error; the visitor gets on_etag
#                        for the element that was actually open
# * nothing is open   -> parse error; the visitor is not called
def on_etag(name)
  open_name = @elem.pop
  if open_name == name
    @visitor.on_etag(name)
  elsif open_name
    wellformed_error "element type `#{name}' is not matched"
    @visitor.on_etag(open_name)
  else
    parse_error "end tag `#{name}' appears alone"
  end
end
154
+
155
+
156
+ public
157
+
158
+
159
# Scans the document body while enforcing a single root element.
#
# s:: the first token of the body (nil at end of input)
#
# Each round has three phases:
#   1. everything up to and including the first start tag (other content
#      found here is reported as an error but still scanned),
#   2. the contents, until the open-element stack is empty or input ends
#      (elements still open at that point are reported and closed),
#   3. the epilogue, where only comments, PIs and whitespace are accepted.
# Anything else in the epilogue starts another round with the offending
# token.
def scan_content(s)
  stack = @elem      # local alias, kept for speed
  input = @src       # local alias, kept for speed
  root_seen = false

  while true

    # -- first start tag --
    stack.clear
    stag_seen = false

    while s && !stag_seen
      if s[0] == '<'
        case s[1]
        when '/'
          # should be a parse error
          scan_etag s
        when '!'
          if s[2] == '-' && s[3] == '-'
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            parse_error "CDATA section is found outside of root element"
            scan_cdata $'
          else
            scan_bang_tag s
          end
        when '?'
          scan_pi s
        else
          root_seen = true
          stag_seen = true
          scan_stag s
        end
      else
        parse_error "content of element is found outside of root element"
        scan_chardata s
      end
      s = input.get
    end

    parse_error "no root element was found" unless root_seen || stag_seen

    # -- contents --
    while s && !stack.empty?
      if s[0] == '<'
        case s[1]
        when '/'
          scan_etag s
        when '!'
          if s[2] == '-' && s[3] == '-'
            scan_comment s
          elsif /\A<!\[CDATA\[/n =~ s
            scan_cdata $'
          else
            scan_bang_tag s
          end
        when '?'
          scan_pi s
        else
          scan_stag s
        end
      else
        scan_chardata s
      end
      s = input.get
    end

    # Input ended with elements still open: report and close each of them.
    while (name = stack.pop)
      parse_error "unclosed element `#{name}' meets EOF"
      @visitor.on_etag name
    end

    # -- epilogue --
    finish = true

    while s
      if s[0] == '<'
        case s[1]
        when '/'
          finish = false   # content out of root element
          break
        when '!'
          if s[2] == '-' && s[3] == '-'
            scan_comment s
          else
            finish = false   # content out of root element
            break
          end
        when '?'
          scan_pi s
        else
          parse_error "another root element is found"   # stag
          finish = false
          break
        end
      elsif s.strip.empty?
        on_prolog_space s
      else
        finish = false   # content out of root element
        break
      end
      s = input.get
    end

    break if finish
  end
end
269
+ end
270
+
271
+
272
+ end
273
+
274
+
275
+
276
+
277
+
278
+
279
# Self-test driver: when this file is run directly it parses ARGF and prints
# the user CPU time spent in the parse to STDERR.
if $0 == __FILE__ then
  # Visitor that prints diagnostics, prefixed with path and line number,
  # only when $VERBOSE is set.
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def warning(msg)
      STDERR.printf("%s:%d: warning: %s\n", $s.path,$s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLParser.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Process.times replaces Time.times, which does not exist on the Ruby
  # versions this file otherwise targets.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
@@ -0,0 +1,1123 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # xmlscan/scanner.rb
4
+ #
5
+ # Copyright (C) Ueno Katsuhiro 2002
6
+ #
7
+ # $Id: scanner.rb,v 1.75.2.3 2003/05/01 15:43:23 katsu Exp $
8
+ #
9
+
10
+ #
11
+ # CONSIDERATIONS FOR CHARACTER ENCODINGS:
12
+ #
13
+ # There are the following common characteristics in character encodings
14
+ # which are supported by Ruby's $KCODE feature (ISO-8859-*, Shift_JIS,
15
+ # EUC, and UTF-8):
16
+ #
17
+ # - Stateless.
18
+ # - ASCII characters are encoded in the same manner as US-ASCII.
19
+ # - The octet sequences corresponding to non-ASCII characters begin
20
+ # with an octet greater than 0x80.
21
+ # - The following characters can be identified by just one octet.
22
+ # That is, every octet corresponding to the following characters in
23
+ # US-ASCII never appear as a part of an octet sequence representing a
24
+ # non-ASCII character.
25
+ #
26
+ # Whitespaces("\t", "\n", "\r", and " ") and
27
+ # ! \ " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
28
+ #
29
+ # Be careful that `[' and `]' are NOT included in the list!
30
+ #
31
+ # If we build a regular expression carefully in accordance with these
32
+ # characteristics, we can get the same match regardless of the value
33
+ # of $KCODE. Moreover, if it can be premised on them, we can detect
34
+ # several delimiters without regular expressions. XMLScanner uses this
35
+ # fact in order to share many regular expressions in all $KCODE modes,
36
+ # and in order to optimize parsing speed.
37
+ #
38
+
39
+ require 'xmlscan/visitor'
40
+
41
+
42
+ module XMLScan
43
+
44
# Adapter that gives an arbitrary source object the three methods the
# scanner needs: gets, lineno and path.
#
# * A source that responds to gets is read through its own gets.
# * Otherwise a source that responds to to_ary is served one element per
#   gets call, and lineno is the number of elements served so far.
# * Any other source is returned whole by the first gets and nil afterwards.
#
# lineno and path delegate to the source when it provides them and fall
# back to 0 and '-' otherwise.
class Input

  attr_reader :src

  def initialize(src)
    @src = src
    if !src.respond_to?(:gets)
      if src.respond_to?(:to_ary)
        @v = src.to_ary
        @n = -1
        def self.gets ; @v.at(@n += 1) ; end
        def self.lineno ; @n + 1 ; end
      else
        @v = @src
        def self.gets ; s = @v ; @v = nil ; s ; end
      end
    end
    def self.lineno ; @src.lineno ; end if src.respond_to?(:lineno)
    def self.path ; @src.path ; end if src.respond_to?(:path)
  end

  # Defaults; initialize may replace them with singleton methods.
  def gets ; @src.gets ; end
  def lineno ; 0 ; end
  def path ; '-' ; end

  # Returns +src+ itself when it already responds to gets, lineno and path,
  # and an Input wrapping it otherwise.
  def self.wrap(src)
    complete = src.respond_to?(:gets) &&
               src.respond_to?(:lineno) &&
               src.respond_to?(:path)
    complete ? src : new(src)
  end

  # Inverse of wrap: yields the wrapped object for an Input and +obj+
  # itself for anything else.
  def self.unwrap(obj)
    self === obj ? obj.src : obj
  end

end
90
+
91
+
92
+
93
# An Array whose inherited public interface is private: every instance
# method of Array that is not also a Kernel instance method can be called
# only from inside the object.
class PrivateArray < Array
  private(*(superclass.instance_methods - Kernel.instance_methods))
end
97
+
98
+
99
+ class Source < PrivateArray
100
+ # Source inherits Array only for speed.
101
+
102
# Creates an empty token buffer that reads from +src+, wrapped through
# Input.wrap.
def initialize(src)
  super()
  @last = nil
  @eof = false
  @src = Input.wrap(src)
end

# The object originally handed to new, with any Input wrapper removed.
def source
  Input.unwrap(@src)
end
112
+
113
+
114
# True once the input has reported its end and no buffered token is left.
def eof?
  @eof && empty?
end

# Discards all buffered tokens and marks the input as finished.
# Returns self.
def abort
  clear
  @last = nil
  @eof = true
  self
end
124
+
125
+
126
# Returns the next token, or nil when the input is exhausted.
#
# Buffered tokens are served first.  When the buffer is empty, lines are
# read from the input and split so that a token starting a tag begins with
# `<' while the `>' delimiters themselves are dropped.  The final piece of
# each line is held back in @last, because the next line may continue it;
# it is released once more input (or the end of input) arrives.
def get
  ready = pop
  return ready if ready
  return nil if @eof

  pending = @last
  while true
    line = @src.gets
    unless line
      # End of input: release whatever was held back.
      @eof = true
      unshift pending
      pending = nil
      break
    end
    pieces = line.split(/(?=<|>[<>])|>/, -1)
    if pending
      if /\A[<>]/ =~ pieces.first
        push pending
      else
        # The new line continues the piece held back from the previous one.
        pieces[0] = pending << (pieces.first || '')
      end
    end
    concat pieces
    pending = pop
    break unless empty?
  end
  @last = pending
  reverse!      # tokens are consumed with pop, so store them back to front
  pop
end
154
+
155
+
156
# Primes the buffer with the first token.  If that token is empty, the one
# after it is taken instead and prefixed with `>' (this preserves a leading
# `>').  The resulting token, if any, is pushed back so that the next get
# returns it.
def prepare
  first = get
  if first && first.empty?
    first = get
    first = '>' << first if first
  end
  push first if first
end
161
+
162
+
163
# Truthy when a next token exists and does not begin with `<'.
# Returns nil when there is no next token.
def tag_end?
  upcoming = last || @last
  upcoming && upcoming[0] != ?<
end

# Truthy when a next token exists and begins with `<'.
# Returns nil when there is no next token.
def tag_start?
  upcoming = last || @last
  upcoming && upcoming[0] == ?<
end
170
+
171
# Like tag_end?, but also consumes the `>'.
#
# Returns false when there is no next token or it begins with `<'.
# Otherwise returns true; if the next token is exactly '>' or empty it is
# taken off the buffer first, and any further '>'/empty tokens behind it
# are rearranged as coded below.
def close_tag
  upcoming = last || @last
  return false unless upcoming && upcoming[0] != ?<

  if upcoming == '>' || upcoming.empty?
    taken = get
    upcoming = last || @last
    unless upcoming && upcoming[0] == ?<   # for speed up
      out = [ taken ]
      out.push get while (upcoming = last || @last) && (upcoming == '>' || upcoming.empty?)
      out.pop unless upcoming && upcoming[0] != ?<
      concat out
    end
  end
  true
end
187
+
188
+
189
# Takes the next token only if it does not begin with `<'.
# Returns nil when there is no next token and false when it begins with `<'.
def get_text # get until tag_start?
  upcoming = last || @last
  upcoming && upcoming[0] != ?< && get
end

# Takes the next token only if it begins with `<'.
# Returns nil when there is no next token and false otherwise.
def get_tag # get until tag_end?
  upcoming = last || @last
  upcoming && upcoming[0] == ?< && get
end

# Takes the next token and, unless it begins with `<' or `>', puts back the
# `>' that tokenizing removed.  Returns nil at end of input.
def get_plain
  piece = get
  return piece unless piece
  lead = piece[0]
  return piece if lead == ?< || lead == ?>
  '>' << piece
end
202
+
203
# Line number as reported by the underlying input.
def lineno
  @src.lineno
end

# Path as reported by the underlying input.
def path
  @src.path
end
210
+
211
+
212
+ # The following methods are for debug.
213
+
214
# Debugging aid.  Renders the buffered tokens in consumption order, then the
# held-back token, then the source.  A token that does not begin with `<'
# or `>' is preceded by a ">" marker, and '#eof' is appended to the first
# list once the input has ended.
def inspect
  queued = []
  reverse_each { |tok|
    queued.push ">" unless /\A[<>]/ =~ tok
    queued.push tok.inspect
  }
  held = []
  if @last
    held.push ">" unless /\A[<>]/ =~ @last
    held.push @last.inspect
  end
  queued.push '#eof' if @eof
  "((#{queued.join(' ')}) (#{held.join(' ')}) . #{source.inspect})"
end
228
+
229
# Debugging aid: primes the buffer, then yields every remaining token in
# order.  Returns self.
def each
  prepare
  while (tok = get)
    yield tok
  end
  self
end

# Debugging aid: returns the next token without consuming it, fetching one
# from the input (and pushing it back) if nothing is buffered.
def test
  peeked = last || @last
  return peeked if peeked
  fetched = get
  fetched && push(fetched) && fetched
end
240
+
241
+ end
242
+
243
+
244
+
245
+ class XMLScanner
246
+
247
class << self

  # Names of the options this class can apply: one entry per private
  # instance method called apply_option_<name>.
  def provided_options
    private_instance_methods.each_with_object([]) { |meth, found|
      found.push $' if /\Aapply_option_/ =~ meth
    }
  end

  # Applies a single option to +instance+ by calling its
  # apply_option_<option> method.
  def apply_option(instance, option)
    instance.__send__ "apply_option_#{option}"
  end

  # Applies every requested option to +instance+, walking the ancestors so
  # that each class applies the options it provides.  Each option is applied
  # at most once.  Raises ArgumentError for an option nobody provides.
  # Returns +instance+.
  def apply_options(instance, options)
    pending = {}
    options.each { |opt| pending[opt.to_s] = true }
    ancestors.each { |klass|
      next unless klass.respond_to? :provided_options
      klass.provided_options.each { |opt|
        next unless pending.include? opt
        pending.delete opt
        klass.apply_option instance, opt
      }
    }
    unless pending.empty?
      raise ArgumentError, "undefined option `#{pending.keys[0]}'"
    end
    instance
  end
  private :apply_options

  # Builds a scanner for +visitor+ and applies the given options to it.
  def new(visitor, *options)
    apply_options super(visitor), options
  end

end
288
+
289
+
290
+
291
# Stores the visitor that will receive scan events.  No decoration, input
# source or option key is set initially.
def initialize(visitor)
  @optkey = nil
  @src = nil
  @decoration = nil
  @visitor = visitor
end
297
+
298
+ attr_accessor :optkey
299
+
300
# Looks up the entry for the current optkey in OptRegexp::RE_ENCODINGS
# (nil when optkey is not one of its keys).
def opt_encoding
  OptRegexp::RE_ENCODINGS[optkey]
end
301
+
302
+
303
# Wraps the visitor in a Decoration on first use (the Decoration then
# replaces @visitor), and expands it with +decoration+.
def decorate(decoration)
  @visitor = @decoration = Decoration.new(@visitor) unless @decoration
  @decoration.expand(decoration)
end
private :decorate
310
+
311
+
312
# Current line number of the input, or nil when no source is set.
def lineno
  current = @src
  current && current.lineno
end

# Path of the input, or nil when no source is set.
def path
  current = @src
  current && current.path
end
319
+
320
# The object currently being parsed, or nil when no source is set.
#
# Guarded the same way as lineno and path, so a call made when @src is nil
# returns nil instead of raising NoMethodError.
def source
  @src && @src.source
end
323
+
324
+
325
+ private
326
+
327
# Diagnostics.  Each of these forwards its message to the visitor method of
# the same name.

def parse_error(msg)
  @visitor.parse_error(msg)
end

def wellformed_error(msg)
  @visitor.wellformed_error(msg)
end

def valid_error(msg)
  @visitor.valid_error(msg)
end

def warning(msg)
  @visitor.warning(msg)
end


# Forwards the start of the XML declaration to the visitor.
def on_xmldecl
  @visitor.on_xmldecl()
end
347
+
348
# Dispatches one pseudo-attribute of the XML declaration.
#
# key:: pseudo-attribute name exactly as written in the document
# str:: its value, without the quotation marks
#
# The specific handler on_xmldecl_<key> is used only when the visitor
# responds to it AND this scanner has a method of that name that accepts
# exactly one argument.  Everything else goes to on_xmldecl_other.
#
# The extra checks are needed because +key+ comes from the document: names
# such as `end', `key' or `other' would otherwise select on_xmldecl_end,
# on_xmldecl_key or on_xmldecl_other and fail with ArgumentError.
def on_xmldecl_key(key, str)
  meth = "on_xmldecl_#{key}"
  if @visitor.respond_to?(meth) && respond_to?(meth, true) &&
     [1, -1, -2].include?(method(meth).arity)
    send meth, str
  else
    on_xmldecl_other key, str
  end
end
356
+
357
# Event hooks.  Each hook below forwards its arguments unchanged to the
# visitor method of the same name.

# -- XML declaration --

def on_xmldecl_version(str)
  @visitor.on_xmldecl_version(str)
end

def on_xmldecl_encoding(str)
  @visitor.on_xmldecl_encoding(str)
end

def on_xmldecl_standalone(str)
  @visitor.on_xmldecl_standalone(str)
end

def on_xmldecl_other(name, value)
  @visitor.on_xmldecl_other(name, value)
end

def on_xmldecl_end
  @visitor.on_xmldecl_end()
end

# -- prolog --

def on_doctype(root, pubid, sysid)
  @visitor.on_doctype(root, pubid, sysid)
end

def on_prolog_space(str)
  @visitor.on_prolog_space(str)
end

# -- markup and character data --

def on_comment(str)
  @visitor.on_comment(str)
end

def on_pi(target, pi)
  @visitor.on_pi(target, pi)
end

def on_chardata(str)
  @visitor.on_chardata(str)
end

def on_cdata(str)
  @visitor.on_cdata(str)
end

def on_etag(name)
  @visitor.on_etag(name)
end

# -- references --

def on_entityref(ref)
  @visitor.on_entityref(ref)
end

def on_charref(code)
  @visitor.on_charref(code)
end

def on_charref_hex(code)
  @visitor.on_charref_hex(code)
end

# -- document boundaries --

def on_start_document
  @visitor.on_start_document()
end

def on_end_document
  @visitor.on_end_document()
end
424
+
425
+
426
+ # <hoge fuga="foo&bar;&#38;&#x26;foo" />HOGE
427
+ # ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
428
+ # 1 2 3 4 5 6 7 8 9 A
429
+ #
430
+ # The following method will be called with the following arguments
431
+ # when the parser reaches the above point;
432
+ #
433
+ # 1: on_stag ('hoge')
434
+ # 2: on_attribute ('fuga')
435
+ # 3: on_attr_value ('foo')
436
+ # 4: on_attr_entityref ('bar')
437
+ # 5: on_attr_charref (38)
438
+ # 6: on_attr_charref_hex (38)
439
+ # 7: on_attr_value ('foo')
440
+ # 8: on_attribute_end ('fuga')
441
+ # 9: on_stag_end_empty ('hoge')
442
+ # or
443
+ # on_stag_end ('hoge')
444
+ #
445
+ # A: on_chardata ('HOGE')
446
+
447
# Start-tag event hooks.  Each hook forwards its argument unchanged to the
# visitor method of the same name.

def on_stag(name)
  @visitor.on_stag(name)
end

def on_attribute(name)
  @visitor.on_attribute(name)
end

def on_attr_value(str)
  @visitor.on_attr_value(str)
end

def on_attr_entityref(ref)
  @visitor.on_attr_entityref(ref)
end

def on_attr_charref(code)
  @visitor.on_attr_charref(code)
end

def on_attr_charref_hex(code)
  @visitor.on_attr_charref_hex(code)
end

def on_attribute_end(name)
  @visitor.on_attribute_end(name)
end

def on_stag_end_empty(name)
  @visitor.on_stag_end_empty(name)
end

def on_stag_end(name)
  @visitor.on_stag_end(name)
end
482
+
483
+
484
+
485
+ private
486
+
487
# Builds one compiled Regexp per regexp encoding option (:n, :e, :s, :u), so
# that a pattern can be picked by the scanner's optkey.
module OptRegexp
  UTFSTR = "é"
  S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
  E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')

  # Encoding of a regexp literal compiled with each option.
  RE_ENCODINGS = {
    :n => /e/n.encoding,
    :e => /#{E_OPT_EXAMPLE}/e.encoding,
    :s => /#{S_OPT_EXAMPLE}/s.encoding,
    :u => /#{UTFSTR}/u.encoding
  }

  # Option bits of a regexp literal compiled with each option.
  RE_ENCODING_OPTIONS = {
    :n => /e/n.options,
    :e => /#{E_OPT_EXAMPLE}/e.options,
    :s => /#{S_OPT_EXAMPLE}/s.options,
    :u => /#{UTFSTR}/u.options
  }

  private

  # Returns a Hash mapping each option key to +re+ compiled for that
  # encoding.  Any other key (nil included) yields +re+ compiled without
  # options, via the Hash default.
  def opt_regexp(re)
    table = RE_ENCODING_OPTIONS.each_with_object({}) { |(key, opt), acc|
      acc[key] = Regexp.new(re.encode(RE_ENCODINGS[key]), opt)
    }
    table.default = Regexp.new(re)
    table
  end
end
516
+ extend OptRegexp
517
+
518
+
519
+ InvalidEntityRef = opt_regexp('(?=[^#\d\w]|\z)')
520
+
521
# Scans character data beginning with +s+, emitting on_chardata for plain
# text and on_entityref / on_charref / on_charref_hex for references.  It
# keeps taking text tokens from the source until none follows; every
# further token other than a bare '>' gets its `>' put back.
#
# A `&' that does not start a reference is reported and passed through as
# text.  A reference without a closing `;' is reported, and whatever
# precedes the first invalid character is used as the reference.
def scan_chardata(s)
  while true
    if /&/ =~ s
      head = $`
      on_chardata head unless head.empty?
      ref = nil
      $'.split('&', -1).each { |part|
        # A reference is terminated by a `;' that is not the first char.
        terminated = (/(?!\A);|(?=[ \t\r\n])/ =~ part) && !$&.empty?
        unless terminated
          if (InvalidEntityRef[@optkey] =~ part) && !(ref = $`).strip.empty?
            parse_error "reference to `#{ref}' doesn't end with `;'"
          else
            parse_error "`&' is not used for entity/character references"
            on_chardata('&' << part)
            next
          end
        end
        # $` and $' belong to whichever of the two matches above ran last.
        ref = $`
        rest = $'
        case ref
        when /\A[^#]/
          on_entityref ref
        when /\A#(\d+)\z/
          on_charref $1.to_i
        when /\A#x([\dA-Fa-f]+)\z/
          on_charref_hex $1.hex
        else
          parse_error "invalid character reference `#{ref}'"
        end
        on_chardata rest unless rest.empty?
      }
    else
      on_chardata s
    end
    s = @src.get_text
    break unless s
    s = '>' << s unless s == '>'
  end
end
559
+
560
+
561
# Scans one piece of an attribute value, emitting on_attr_value for plain
# text and on_attr_entityref / on_attr_charref / on_attr_charref_hex for
# references.  The reference handling mirrors scan_chardata, without the
# loop over following tokens.
def scan_attvalue(s)
  if /&/ =~ s
    head = $`
    on_attr_value head unless head.empty?
    ref = nil
    $'.split('&', -1).each { |part|
      # A reference is terminated by a `;' that is not the first char.
      terminated = (/(?!\A);|(?=[ \t\r\n])/ =~ part) && !$&.empty?
      unless terminated
        if (InvalidEntityRef[@optkey] =~ part) && !(ref = $`).strip.empty?
          parse_error "reference to `#{ref}' doesn't end with `;'"
        else
          parse_error "`&' is not used for entity/character references"
          on_attr_value('&' << part)
          next
        end
      end
      # $` and $' belong to whichever of the two matches above ran last.
      ref = $`
      rest = $'
      case ref
      when /\A[^#]/
        on_attr_entityref ref
      when /\A#(\d+)\z/
        on_attr_charref $1.to_i
      when /\A#x([\dA-Fa-f]+)\z/
        on_attr_charref_hex $1.hex
      else
        parse_error "invalid character reference `#{ref}'"
      end
      on_attr_value rest unless rest.empty?
    }
  else
    on_attr_value s
  end
end
594
+
595
+
596
# Scans a comment whose first token +s+ starts with `<!--' and reports its
# text through on_comment.  +s+ is modified in place.
#
# Reported as parse errors: end of input before the terminator (the text
# collected so far is still delivered), a `--' inside the comment, and a
# comment ending in `--->'.
def scan_comment(s)
  s.slice!(0, 4)   # remove `<!--'
  comm = ''
  until /--/ =~ s
    comm << s
    s = @src.get_plain
    unless s
      parse_error "unterminated comment meets EOF"
      return on_comment(comm)
    end
  end
  comm << $`
  # The comment ends only if nothing follows the `--' and a `>' comes next.
  until (s = $').empty? && @src.close_tag
    if s == '-' && @src.close_tag   # --->
      parse_error "comment ending in `--->' is not allowed"
      comm << s
      break
    end
    parse_error "comment includes `--'"
    comm << '--'
    until /--/ =~ s   # same search as above, repeated inline for speed
      comm << s
      s = @src.get_plain
      unless s
        parse_error "unterminated comment meets EOF"
        return on_comment(comm)
      end
    end
    comm << $`
  end
  on_comment comm
end
628
+
629
+
630
# Scans a processing instruction from the token +s+ (which starts with
# `<?') and reports it through on_pi(target, content).
#
# If no target can be read, a parse error is reported and the token is
# delivered as character data.  If the input ends before `?>', a parse
# error is reported and the content collected so far is delivered.
def scan_pi(s)
  if /\A<\?([^ \t\n\r?]+)(?:[ \t\n\r]+|(?=\?\z))/ =~ s
    target = $1
    pi = $'
    # The PI ends when the text so far ends with `?' and a `>' follows.
    until pi[-1] == ?? && @src.close_tag
      s = @src.get_plain
      unless s
        parse_error "unterminated PI meets EOF"
        return on_pi(target, pi)
      end
      pi << s
    end
    pi.chop!   # remove last `?'
    on_pi target, pi
  else
    parse_error "parse error at `<?'"
    s << '>' if @src.close_tag
    on_chardata s
  end
end
650
+
651
+
652
+ CDATAPattern = opt_regexp('\]\]\z')
653
+
654
# Scans a CDATA section.  +s+ is the text after `<![CDATA['; it is extended
# in place until it ends with `]]' and a `>' follows, then delivered
# through on_cdata without the closing `]]'.
#
# If the input ends first, a parse error is reported and the text collected
# so far is delivered as is.
def scan_cdata(s)
  cdata = s
  terminator = CDATAPattern[@optkey]
  until terminator =~ cdata && @src.close_tag
    s = @src.get_plain
    unless s
      parse_error "unterminated CDATA section meets EOF"
      return on_cdata(cdata)
    end
    cdata << s
  end
  2.times { cdata.chop! }   # remove ']]'
  on_cdata cdata
end
668
+
669
+
670
# Reports an end tag that is not closed by `>'.  The message depends on
# whether another tag follows (otherwise end of input is assumed).
def found_unclosed_etag(name)
  reason = @src.tag_start? ? "another tag" : "EOF"
  parse_error "unclosed end tag `#{name}' meets #{reason}"
end

# Reports the empty end tag `</>' and delivers it as character data.
def found_empty_etag
  parse_error "parse error at `</'"
  on_chardata '</>'
end
682
+
683
+
684
# Scans an end tag from the token +s+ (which starts with `</') and reports
# it through on_etag.  +s+ is modified in place.
#
# `</>', `</' alone and `</ name' are reported as parse errors and delivered
# as character data.  Whitespace followed by more text inside the tag is
# reported, and the rest of the tag is skipped.  A missing `>' is reported
# via found_unclosed_etag, but on_etag is still called.
def scan_etag(s)
  s.slice!(0, 2)   # remove '</'
  if s.empty?
    return found_empty_etag if @src.close_tag   # </>
    # </< or </[EOF]
    parse_error "parse error at `</'"
    s << '>' if @src.close_tag
    return on_chardata('</' << s)
  elsif /[ \t\n\r]+/ =~ s
    before, after = $`, $'
    if before.empty?        # </ tag
      parse_error "parse error at `</'"
      s << '>' if @src.close_tag
      return on_chardata('</' + s)
    elsif !after.empty?     # </ta g
      parse_error "illegal whitespace is found within end tag `#{before}'"
      nil while @src.get_tag
    end
    s = before
  end
  found_unclosed_etag s unless @src.close_tag   # </tag< or </tag[EOF]
  on_etag s
end
710
+
711
+
712
# Reports the empty tag `<>' and delivers it as character data.
def found_empty_stag
  parse_error "parse error at `<'"
  on_chardata '<>'
end

# Reports a start tag that is not closed by `>'.  The message depends on
# whether another tag follows (otherwise end of input is assumed).
def found_unclosed_stag(name)
  reason = @src.tag_start? ? "another tag" : "EOF"
  parse_error "unclosed start tag `#{name}' meets #{reason}"
end

# Same as found_unclosed_stag, for an empty-element tag.
def found_unclosed_emptyelem(name)
  reason = @src.tag_start? ? "another tag" : "EOF"
  parse_error "unclosed empty element tag `#{name}' meets #{reason}"
end
732
+
733
+
734
# Reports unexpected text inside a start tag.
#
# The offending token is a single leading `/', `=' or quote character, or
# else everything up to the next whitespace, `/', `=' or quote.  Returns the
# text after that token so scanning can resume there, or nil when the token
# was the whole of +s+.
def found_stag_error(s)
  if /\A[\/='"]/ =~ s
    tok, rest = $&, $'
  elsif /(?=[ \t\n\r\/='"])/ =~ s
    tok, rest = $`, $'
  else
    tok, rest = s, nil
  end
  parse_error "parse error at `#{tok}'"
  rest
end
745
+
746
+
747
# Scans a start tag (or empty-element tag) from the token +s+, which starts
# with `<'.  Events are emitted in this order: on_stag, then for each
# attribute on_attribute / value events via scan_attvalue /
# on_attribute_end, and finally on_stag_end or on_stag_end_empty.
#
# An attribute value that contains `>' or `<' spans several tokens.  The
# following tokens are read until the closing quotation mark is found, and
# scanning of the tag resumes with the text after it.
def scan_stag(s)
  if /(?=[\/ \t\n\r='"])/ =~ s
    name = $`
    s = $'
    name[0,1] = ''     # remove `<'
    if name.empty?     # `< tag' or `<=`
      parse_error "parse error at `<'"
      s << '>' if @src.close_tag
      return on_chardata('<' << s)
    end
    on_stag name
    emptyelem = false
    qmark = c = nil
    begin
      continue = false
      s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
             ) { |attr_name, value, junk|
        if attr_name       # key="value"
          on_attribute attr_name
          qmark = value.slice!(0,1)
          if value[-1] == qmark[0]
            value.chop!
            scan_attvalue value unless value.empty?
          else
            # The closing quote is not in this token: keep reading tokens.
            scan_attvalue value unless value.empty?
            while true
              s = @src.get
              unless s
                parse_error "unterminated attribute `#{attr_name}' meets EOF"
                break
              end
              c = s[0]
              value, s = s.split(qmark, 2)
              if c == ?<
                wellformed_error "`<' is found in attribute `#{attr_name}'"
              elsif c != ?>
                scan_attvalue '>'
              end
              scan_attvalue value if c
              break if s
            end
            continue = s  # if eof then continue is false, else true.
          end
          on_attribute_end attr_name
        elsif junk
          continue = s = found_stag_error(junk)
        else
          emptyelem = true
        end
      }
    end while continue
    unless @src.close_tag
      if emptyelem
        found_unclosed_emptyelem name
      else
        found_unclosed_stag name
      end
    end
    if emptyelem
      on_stag_end_empty name
    else
      on_stag_end name
    end
  else
    # No attribute or `/' in the token: it is just `<name'.
    name = s
    name[0,1] = ''     # remove `<'
    if name.empty?
      return found_empty_stag if @src.close_tag   # <>
      # << or <[EOF]
      parse_error "parse error at `<'"
      return on_chardata('<')
    end
    on_stag name
    found_unclosed_stag name unless @src.close_tag
    on_stag_end name
  end
end
825
+
826
+
827
# Handles a `<!...' token that is neither a comment nor a CDATA section:
# reports a parse error and delivers the token as character data, with the
# closing `>' restored when one follows.  +s+ is modified in place.
def scan_bang_tag(s)
  parse_error "parse error at `<!'"
  s << '>' if @src.close_tag
  on_chardata(s)
end
832
+
833
+
834
# Scans tokens starting with +s+ until the source is exhausted, handing each
# one to the scanner for its kind: end tag, comment, CDATA section, other
# `<!' construct, PI, start tag, or character data.
def scan_content(s)
  input = @src   # local alias, kept for speed
  while s
    if s[0] == '<'
      case s[1]
      when '/'
        scan_etag s
      when '!'
        if s[2] == '-' && s[3] == '-'
          scan_comment s
        elsif /\A<!\[CDATA\[/ =~ s
          scan_cdata $'
        else
          scan_bang_tag s
        end
      when '?'
        scan_pi s
      else
        scan_stag s
      end
    else
      scan_chardata s
    end
    s = input.get
  end
end
859
+
860
+
861
# Continues a quoted literal that runs across several tokens.
#
# str::   the literal collected so far; extended in place
# qmark:: the quotation mark that ends the literal
#
# Tokens are read until one contains +qmark+.  The text before the mark is
# appended to +str+, preceded by a `>' for every token that begins with
# neither `<' nor `>'.  Returns the text after the mark, or nil if the
# input ended first.
def get_until_qmark(str, qmark)
  while true
    #s = @src.get_plain
    rest = @src.get
    break unless rest
    lead = rest[0]
    part, rest = rest.split(qmark, 2)
    str << '>' unless lead == ?< || lead == ?>
    str << part if lead
    break if rest
  end
  rest
end
873
+
874
+
875
+ XMLDeclPattern = opt_regexp(%q{[ \t\n\r]([\-_\d\w]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|(\?\z)|([\-_.\d\w]+|[^ \t\n\r])})
876
+
877
# Scans the XML declaration.  +s+ is the text following `<?xml'.
#
# Emits on_xmldecl, then on_xmldecl_key for every pseudo-attribute, then
# on_xmldecl_end.  A small state machine checks the order version ->
# encoding -> standalone.  Misplaced or unknown pseudo-attributes are
# reported but still passed to on_xmldecl_key.
#
# state: 0 = nothing seen yet, 1 = version seen, 2 = encoding seen (or
# skipped over by an error), 3 = standalone seen; 4 is used transiently for
# "not allowed here".
def scan_xmldecl(s)
  endmark = nil
  state = 0
  on_xmldecl
  begin
    continue = false
    s.scan(XMLDeclPattern[@optkey]) { |key,val,endtok,error|
      if key then
        qmark = val.slice!(0,1) # remove quotation marks
        if val[-1] == qmark[0] then
          val.chop!
        else
          # The value runs into the following tokens.
          continue = s = get_until_qmark(val, qmark)
          unless s then
            parse_error "unterminated XML declaration meets EOF"
            endmark = true
          end
        end
        newstate = case state
                   when 0; key == 'version' ? 1 : 4
                   when 1; key == 'encoding' ? 2 : key == 'standalone' ? 3 : 4
                   else key == 'standalone' ? 3 : 4
                   end
        state = if newstate == 4
                  known = %w{version encoding standalone}.member?(key)
                  parse_error known ? "#{key} declaration must not be here" :
                    "unknown declaration `#{key}' in XML declaration"
                  state < 2 ? 2 : 3
                else
                  newstate
                end
        on_xmldecl_key key, val
      elsif endtok then
        # A `?' ends the declaration only when a `>' follows it.
        endmark = if @src.close_tag
                    true
                  else
                    # Name the token actually found.  This used to
                    # interpolate endmark, which is always nil here.
                    parse_error "unexpected `#{endtok}' found in XML declaration"
                    nil
                  end
        # here always exit the loop.
      else
        parse_error "parse error at `#{error}'"
      end
    }
  end while !endmark and continue || s = @src.get_plain
  parse_error "unterminated XML declaration meets EOF" unless s or endmark
  parse_error "no declaration found in XML declaration" if state == 0
  on_xmldecl_end
end
925
+
926
+
927
+ SkipDTD = opt_regexp(%q{(['"]|\A<!--|\A<\?|--\z|\?\z)|\]\s*\z}) #'
928
+
929
# Skips an internal DTD subset without interpreting it, reading tokens until
# the closing `]>' is found outside quoted literals, comments and PIs.
# Reports a parse error if the input ends first.
def skip_internal_dtd(s)
  quote = nil       # terminator of the construct being skipped, if any
  continue = true
  while true  # skip until `]>'
    s.scan(SkipDTD[@optkey]) { |captures| #'
      q = captures.first
      if quote
        # A one-character terminator (a quotation mark) ends the construct
        # wherever it occurs; `--' and `?' also need a following `>'.
        quote = nil if quote == q && (quote.size == 1 || @src.tag_end?)
      elsif q
        case q
        when '<!--'
          quote = '--'
        when '<?'
          quote = '?'
        when '"', "'"
          quote = q
        end
      elsif @src.close_tag
        continue = false
      end
    }
    break unless continue && (s = @src.get)
  end
  parse_error "unterminated internal DTD subset meets EOF" unless s
end
951
+
952
+
953
# Internal DTD subsets are not interpreted: a warning is issued and the
# subset is skipped.
def scan_internal_dtd(s)
  warning "internal DTD subset is not supported"
  skip_internal_dtd(s)
end


# Reports a DOCTYPE keyword that is neither PUBLIC nor SYSTEM.  Returns
# 'SYSTEM', which the caller uses in its place.
def found_invalid_pubsys(pubsys)
  parse_error "`PUBLIC' or `SYSTEM' should be here"
  'SYSTEM'
end
963
+
964
+
965
+ DoctypePattern = opt_regexp(%q{[ \t\n\r](?:([^ \t\n\r\/'"=\[]+)|('[^']*'?|"[^"]*"?))|([\-_.\d\w]+|[^ \t\n\r])}) #"
966
+
967
# Scans a DOCTYPE declaration.  +s+ is the text following `<!DOCTYPE'.
#
# Collects the root element name, the PUBLIC/SYSTEM keyword and up to two
# quoted literals, reports structural problems as parse errors, and calls
# on_doctype(root, pubid, sysid).  If a `[' is found, the internal subset
# is then passed to scan_internal_dtd.
#
# The pattern is selected with @optkey, as every other scan method does.
# This used to read @opt, which is never assigned, so the default pattern
# was used for every optkey.
def scan_doctype(s)
  root = syspub = sysid = pubid = nil
  internal_dtd = false
  re = DoctypePattern[@optkey]
  begin
    if re =~ s then
      name, str, delim, s = $1, $2, $3, $'
      if name then
        if not root then
          root = name
        elsif not syspub then
          unless name == 'PUBLIC' or name == 'SYSTEM' then
            name = found_invalid_pubsys(name)
          end
          syspub = name
        else
          parse_error "parse error at `#{name}'"
        end
      elsif str then
        qmark = str.slice!(0,1) # remove quotation marks
        unless syspub then
          # A literal before PUBLIC/SYSTEM: report it and rescan its text.
          parse_error "parse error at `#{qmark}'"
          s = str << s
        else
          if str[-1] == qmark[0] then
            str.chop!
          else
            # The literal runs into the following tokens.
            s = get_until_qmark(str, qmark) || ''
          end
          if not sysid then
            sysid = str
          elsif not pubid and syspub == 'PUBLIC' then
            # Second literal after PUBLIC: the first one was the public ID.
            pubid = sysid
            sysid = str
          else
            parse_error "too many external ID literals in DOCTYPE"
          end
        end
      elsif delim == '[' then
        internal_dtd = true
        break
      else
        parse_error "parse error at `#{delim}'"
      end
    else
      s = ''
    end
    if s.empty? then
      break if @src.close_tag
      s = @src.get_plain
    end
  end while s
  parse_error "unterminated DOCTYPE declaration meets EOF" unless s
  unless root then
    parse_error "no root element is specified in DOCTYPE"
  end
  if syspub and not sysid then
    parse_error "too few external ID literals in DOCTYPE"
  end
  if syspub == 'PUBLIC' and not pubid then
    # Only one literal followed PUBLIC: treat it as the public ID.
    pubid, sysid = sysid, nil
  end
  on_doctype root, pubid, sysid
  scan_internal_dtd s if internal_dtd
end
1032
+
1033
+
1034
# Scans the prolog beginning with the token +s+: an optional XML
# declaration, then any mix of comments, PIs, whitespace and at most one
# DOCTYPE declaration.  The first token that belongs to none of these is
# handed to scan_content, which takes over for the rest of the document.
def scan_prolog(s)
  if /\A<\?xml(?=[ \t\n\r])/ =~ s
    scan_xmldecl $'
    s = @src.get
  end
  doctype_allowed = true
  input = @src   # local alias, kept for speed
  while s
    if s[0] == '<'
      case s[1]
      when '!'
        if s[2] == '-' && s[3] == '-'
          scan_comment s
        elsif /\A<!DOCTYPE(?=[ \t\n\r])/ =~ s && doctype_allowed
          doctype_allowed = false
          scan_doctype $'
        else
          break
        end
      when '?'
        scan_pi s
      else
        break
      end
      s = input.get
    elsif /[^ \t\r\n]/ !~ s
      # whitespace only
      on_prolog_space s unless s.empty?
      s = input.get_plain
    else
      break
    end
  end
  scan_content(s || input.get)
end
1067
+
1068
+
1069
# Scans the whole document: emits on_start_document, primes the source,
# scans from the prolog onwards, and finally emits on_end_document.
def scan_document
  on_start_document
  @src.prepare
  first = @src.get
  scan_prolog(first)
  on_end_document
end


# Builds the token source for +src+.
def make_source(src)
  Source.new(src)
end
1080
+
1081
+
1082
+ public
1083
+
1084
# Parses +src+ as a complete document, sending events to the visitor.
# @src is reset to nil afterwards, even when scanning raises.
# Returns self.
def parse_document(src)
  @src = make_source(src)
  scan_document
  self
ensure
  @src = nil
end

alias parse parse_document
1095
+
1096
+ end
1097
+
1098
+
1099
+ end
1100
+
1101
+
1102
+
1103
+
1104
+
1105
# Self-test driver: when this file is run directly it scans ARGF and prints
# the user CPU time spent in the scan to STDERR.
if $0 == __FILE__ then
  # Visitor that prints diagnostics, prefixed with path and line number,
  # only when $VERBOSE is set.
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new)
  src = ARGF
  def src.path; filename; end
  # Process.times replaces Time.times, which does not exist on the Ruby
  # versions this file otherwise targets.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end