xmlscan 0.2.3 → 0.3.0preb
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +7 -4
- data/VERSION +1 -1
- data/lib/xmlscan/htmlscan.rb +7 -7
- data/lib/xmlscan/namespace.rb +33 -33
- data/lib/xmlscan/parser.rb +17 -13
- data/lib/xmlscan/processor.rb +109 -0
- data/lib/xmlscan/scanner.rb +118 -89
- data/lib/xmlscan/version.rb +4 -10
- data/lib/xmlscan/visitor.rb +31 -29
- data/lib/xmlscan/xmlchar.rb +18 -18
- metadata +15 -16
- data/install.rb +0 -41
- data/test.rb +0 -7
data/lib/xmlscan/scanner.rb
CHANGED
@@ -122,20 +122,29 @@ module XMLScan
|
|
122
122
|
self
|
123
123
|
end
|
124
124
|
|
125
|
-
|
125
|
+
=begin
|
126
|
+
Managing source in a private array.
|
127
|
+
* tag oriented (?< and ?> are the key tokens)
|
128
|
+
* ?> that aren't followed by another ?< or ?> are stripped in splitting
|
129
|
+
=end
|
126
130
|
def get
|
127
131
|
pop or
|
128
132
|
unless @eof then
|
129
133
|
last = @last
|
130
134
|
begin
|
131
|
-
|
132
|
-
unless src then
|
135
|
+
unless chunk = @src.gets then
|
133
136
|
@eof = true
|
134
|
-
|
135
|
-
last
|
136
|
-
|
137
|
+
@last = nil
|
138
|
+
return last
|
139
|
+
#unshift last # to be popped after reverse!
|
140
|
+
#last = nil
|
141
|
+
#break
|
137
142
|
end
|
138
|
-
|
143
|
+
# positive lookahead: < or >< or >>
|
144
|
+
# so don't consume those (but split leaving them always at the
|
145
|
+
# end of chunks)
|
146
|
+
# consume (>) and split on >
|
147
|
+
a = chunk.split(/(?=<|>[<>])|>/, -1)
|
139
148
|
if last then
|
140
149
|
unless /\A[<>]/ =~ a.first then
|
141
150
|
a[0] = last << (a.first || '')
|
@@ -143,6 +152,7 @@ module XMLScan
|
|
143
152
|
push last
|
144
153
|
end
|
145
154
|
end
|
155
|
+
raise "size #{size}" if size > 1
|
146
156
|
concat a
|
147
157
|
last = pop
|
148
158
|
end while empty?
|
@@ -223,7 +233,7 @@ module XMLScan
|
|
223
233
|
last.push @last.inspect
|
224
234
|
end
|
225
235
|
a.push '#eof' if @eof
|
226
|
-
"((#{a
|
236
|
+
"((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
|
227
237
|
end
|
228
238
|
|
229
239
|
def each
|
@@ -354,72 +364,72 @@ module XMLScan
|
|
354
364
|
end
|
355
365
|
end
|
356
366
|
|
357
|
-
def on_xmldecl_version(str)
|
358
|
-
@visitor.on_xmldecl_version str
|
367
|
+
def on_xmldecl_version(str, *a)
|
368
|
+
@visitor.on_xmldecl_version str, *a
|
359
369
|
end
|
360
370
|
|
361
|
-
def on_xmldecl_encoding(str)
|
362
|
-
@visitor.on_xmldecl_encoding str
|
371
|
+
def on_xmldecl_encoding(str, *a)
|
372
|
+
@visitor.on_xmldecl_encoding str, *a
|
363
373
|
end
|
364
374
|
|
365
|
-
def on_xmldecl_standalone(str)
|
366
|
-
@visitor.on_xmldecl_standalone str
|
375
|
+
def on_xmldecl_standalone(str, *a)
|
376
|
+
@visitor.on_xmldecl_standalone str, *a
|
367
377
|
end
|
368
378
|
|
369
|
-
def on_xmldecl_other(name, value)
|
370
|
-
@visitor.on_xmldecl_other name, value
|
379
|
+
def on_xmldecl_other(name, value, *a)
|
380
|
+
@visitor.on_xmldecl_other name, value, *a
|
371
381
|
end
|
372
382
|
|
373
|
-
def on_xmldecl_end
|
374
|
-
@visitor.on_xmldecl_end
|
383
|
+
def on_xmldecl_end(*a)
|
384
|
+
@visitor.on_xmldecl_end *a
|
375
385
|
end
|
376
386
|
|
377
|
-
def on_doctype(root, pubid, sysid)
|
378
|
-
@visitor.on_doctype root, pubid, sysid
|
387
|
+
def on_doctype(root, pubid, sysid, *a)
|
388
|
+
@visitor.on_doctype root, pubid, sysid, *a
|
379
389
|
end
|
380
390
|
|
381
|
-
def on_prolog_space(str)
|
382
|
-
@visitor.on_prolog_space str
|
391
|
+
def on_prolog_space(str, *a)
|
392
|
+
@visitor.on_prolog_space str, *a
|
383
393
|
end
|
384
394
|
|
385
|
-
def on_comment(str)
|
386
|
-
@visitor.on_comment str
|
395
|
+
def on_comment(str, *a)
|
396
|
+
@visitor.on_comment str, *a
|
387
397
|
end
|
388
398
|
|
389
|
-
def on_pi(target, pi)
|
390
|
-
@visitor.on_pi target, pi
|
399
|
+
def on_pi(target, pi, *a)
|
400
|
+
@visitor.on_pi target, pi, *a
|
391
401
|
end
|
392
402
|
|
393
|
-
def on_chardata(str)
|
394
|
-
@visitor.on_chardata str
|
403
|
+
def on_chardata(str, *a)
|
404
|
+
@visitor.on_chardata str, *a
|
395
405
|
end
|
396
406
|
|
397
|
-
def on_cdata(str)
|
398
|
-
@visitor.on_cdata str
|
407
|
+
def on_cdata(str, *a)
|
408
|
+
@visitor.on_cdata str, *a
|
399
409
|
end
|
400
410
|
|
401
|
-
def on_etag(name)
|
402
|
-
@visitor.on_etag name
|
411
|
+
def on_etag(name, *a)
|
412
|
+
@visitor.on_etag name, *a
|
403
413
|
end
|
404
414
|
|
405
|
-
def on_entityref(ref)
|
406
|
-
@visitor.on_entityref ref
|
415
|
+
def on_entityref(ref, *a)
|
416
|
+
@visitor.on_entityref ref, *a
|
407
417
|
end
|
408
418
|
|
409
|
-
def on_charref(code)
|
410
|
-
@visitor.on_charref code
|
419
|
+
def on_charref(code, *a)
|
420
|
+
@visitor.on_charref code, *a
|
411
421
|
end
|
412
422
|
|
413
|
-
def on_charref_hex(code)
|
414
|
-
@visitor.on_charref_hex code
|
423
|
+
def on_charref_hex(code, *a)
|
424
|
+
@visitor.on_charref_hex code, *a
|
415
425
|
end
|
416
426
|
|
417
|
-
def on_start_document
|
418
|
-
@visitor.on_start_document
|
427
|
+
def on_start_document(*a)
|
428
|
+
@visitor.on_start_document *a
|
419
429
|
end
|
420
430
|
|
421
|
-
def on_end_document
|
422
|
-
@visitor.on_end_document
|
431
|
+
def on_end_document(*a)
|
432
|
+
@visitor.on_end_document *a
|
423
433
|
end
|
424
434
|
|
425
435
|
|
@@ -444,50 +454,51 @@ module XMLScan
|
|
444
454
|
#
|
445
455
|
# A: on_chardata ('HOGE')
|
446
456
|
|
447
|
-
def on_stag(name)
|
448
|
-
@visitor.on_stag name
|
457
|
+
def on_stag(name, *a)
|
458
|
+
@visitor.on_stag name, *a
|
449
459
|
end
|
450
460
|
|
451
|
-
def on_attribute(name)
|
452
|
-
@visitor.on_attribute name
|
461
|
+
def on_attribute(name, *a)
|
462
|
+
@visitor.on_attribute name, *a
|
453
463
|
end
|
454
464
|
|
455
|
-
def on_attr_value(str)
|
456
|
-
@visitor.on_attr_value str
|
465
|
+
def on_attr_value(str, *a)
|
466
|
+
@visitor.on_attr_value str, *a
|
457
467
|
end
|
458
468
|
|
459
|
-
def on_attr_entityref(ref)
|
460
|
-
@visitor.on_attr_entityref ref
|
469
|
+
def on_attr_entityref(ref, *a)
|
470
|
+
@visitor.on_attr_entityref ref, *a
|
461
471
|
end
|
462
472
|
|
463
|
-
def on_attr_charref(code)
|
464
|
-
@visitor.on_attr_charref code
|
473
|
+
def on_attr_charref(code, *a)
|
474
|
+
@visitor.on_attr_charref code, *a
|
465
475
|
end
|
466
476
|
|
467
|
-
def on_attr_charref_hex(code)
|
468
|
-
@visitor.on_attr_charref_hex code
|
477
|
+
def on_attr_charref_hex(code, *a)
|
478
|
+
@visitor.on_attr_charref_hex code, *a
|
469
479
|
end
|
470
480
|
|
471
|
-
def on_attribute_end(name)
|
472
|
-
@visitor.on_attribute_end name
|
481
|
+
def on_attribute_end(name, *a)
|
482
|
+
@visitor.on_attribute_end name, *a, *a
|
473
483
|
end
|
474
484
|
|
475
|
-
def on_stag_end_empty(name)
|
476
|
-
@visitor.on_stag_end_empty name
|
485
|
+
def on_stag_end_empty(name, *a)
|
486
|
+
@visitor.on_stag_end_empty name, *a
|
477
487
|
end
|
478
488
|
|
479
|
-
def on_stag_end(name)
|
480
|
-
|
489
|
+
def on_stag_end(name, *a)
|
490
|
+
#STDERR << "ose #{name}, #{a.inspect}\n"
|
491
|
+
@visitor.on_stag_end name, *a
|
481
492
|
end
|
482
493
|
|
483
494
|
|
495
|
+
S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
|
496
|
+
E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
|
484
497
|
|
485
498
|
private
|
486
499
|
|
487
500
|
module OptRegexp
|
488
501
|
UTFSTR = "é"
|
489
|
-
S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
|
490
|
-
E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
|
491
502
|
|
492
503
|
RE_ENCODINGS = {
|
493
504
|
:n=>/e/n.encoding,
|
@@ -525,6 +536,7 @@ module XMLScan
|
|
525
536
|
else
|
526
537
|
s = $`
|
527
538
|
on_chardata s unless s.empty?
|
539
|
+
#orig = $'.sub(/(?=;).*$/,'')
|
528
540
|
ref = nil
|
529
541
|
$'.split('&', -1).each { |s|
|
530
542
|
unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
|
@@ -533,18 +545,18 @@ module XMLScan
|
|
533
545
|
parse_error "reference to `#{ref}' doesn't end with `;'"
|
534
546
|
else
|
535
547
|
parse_error "`&' is not used for entity/character references"
|
536
|
-
on_chardata
|
548
|
+
on_chardata '&'+s
|
537
549
|
next
|
538
550
|
end
|
539
551
|
end
|
540
|
-
ref = $`
|
552
|
+
orig = ?& + (ref = $`) + ?;
|
541
553
|
s = $'
|
542
554
|
if /\A[^#]/ =~ ref then
|
543
|
-
on_entityref ref
|
555
|
+
on_entityref ref, orig
|
544
556
|
elsif /\A#(\d+)\z/ =~ ref then
|
545
|
-
on_charref $1.to_i
|
557
|
+
on_charref $1.to_i, orig
|
546
558
|
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
547
|
-
on_charref_hex $1.hex
|
559
|
+
on_charref_hex $1.hex, orig
|
548
560
|
else
|
549
561
|
parse_error "invalid character reference `#{ref}'"
|
550
562
|
end
|
@@ -558,8 +570,9 @@ module XMLScan
|
|
558
570
|
end
|
559
571
|
|
560
572
|
|
561
|
-
def
|
573
|
+
def scan_attr_value(s) # almostly copy & paste from scan_chardata
|
562
574
|
unless /&/ =~ s then
|
575
|
+
#STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
|
563
576
|
on_attr_value s
|
564
577
|
else
|
565
578
|
s = $`
|
@@ -576,14 +589,14 @@ module XMLScan
|
|
576
589
|
next
|
577
590
|
end
|
578
591
|
end
|
579
|
-
ref = $`
|
592
|
+
orig = ?& + (ref = $`) + ?;
|
580
593
|
s = $'
|
581
594
|
if /\A[^#]/ =~ ref then
|
582
|
-
on_attr_entityref ref
|
595
|
+
on_attr_entityref ref, orig
|
583
596
|
elsif /\A#(\d+)\z/ =~ ref then
|
584
|
-
on_attr_charref $1.to_i
|
597
|
+
on_attr_charref $1.to_i, orig
|
585
598
|
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
586
|
-
on_attr_charref_hex $1.hex
|
599
|
+
on_attr_charref_hex $1.hex, orig
|
587
600
|
else
|
588
601
|
parse_error "invalid character reference `#{ref}'"
|
589
602
|
end
|
@@ -682,6 +695,7 @@ module XMLScan
|
|
682
695
|
|
683
696
|
|
684
697
|
def scan_etag(s)
|
698
|
+
orig="#{s}>"
|
685
699
|
s[0,2] = '' # remove '</'
|
686
700
|
if s.empty? then
|
687
701
|
if @src.close_tag then # </>
|
@@ -689,14 +703,14 @@ module XMLScan
|
|
689
703
|
else # </< or </[EOF]
|
690
704
|
parse_error "parse error at `</'"
|
691
705
|
s << '>' if @src.close_tag
|
692
|
-
return on_chardata
|
706
|
+
return on_chardata '</' << s
|
693
707
|
end
|
694
708
|
elsif /[ \t\n\r]+/ =~ s then
|
695
709
|
s1, s2 = $`, $'
|
696
710
|
if s1.empty? then # </ tag
|
697
711
|
parse_error "parse error at `</'"
|
698
712
|
s << '>' if @src.close_tag
|
699
|
-
return on_chardata
|
713
|
+
return on_chardata '</' + s
|
700
714
|
elsif not s2.empty? then # </ta g
|
701
715
|
parse_error "illegal whitespace is found within end tag `#{s1}'"
|
702
716
|
while @src.get_tag
|
@@ -705,7 +719,7 @@ module XMLScan
|
|
705
719
|
s = s1
|
706
720
|
end
|
707
721
|
found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
|
708
|
-
on_etag s
|
722
|
+
on_etag s, orig
|
709
723
|
end
|
710
724
|
|
711
725
|
|
@@ -745,6 +759,8 @@ module XMLScan
|
|
745
759
|
|
746
760
|
|
747
761
|
def scan_stag(s)
|
762
|
+
hash = {}
|
763
|
+
orig = [s.dup]
|
748
764
|
unless /(?=[\/ \t\n\r='"])/ =~ s then
|
749
765
|
name = s
|
750
766
|
name[0,1] = '' # remove `<'
|
@@ -753,54 +769,65 @@ module XMLScan
|
|
753
769
|
return found_empty_stag
|
754
770
|
else # << or <[EOF]
|
755
771
|
parse_error "parse error at `<'"
|
756
|
-
return on_chardata
|
772
|
+
return on_chardata '<'
|
757
773
|
end
|
758
774
|
end
|
759
775
|
on_stag name
|
760
776
|
found_unclosed_stag name unless @src.close_tag
|
761
|
-
on_stag_end name
|
777
|
+
on_stag_end name, orig*''+?>, {}
|
762
778
|
else
|
779
|
+
k = nil
|
763
780
|
name = $`
|
764
781
|
s = $'
|
765
782
|
name[0,1] = '' # remove `<'
|
766
783
|
if name.empty? then # `< tag' or `<=`
|
767
784
|
parse_error "parse error at `<'"
|
768
785
|
s << '>' if @src.close_tag
|
769
|
-
return on_chardata
|
786
|
+
return on_chardata '<' << s
|
770
787
|
end
|
771
788
|
on_stag name
|
772
789
|
emptyelem = false
|
773
|
-
key,val,error,qmark,c = nil
|
774
790
|
begin
|
775
791
|
continue = false
|
776
792
|
s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
|
777
793
|
) { |key,val,error|
|
778
|
-
|
794
|
+
orig_val = []
|
795
|
+
if key then
|
779
796
|
on_attribute key
|
797
|
+
k=key
|
798
|
+
orig_val << val
|
780
799
|
qmark = val.slice!(0,1)
|
781
800
|
if val[-1] == qmark[0] then
|
782
801
|
val.chop!
|
783
|
-
|
802
|
+
scan_attr_value val unless val.empty?
|
784
803
|
else
|
785
|
-
|
804
|
+
scan_attr_value val unless val.empty?
|
786
805
|
begin
|
787
806
|
s = @src.get
|
807
|
+
#STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
|
788
808
|
unless s then
|
789
809
|
parse_error "unterminated attribute `#{key}' meets EOF"
|
790
810
|
break
|
791
811
|
end
|
812
|
+
orig << s.dup
|
792
813
|
c = s[0]
|
793
814
|
val, s = s.split(qmark, 2)
|
815
|
+
orig_val << val
|
794
816
|
if c == ?< then
|
795
817
|
wellformed_error "`<' is found in attribute `#{key}'"
|
796
818
|
elsif c != ?> then
|
797
|
-
|
819
|
+
#STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
|
820
|
+
orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
|
821
|
+
scan_attr_value ?>
|
798
822
|
end
|
799
|
-
|
823
|
+
scan_attr_value val if c
|
800
824
|
end until s
|
801
825
|
continue = s # if eof then continue is false, else true.
|
802
826
|
end
|
803
|
-
|
827
|
+
#STDERR << "attr:#{k}, #{orig_val}\n"
|
828
|
+
hash[k] = orig_val*''
|
829
|
+
#STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
|
830
|
+
on_attribute_end key #, orig_val*''
|
804
831
|
elsif error then
|
805
832
|
continue = s = found_stag_error(error)
|
806
833
|
else
|
@@ -816,9 +843,11 @@ module XMLScan
|
|
816
843
|
end
|
817
844
|
end
|
818
845
|
if emptyelem then
|
819
|
-
on_stag_end_empty name
|
846
|
+
on_stag_end_empty name, orig*''+?>, hash
|
820
847
|
else
|
821
|
-
|
848
|
+
#STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
|
849
|
+
on_stag_end name, orig*''+?>, hash
|
850
|
+
#on_stag_end name, "<#{name}#{s}>", hash
|
822
851
|
end
|
823
852
|
end
|
824
853
|
end
|
@@ -1067,10 +1096,10 @@ module XMLScan
|
|
1067
1096
|
|
1068
1097
|
|
1069
1098
|
def scan_document
|
1070
|
-
on_start_document
|
1099
|
+
on_start_document ''
|
1071
1100
|
@src.prepare
|
1072
1101
|
scan_prolog @src.get
|
1073
|
-
on_end_document
|
1102
|
+
on_end_document ''
|
1074
1103
|
end
|
1075
1104
|
|
1076
1105
|
|
data/lib/xmlscan/version.rb
CHANGED
@@ -9,15 +9,9 @@
|
|
9
9
|
|
10
10
|
module XMLScan
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
# TEENY which is 1 or larger (e.g. 'X.X.1' or 'X.X.2') means this
|
18
|
-
# release is a stable release.
|
19
|
-
|
20
|
-
VERSION = '0.2.3'
|
21
|
-
RELEASE_DATE = '2003-05-02'
|
12
|
+
GEMNAME = 'xmlscan'
|
13
|
+
VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
|
14
|
+
VERSION = open(VERSION_FILE).to_a*''.chop
|
15
|
+
RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
|
22
16
|
|
23
17
|
end
|