xmlscan 0.2.3 → 0.3.0preb
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +7 -4
- data/VERSION +1 -1
- data/lib/xmlscan/htmlscan.rb +7 -7
- data/lib/xmlscan/namespace.rb +33 -33
- data/lib/xmlscan/parser.rb +17 -13
- data/lib/xmlscan/processor.rb +109 -0
- data/lib/xmlscan/scanner.rb +118 -89
- data/lib/xmlscan/version.rb +4 -10
- data/lib/xmlscan/visitor.rb +31 -29
- data/lib/xmlscan/xmlchar.rb +18 -18
- metadata +15 -16
- data/install.rb +0 -41
- data/test.rb +0 -7
data/lib/xmlscan/scanner.rb
CHANGED
@@ -122,20 +122,29 @@ module XMLScan
|
|
122
122
|
self
|
123
123
|
end
|
124
124
|
|
125
|
-
|
125
|
+
=begin
|
126
|
+
Managing source in a private array.
|
127
|
+
* tag oriented (?< and ?> are the key tokens)
|
128
|
+
* ?> that aren't followed by another ?< or ?> are stripped in splitting
|
129
|
+
=end
|
126
130
|
def get
|
127
131
|
pop or
|
128
132
|
unless @eof then
|
129
133
|
last = @last
|
130
134
|
begin
|
131
|
-
|
132
|
-
unless src then
|
135
|
+
unless chunk = @src.gets then
|
133
136
|
@eof = true
|
134
|
-
|
135
|
-
last
|
136
|
-
|
137
|
+
@last = nil
|
138
|
+
return last
|
139
|
+
#unshift last # to be popped after reverse!
|
140
|
+
#last = nil
|
141
|
+
#break
|
137
142
|
end
|
138
|
-
|
143
|
+
# positive (zero-width) lookahead: < or >< or >>
|
144
|
+
# so don't consume those (but split leaving them always at the
|
145
|
+
# end of chunks)
|
146
|
+
# consume (>) and split on >
|
147
|
+
a = chunk.split(/(?=<|>[<>])|>/, -1)
|
139
148
|
if last then
|
140
149
|
unless /\A[<>]/ =~ a.first then
|
141
150
|
a[0] = last << (a.first || '')
|
@@ -143,6 +152,7 @@ module XMLScan
|
|
143
152
|
push last
|
144
153
|
end
|
145
154
|
end
|
155
|
+
raise "size #{size}" if size > 1
|
146
156
|
concat a
|
147
157
|
last = pop
|
148
158
|
end while empty?
|
@@ -223,7 +233,7 @@ module XMLScan
|
|
223
233
|
last.push @last.inspect
|
224
234
|
end
|
225
235
|
a.push '#eof' if @eof
|
226
|
-
"((#{a
|
236
|
+
"((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
|
227
237
|
end
|
228
238
|
|
229
239
|
def each
|
@@ -354,72 +364,72 @@ module XMLScan
|
|
354
364
|
end
|
355
365
|
end
|
356
366
|
|
357
|
-
def on_xmldecl_version(str)
|
358
|
-
@visitor.on_xmldecl_version str
|
367
|
+
def on_xmldecl_version(str, *a)
|
368
|
+
@visitor.on_xmldecl_version str, *a
|
359
369
|
end
|
360
370
|
|
361
|
-
def on_xmldecl_encoding(str)
|
362
|
-
@visitor.on_xmldecl_encoding str
|
371
|
+
def on_xmldecl_encoding(str, *a)
|
372
|
+
@visitor.on_xmldecl_encoding str, *a
|
363
373
|
end
|
364
374
|
|
365
|
-
def on_xmldecl_standalone(str)
|
366
|
-
@visitor.on_xmldecl_standalone str
|
375
|
+
def on_xmldecl_standalone(str, *a)
|
376
|
+
@visitor.on_xmldecl_standalone str, *a
|
367
377
|
end
|
368
378
|
|
369
|
-
def on_xmldecl_other(name, value)
|
370
|
-
@visitor.on_xmldecl_other name, value
|
379
|
+
def on_xmldecl_other(name, value, *a)
|
380
|
+
@visitor.on_xmldecl_other name, value, *a
|
371
381
|
end
|
372
382
|
|
373
|
-
def on_xmldecl_end
|
374
|
-
@visitor.on_xmldecl_end
|
383
|
+
# Forwards `on_xmldecl_end` to @visitor, passing any extra arguments through
# unchanged, and returns whatever the visitor returns.
def on_xmldecl_end(*a)
  # Parentheses keep the leading splat from being read as a binary `*`
  # (Ruby warns about the ambiguous form in verbose mode).
  @visitor.on_xmldecl_end(*a)
end
|
376
386
|
|
377
|
-
def on_doctype(root, pubid, sysid)
|
378
|
-
@visitor.on_doctype root, pubid, sysid
|
387
|
+
def on_doctype(root, pubid, sysid, *a)
|
388
|
+
@visitor.on_doctype root, pubid, sysid, *a
|
379
389
|
end
|
380
390
|
|
381
|
-
def on_prolog_space(str)
|
382
|
-
@visitor.on_prolog_space str
|
391
|
+
def on_prolog_space(str, *a)
|
392
|
+
@visitor.on_prolog_space str, *a
|
383
393
|
end
|
384
394
|
|
385
|
-
def on_comment(str)
|
386
|
-
@visitor.on_comment str
|
395
|
+
def on_comment(str, *a)
|
396
|
+
@visitor.on_comment str, *a
|
387
397
|
end
|
388
398
|
|
389
|
-
def on_pi(target, pi)
|
390
|
-
@visitor.on_pi target, pi
|
399
|
+
def on_pi(target, pi, *a)
|
400
|
+
@visitor.on_pi target, pi, *a
|
391
401
|
end
|
392
402
|
|
393
|
-
def on_chardata(str)
|
394
|
-
@visitor.on_chardata str
|
403
|
+
def on_chardata(str, *a)
|
404
|
+
@visitor.on_chardata str, *a
|
395
405
|
end
|
396
406
|
|
397
|
-
def on_cdata(str)
|
398
|
-
@visitor.on_cdata str
|
407
|
+
def on_cdata(str, *a)
|
408
|
+
@visitor.on_cdata str, *a
|
399
409
|
end
|
400
410
|
|
401
|
-
def on_etag(name)
|
402
|
-
@visitor.on_etag name
|
411
|
+
def on_etag(name, *a)
|
412
|
+
@visitor.on_etag name, *a
|
403
413
|
end
|
404
414
|
|
405
|
-
def on_entityref(ref)
|
406
|
-
@visitor.on_entityref ref
|
415
|
+
def on_entityref(ref, *a)
|
416
|
+
@visitor.on_entityref ref, *a
|
407
417
|
end
|
408
418
|
|
409
|
-
def on_charref(code)
|
410
|
-
@visitor.on_charref code
|
419
|
+
def on_charref(code, *a)
|
420
|
+
@visitor.on_charref code, *a
|
411
421
|
end
|
412
422
|
|
413
|
-
def on_charref_hex(code)
|
414
|
-
@visitor.on_charref_hex code
|
423
|
+
def on_charref_hex(code, *a)
|
424
|
+
@visitor.on_charref_hex code, *a
|
415
425
|
end
|
416
426
|
|
417
|
-
def on_start_document
|
418
|
-
@visitor.on_start_document
|
427
|
+
# Forwards `on_start_document` to @visitor, passing any extra arguments
# through unchanged, and returns whatever the visitor returns.
def on_start_document(*a)
  # Parentheses keep the leading splat from being read as a binary `*`
  # (Ruby warns about the ambiguous form in verbose mode).
  @visitor.on_start_document(*a)
end
|
420
430
|
|
421
|
-
def on_end_document
|
422
|
-
@visitor.on_end_document
|
431
|
+
# Forwards `on_end_document` to @visitor, passing any extra arguments
# through unchanged, and returns whatever the visitor returns.
def on_end_document(*a)
  # Parentheses keep the leading splat from being read as a binary `*`
  # (Ruby warns about the ambiguous form in verbose mode).
  @visitor.on_end_document(*a)
end
|
424
434
|
|
425
435
|
|
@@ -444,50 +454,51 @@ module XMLScan
|
|
444
454
|
#
|
445
455
|
# A: on_chardata ('HOGE')
|
446
456
|
|
447
|
-
def on_stag(name)
|
448
|
-
@visitor.on_stag name
|
457
|
+
def on_stag(name, *a)
|
458
|
+
@visitor.on_stag name, *a
|
449
459
|
end
|
450
460
|
|
451
|
-
def on_attribute(name)
|
452
|
-
@visitor.on_attribute name
|
461
|
+
def on_attribute(name, *a)
|
462
|
+
@visitor.on_attribute name, *a
|
453
463
|
end
|
454
464
|
|
455
|
-
def on_attr_value(str)
|
456
|
-
@visitor.on_attr_value str
|
465
|
+
def on_attr_value(str, *a)
|
466
|
+
@visitor.on_attr_value str, *a
|
457
467
|
end
|
458
468
|
|
459
|
-
def on_attr_entityref(ref)
|
460
|
-
@visitor.on_attr_entityref ref
|
469
|
+
def on_attr_entityref(ref, *a)
|
470
|
+
@visitor.on_attr_entityref ref, *a
|
461
471
|
end
|
462
472
|
|
463
|
-
def on_attr_charref(code)
|
464
|
-
@visitor.on_attr_charref code
|
473
|
+
def on_attr_charref(code, *a)
|
474
|
+
@visitor.on_attr_charref code, *a
|
465
475
|
end
|
466
476
|
|
467
|
-
def on_attr_charref_hex(code)
|
468
|
-
@visitor.on_attr_charref_hex code
|
477
|
+
def on_attr_charref_hex(code, *a)
|
478
|
+
@visitor.on_attr_charref_hex code, *a
|
469
479
|
end
|
470
480
|
|
471
|
-
def on_attribute_end(name)
|
472
|
-
@visitor.on_attribute_end name
|
481
|
+
# Forwards `on_attribute_end` to @visitor with the attribute name followed by
# any extra arguments, and returns whatever the visitor returns.
#
# The extra arguments are forwarded exactly once; splatting `a` twice would
# hand the visitor every extra argument in duplicate.
def on_attribute_end(name, *a)
  @visitor.on_attribute_end name, *a
end
|
474
484
|
|
475
|
-
def on_stag_end_empty(name)
|
476
|
-
@visitor.on_stag_end_empty name
|
485
|
+
def on_stag_end_empty(name, *a)
|
486
|
+
@visitor.on_stag_end_empty name, *a
|
477
487
|
end
|
478
488
|
|
479
|
-
def on_stag_end(name)
|
480
|
-
|
489
|
+
def on_stag_end(name, *a)
|
490
|
+
#STDERR << "ose #{name}, #{a.inspect}\n"
|
491
|
+
@visitor.on_stag_end name, *a
|
481
492
|
end
|
482
493
|
|
483
494
|
|
495
|
+
S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
|
496
|
+
E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
|
484
497
|
|
485
498
|
private
|
486
499
|
|
487
500
|
module OptRegexp
|
488
501
|
UTFSTR = "é"
|
489
|
-
S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
|
490
|
-
E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
|
491
502
|
|
492
503
|
RE_ENCODINGS = {
|
493
504
|
:n=>/e/n.encoding,
|
@@ -525,6 +536,7 @@ module XMLScan
|
|
525
536
|
else
|
526
537
|
s = $`
|
527
538
|
on_chardata s unless s.empty?
|
539
|
+
#orig = $'.sub(/(?=;).*$/,'')
|
528
540
|
ref = nil
|
529
541
|
$'.split('&', -1).each { |s|
|
530
542
|
unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
|
@@ -533,18 +545,18 @@ module XMLScan
|
|
533
545
|
parse_error "reference to `#{ref}' doesn't end with `;'"
|
534
546
|
else
|
535
547
|
parse_error "`&' is not used for entity/character references"
|
536
|
-
on_chardata
|
548
|
+
on_chardata '&'+s
|
537
549
|
next
|
538
550
|
end
|
539
551
|
end
|
540
|
-
ref = $`
|
552
|
+
orig = ?& + (ref = $`) + ?;
|
541
553
|
s = $'
|
542
554
|
if /\A[^#]/ =~ ref then
|
543
|
-
on_entityref ref
|
555
|
+
on_entityref ref, orig
|
544
556
|
elsif /\A#(\d+)\z/ =~ ref then
|
545
|
-
on_charref $1.to_i
|
557
|
+
on_charref $1.to_i, orig
|
546
558
|
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
547
|
-
on_charref_hex $1.hex
|
559
|
+
on_charref_hex $1.hex, orig
|
548
560
|
else
|
549
561
|
parse_error "invalid character reference `#{ref}'"
|
550
562
|
end
|
@@ -558,8 +570,9 @@ module XMLScan
|
|
558
570
|
end
|
559
571
|
|
560
572
|
|
561
|
-
def
|
573
|
+
def scan_attr_value(s) # almostly copy & paste from scan_chardata
|
562
574
|
unless /&/ =~ s then
|
575
|
+
#STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
|
563
576
|
on_attr_value s
|
564
577
|
else
|
565
578
|
s = $`
|
@@ -576,14 +589,14 @@ module XMLScan
|
|
576
589
|
next
|
577
590
|
end
|
578
591
|
end
|
579
|
-
ref = $`
|
592
|
+
orig = ?& + (ref = $`) + ?;
|
580
593
|
s = $'
|
581
594
|
if /\A[^#]/ =~ ref then
|
582
|
-
on_attr_entityref ref
|
595
|
+
on_attr_entityref ref, orig
|
583
596
|
elsif /\A#(\d+)\z/ =~ ref then
|
584
|
-
on_attr_charref $1.to_i
|
597
|
+
on_attr_charref $1.to_i, orig
|
585
598
|
elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
|
586
|
-
on_attr_charref_hex $1.hex
|
599
|
+
on_attr_charref_hex $1.hex, orig
|
587
600
|
else
|
588
601
|
parse_error "invalid character reference `#{ref}'"
|
589
602
|
end
|
@@ -682,6 +695,7 @@ module XMLScan
|
|
682
695
|
|
683
696
|
|
684
697
|
def scan_etag(s)
|
698
|
+
orig="#{s}>"
|
685
699
|
s[0,2] = '' # remove '</'
|
686
700
|
if s.empty? then
|
687
701
|
if @src.close_tag then # </>
|
@@ -689,14 +703,14 @@ module XMLScan
|
|
689
703
|
else # </< or </[EOF]
|
690
704
|
parse_error "parse error at `</'"
|
691
705
|
s << '>' if @src.close_tag
|
692
|
-
return on_chardata
|
706
|
+
return on_chardata '</' << s
|
693
707
|
end
|
694
708
|
elsif /[ \t\n\r]+/ =~ s then
|
695
709
|
s1, s2 = $`, $'
|
696
710
|
if s1.empty? then # </ tag
|
697
711
|
parse_error "parse error at `</'"
|
698
712
|
s << '>' if @src.close_tag
|
699
|
-
return on_chardata
|
713
|
+
return on_chardata '</' + s
|
700
714
|
elsif not s2.empty? then # </ta g
|
701
715
|
parse_error "illegal whitespace is found within end tag `#{s1}'"
|
702
716
|
while @src.get_tag
|
@@ -705,7 +719,7 @@ module XMLScan
|
|
705
719
|
s = s1
|
706
720
|
end
|
707
721
|
found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
|
708
|
-
on_etag s
|
722
|
+
on_etag s, orig
|
709
723
|
end
|
710
724
|
|
711
725
|
|
@@ -745,6 +759,8 @@ module XMLScan
|
|
745
759
|
|
746
760
|
|
747
761
|
def scan_stag(s)
|
762
|
+
hash = {}
|
763
|
+
orig = [s.dup]
|
748
764
|
unless /(?=[\/ \t\n\r='"])/ =~ s then
|
749
765
|
name = s
|
750
766
|
name[0,1] = '' # remove `<'
|
@@ -753,54 +769,65 @@ module XMLScan
|
|
753
769
|
return found_empty_stag
|
754
770
|
else # << or <[EOF]
|
755
771
|
parse_error "parse error at `<'"
|
756
|
-
return on_chardata
|
772
|
+
return on_chardata '<'
|
757
773
|
end
|
758
774
|
end
|
759
775
|
on_stag name
|
760
776
|
found_unclosed_stag name unless @src.close_tag
|
761
|
-
on_stag_end name
|
777
|
+
on_stag_end name, orig*''+?>, {}
|
762
778
|
else
|
779
|
+
k = nil
|
763
780
|
name = $`
|
764
781
|
s = $'
|
765
782
|
name[0,1] = '' # remove `<'
|
766
783
|
if name.empty? then # `< tag' or `<=`
|
767
784
|
parse_error "parse error at `<'"
|
768
785
|
s << '>' if @src.close_tag
|
769
|
-
return on_chardata
|
786
|
+
return on_chardata '<' << s
|
770
787
|
end
|
771
788
|
on_stag name
|
772
789
|
emptyelem = false
|
773
|
-
key,val,error,qmark,c = nil
|
774
790
|
begin
|
775
791
|
continue = false
|
776
792
|
s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
|
777
793
|
) { |key,val,error|
|
778
|
-
|
794
|
+
orig_val = []
|
795
|
+
if key then
|
779
796
|
on_attribute key
|
797
|
+
k=key
|
798
|
+
orig_val << val
|
780
799
|
qmark = val.slice!(0,1)
|
781
800
|
if val[-1] == qmark[0] then
|
782
801
|
val.chop!
|
783
|
-
|
802
|
+
scan_attr_value val unless val.empty?
|
784
803
|
else
|
785
|
-
|
804
|
+
scan_attr_value val unless val.empty?
|
786
805
|
begin
|
787
806
|
s = @src.get
|
807
|
+
#STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
|
788
808
|
unless s then
|
789
809
|
parse_error "unterminated attribute `#{key}' meets EOF"
|
790
810
|
break
|
791
811
|
end
|
812
|
+
orig << s.dup
|
792
813
|
c = s[0]
|
793
814
|
val, s = s.split(qmark, 2)
|
815
|
+
orig_val << val
|
794
816
|
if c == ?< then
|
795
817
|
wellformed_error "`<' is found in attribute `#{key}'"
|
796
818
|
elsif c != ?> then
|
797
|
-
|
819
|
+
#STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
|
820
|
+
orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
|
821
|
+
scan_attr_value ?>
|
798
822
|
end
|
799
|
-
|
823
|
+
scan_attr_value val if c
|
800
824
|
end until s
|
801
825
|
continue = s # if eof then continue is false, else true.
|
802
826
|
end
|
803
|
-
|
827
|
+
#STDERR << "attr:#{k}, #{orig_val}\n"
|
828
|
+
hash[k] = orig_val*''
|
829
|
+
#STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
|
830
|
+
on_attribute_end key #, orig_val*''
|
804
831
|
elsif error then
|
805
832
|
continue = s = found_stag_error(error)
|
806
833
|
else
|
@@ -816,9 +843,11 @@ module XMLScan
|
|
816
843
|
end
|
817
844
|
end
|
818
845
|
if emptyelem then
|
819
|
-
on_stag_end_empty name
|
846
|
+
on_stag_end_empty name, orig*''+?>, hash
|
820
847
|
else
|
821
|
-
|
848
|
+
#STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
|
849
|
+
on_stag_end name, orig*''+?>, hash
|
850
|
+
#on_stag_end name, "<#{name}#{s}>", hash
|
822
851
|
end
|
823
852
|
end
|
824
853
|
end
|
@@ -1067,10 +1096,10 @@ module XMLScan
|
|
1067
1096
|
|
1068
1097
|
|
1069
1098
|
def scan_document
|
1070
|
-
on_start_document
|
1099
|
+
on_start_document ''
|
1071
1100
|
@src.prepare
|
1072
1101
|
scan_prolog @src.get
|
1073
|
-
on_end_document
|
1102
|
+
on_end_document ''
|
1074
1103
|
end
|
1075
1104
|
|
1076
1105
|
|
data/lib/xmlscan/version.rb
CHANGED
@@ -9,15 +9,9 @@
|
|
9
9
|
|
10
10
|
module XMLScan
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
# TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
|
18
|
-
# release is a stable release.
|
19
|
-
|
20
|
-
VERSION = '0.2.3'
|
21
|
-
RELEASE_DATE = '2003-05-02'
|
12
|
+
# Gem name string.
GEMNAME = 'xmlscan'
# Absolute path of the VERSION file, located two directories above the
# directory that contains this source file.
VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
# Contents of VERSION_FILE with the trailing line terminator removed.
# File.read closes the file itself. `chomp` is applied to the file contents
# rather than to a join separator, so the newline is actually stripped.
VERSION = File.read(VERSION_FILE).chomp
# Modification time of VERSION_FILE formatted as 'YYYY-MM-DD'; File.mtime
# queries the path without opening (and leaking) a file handle.
RELEASE_DATE = File.mtime(VERSION_FILE).strftime('%Y-%m-%d')
|
22
16
|
|
23
17
|
end
|