xmlscan 0.2.3 → 0.3.0preb

Sign up to get free protection for your applications and to get access to all the features.
@@ -122,20 +122,29 @@ module XMLScan
122
122
  self
123
123
  end
124
124
 
125
-
125
+ =begin
126
+ Managing source in a private array.
127
+ * tag oriented (?< and ?> are the key tokens)
128
+ * ?> that aren't followed by another ?< or ?> are stripped in splitting
129
+ =end
126
130
  def get
127
131
  pop or
128
132
  unless @eof then
129
133
  last = @last
130
134
  begin
131
- src = @src.gets
132
- unless src then
135
+ unless chunk = @src.gets then
133
136
  @eof = true
134
- unshift last
135
- last = nil
136
- break
137
+ @last = nil
138
+ return last
139
+ #unshift last # to be popped after reverse!
140
+ #last = nil
141
+ #break
137
142
  end
138
- a = src.split(/(?=<|>[<>])|>/, -1)
143
+ # positive (zero-width) lookahead: < or >< or >>
144
+ # so don't consume those (but split leaving them always at the
145
+ # start of the following chunk)
146
+ # consume (>) and split on >
147
+ a = chunk.split(/(?=<|>[<>])|>/, -1)
139
148
  if last then
140
149
  unless /\A[<>]/ =~ a.first then
141
150
  a[0] = last << (a.first || '')
@@ -143,6 +152,7 @@ module XMLScan
143
152
  push last
144
153
  end
145
154
  end
155
+ raise "size #{size}" if size > 1
146
156
  concat a
147
157
  last = pop
148
158
  end while empty?
@@ -223,7 +233,7 @@ module XMLScan
223
233
  last.push @last.inspect
224
234
  end
225
235
  a.push '#eof' if @eof
226
- "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
236
+ "((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
227
237
  end
228
238
 
229
239
  def each
@@ -354,72 +364,72 @@ module XMLScan
354
364
  end
355
365
  end
356
366
 
357
- def on_xmldecl_version(str)
358
- @visitor.on_xmldecl_version str
367
+ def on_xmldecl_version(str, *a)
368
+ @visitor.on_xmldecl_version str, *a
359
369
  end
360
370
 
361
- def on_xmldecl_encoding(str)
362
- @visitor.on_xmldecl_encoding str
371
+ def on_xmldecl_encoding(str, *a)
372
+ @visitor.on_xmldecl_encoding str, *a
363
373
  end
364
374
 
365
- def on_xmldecl_standalone(str)
366
- @visitor.on_xmldecl_standalone str
375
+ def on_xmldecl_standalone(str, *a)
376
+ @visitor.on_xmldecl_standalone str, *a
367
377
  end
368
378
 
369
- def on_xmldecl_other(name, value)
370
- @visitor.on_xmldecl_other name, value
379
+ def on_xmldecl_other(name, value, *a)
380
+ @visitor.on_xmldecl_other name, value, *a
371
381
  end
372
382
 
373
- def on_xmldecl_end
374
- @visitor.on_xmldecl_end
383
+ def on_xmldecl_end(*a)
384
+ @visitor.on_xmldecl_end *a
375
385
  end
376
386
 
377
- def on_doctype(root, pubid, sysid)
378
- @visitor.on_doctype root, pubid, sysid
387
+ def on_doctype(root, pubid, sysid, *a)
388
+ @visitor.on_doctype root, pubid, sysid, *a
379
389
  end
380
390
 
381
- def on_prolog_space(str)
382
- @visitor.on_prolog_space str
391
+ def on_prolog_space(str, *a)
392
+ @visitor.on_prolog_space str, *a
383
393
  end
384
394
 
385
- def on_comment(str)
386
- @visitor.on_comment str
395
+ def on_comment(str, *a)
396
+ @visitor.on_comment str, *a
387
397
  end
388
398
 
389
- def on_pi(target, pi)
390
- @visitor.on_pi target, pi
399
+ def on_pi(target, pi, *a)
400
+ @visitor.on_pi target, pi, *a
391
401
  end
392
402
 
393
- def on_chardata(str)
394
- @visitor.on_chardata str
403
+ def on_chardata(str, *a)
404
+ @visitor.on_chardata str, *a
395
405
  end
396
406
 
397
- def on_cdata(str)
398
- @visitor.on_cdata str
407
+ def on_cdata(str, *a)
408
+ @visitor.on_cdata str, *a
399
409
  end
400
410
 
401
- def on_etag(name)
402
- @visitor.on_etag name
411
+ def on_etag(name, *a)
412
+ @visitor.on_etag name, *a
403
413
  end
404
414
 
405
- def on_entityref(ref)
406
- @visitor.on_entityref ref
415
+ def on_entityref(ref, *a)
416
+ @visitor.on_entityref ref, *a
407
417
  end
408
418
 
409
- def on_charref(code)
410
- @visitor.on_charref code
419
+ def on_charref(code, *a)
420
+ @visitor.on_charref code, *a
411
421
  end
412
422
 
413
- def on_charref_hex(code)
414
- @visitor.on_charref_hex code
423
+ def on_charref_hex(code, *a)
424
+ @visitor.on_charref_hex code, *a
415
425
  end
416
426
 
417
- def on_start_document
418
- @visitor.on_start_document
427
+ def on_start_document(*a)
428
+ @visitor.on_start_document *a
419
429
  end
420
430
 
421
- def on_end_document
422
- @visitor.on_end_document
431
+ def on_end_document(*a)
432
+ @visitor.on_end_document *a
423
433
  end
424
434
 
425
435
 
@@ -444,50 +454,51 @@ module XMLScan
444
454
  #
445
455
  # A: on_chardata ('HOGE')
446
456
 
447
- def on_stag(name)
448
- @visitor.on_stag name
457
+ def on_stag(name, *a)
458
+ @visitor.on_stag name, *a
449
459
  end
450
460
 
451
- def on_attribute(name)
452
- @visitor.on_attribute name
461
+ def on_attribute(name, *a)
462
+ @visitor.on_attribute name, *a
453
463
  end
454
464
 
455
- def on_attr_value(str)
456
- @visitor.on_attr_value str
465
+ def on_attr_value(str, *a)
466
+ @visitor.on_attr_value str, *a
457
467
  end
458
468
 
459
- def on_attr_entityref(ref)
460
- @visitor.on_attr_entityref ref
469
+ def on_attr_entityref(ref, *a)
470
+ @visitor.on_attr_entityref ref, *a
461
471
  end
462
472
 
463
- def on_attr_charref(code)
464
- @visitor.on_attr_charref code
473
+ def on_attr_charref(code, *a)
474
+ @visitor.on_attr_charref code, *a
465
475
  end
466
476
 
467
- def on_attr_charref_hex(code)
468
- @visitor.on_attr_charref_hex code
477
+ def on_attr_charref_hex(code, *a)
478
+ @visitor.on_attr_charref_hex code, *a
469
479
  end
470
480
 
471
- def on_attribute_end(name)
472
- @visitor.on_attribute_end name
481
+ def on_attribute_end(name, *a)
482
+ @visitor.on_attribute_end name, *a, *a
473
483
  end
474
484
 
475
- def on_stag_end_empty(name)
476
- @visitor.on_stag_end_empty name
485
+ def on_stag_end_empty(name, *a)
486
+ @visitor.on_stag_end_empty name, *a
477
487
  end
478
488
 
479
- def on_stag_end(name)
480
- @visitor.on_stag_end name
489
+ def on_stag_end(name, *a)
490
+ #STDERR << "ose #{name}, #{a.inspect}\n"
491
+ @visitor.on_stag_end name, *a
481
492
  end
482
493
 
483
494
 
495
+ S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
496
+ E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
484
497
 
485
498
  private
486
499
 
487
500
  module OptRegexp
488
501
  UTFSTR = "é"
489
- S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
490
- E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
491
502
 
492
503
  RE_ENCODINGS = {
493
504
  :n=>/e/n.encoding,
@@ -525,6 +536,7 @@ module XMLScan
525
536
  else
526
537
  s = $`
527
538
  on_chardata s unless s.empty?
539
+ #orig = $'.sub(/(?=;).*$/,'')
528
540
  ref = nil
529
541
  $'.split('&', -1).each { |s|
530
542
  unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
@@ -533,18 +545,18 @@ module XMLScan
533
545
  parse_error "reference to `#{ref}' doesn't end with `;'"
534
546
  else
535
547
  parse_error "`&' is not used for entity/character references"
536
- on_chardata('&' << s)
548
+ on_chardata '&'+s
537
549
  next
538
550
  end
539
551
  end
540
- ref = $`
552
+ orig = ?& + (ref = $`) + ?;
541
553
  s = $'
542
554
  if /\A[^#]/ =~ ref then
543
- on_entityref ref
555
+ on_entityref ref, orig
544
556
  elsif /\A#(\d+)\z/ =~ ref then
545
- on_charref $1.to_i
557
+ on_charref $1.to_i, orig
546
558
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
547
- on_charref_hex $1.hex
559
+ on_charref_hex $1.hex, orig
548
560
  else
549
561
  parse_error "invalid character reference `#{ref}'"
550
562
  end
@@ -558,8 +570,9 @@ module XMLScan
558
570
  end
559
571
 
560
572
 
561
- def scan_attvalue(s) # almostly copy & paste from scan_chardata
573
+ def scan_attr_value(s) # almost a copy & paste of scan_chardata
562
574
  unless /&/ =~ s then
575
+ #STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
563
576
  on_attr_value s
564
577
  else
565
578
  s = $`
@@ -576,14 +589,14 @@ module XMLScan
576
589
  next
577
590
  end
578
591
  end
579
- ref = $`
592
+ orig = ?& + (ref = $`) + ?;
580
593
  s = $'
581
594
  if /\A[^#]/ =~ ref then
582
- on_attr_entityref ref
595
+ on_attr_entityref ref, orig
583
596
  elsif /\A#(\d+)\z/ =~ ref then
584
- on_attr_charref $1.to_i
597
+ on_attr_charref $1.to_i, orig
585
598
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
586
- on_attr_charref_hex $1.hex
599
+ on_attr_charref_hex $1.hex, orig
587
600
  else
588
601
  parse_error "invalid character reference `#{ref}'"
589
602
  end
@@ -682,6 +695,7 @@ module XMLScan
682
695
 
683
696
 
684
697
  def scan_etag(s)
698
+ orig="#{s}>"
685
699
  s[0,2] = '' # remove '</'
686
700
  if s.empty? then
687
701
  if @src.close_tag then # </>
@@ -689,14 +703,14 @@ module XMLScan
689
703
  else # </< or </[EOF]
690
704
  parse_error "parse error at `</'"
691
705
  s << '>' if @src.close_tag
692
- return on_chardata('</' << s)
706
+ return on_chardata '</' << s
693
707
  end
694
708
  elsif /[ \t\n\r]+/ =~ s then
695
709
  s1, s2 = $`, $'
696
710
  if s1.empty? then # </ tag
697
711
  parse_error "parse error at `</'"
698
712
  s << '>' if @src.close_tag
699
- return on_chardata('</' + s)
713
+ return on_chardata '</' + s
700
714
  elsif not s2.empty? then # </ta g
701
715
  parse_error "illegal whitespace is found within end tag `#{s1}'"
702
716
  while @src.get_tag
@@ -705,7 +719,7 @@ module XMLScan
705
719
  s = s1
706
720
  end
707
721
  found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
708
- on_etag s
722
+ on_etag s, orig
709
723
  end
710
724
 
711
725
 
@@ -745,6 +759,8 @@ module XMLScan
745
759
 
746
760
 
747
761
  def scan_stag(s)
762
+ hash = {}
763
+ orig = [s.dup]
748
764
  unless /(?=[\/ \t\n\r='"])/ =~ s then
749
765
  name = s
750
766
  name[0,1] = '' # remove `<'
@@ -753,54 +769,65 @@ module XMLScan
753
769
  return found_empty_stag
754
770
  else # << or <[EOF]
755
771
  parse_error "parse error at `<'"
756
- return on_chardata('<')
772
+ return on_chardata '<'
757
773
  end
758
774
  end
759
775
  on_stag name
760
776
  found_unclosed_stag name unless @src.close_tag
761
- on_stag_end name
777
+ on_stag_end name, orig*''+?>, {}
762
778
  else
779
+ k = nil
763
780
  name = $`
764
781
  s = $'
765
782
  name[0,1] = '' # remove `<'
766
783
  if name.empty? then # `< tag' or `<=`
767
784
  parse_error "parse error at `<'"
768
785
  s << '>' if @src.close_tag
769
- return on_chardata('<' << s)
786
+ return on_chardata '<' << s
770
787
  end
771
788
  on_stag name
772
789
  emptyelem = false
773
- key,val,error,qmark,c = nil
774
790
  begin
775
791
  continue = false
776
792
  s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
777
793
  ) { |key,val,error|
778
- if key then # key="value"
794
+ orig_val = []
795
+ if key then
779
796
  on_attribute key
797
+ k=key
798
+ orig_val << val
780
799
  qmark = val.slice!(0,1)
781
800
  if val[-1] == qmark[0] then
782
801
  val.chop!
783
- scan_attvalue val unless val.empty?
802
+ scan_attr_value val unless val.empty?
784
803
  else
785
- scan_attvalue val unless val.empty?
804
+ scan_attr_value val unless val.empty?
786
805
  begin
787
806
  s = @src.get
807
+ #STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
788
808
  unless s then
789
809
  parse_error "unterminated attribute `#{key}' meets EOF"
790
810
  break
791
811
  end
812
+ orig << s.dup
792
813
  c = s[0]
793
814
  val, s = s.split(qmark, 2)
815
+ orig_val << val
794
816
  if c == ?< then
795
817
  wellformed_error "`<' is found in attribute `#{key}'"
796
818
  elsif c != ?> then
797
- scan_attvalue '>'
819
+ #STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
820
+ orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
821
+ scan_attr_value ?>
798
822
  end
799
- scan_attvalue val if c
823
+ scan_attr_value val if c
800
824
  end until s
801
825
  continue = s # if eof then continue is false, else true.
802
826
  end
803
- on_attribute_end key
827
+ #STDERR << "attr:#{k}, #{orig_val}\n"
828
+ hash[k] = orig_val*''
829
+ #STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
830
+ on_attribute_end key #, orig_val*''
804
831
  elsif error then
805
832
  continue = s = found_stag_error(error)
806
833
  else
@@ -816,9 +843,11 @@ module XMLScan
816
843
  end
817
844
  end
818
845
  if emptyelem then
819
- on_stag_end_empty name
846
+ on_stag_end_empty name, orig*''+?>, hash
820
847
  else
821
- on_stag_end name
848
+ #STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
849
+ on_stag_end name, orig*''+?>, hash
850
+ #on_stag_end name, "<#{name}#{s}>", hash
822
851
  end
823
852
  end
824
853
  end
@@ -1067,10 +1096,10 @@ module XMLScan
1067
1096
 
1068
1097
 
1069
1098
  def scan_document
1070
- on_start_document
1099
+ on_start_document ''
1071
1100
  @src.prepare
1072
1101
  scan_prolog @src.get
1073
- on_end_document
1102
+ on_end_document ''
1074
1103
  end
1075
1104
 
1076
1105
 
@@ -9,15 +9,9 @@
9
9
 
10
10
  module XMLScan
11
11
 
12
- # The version like 'X.X.0' (TENNY is 0) means that this is an unstable
13
- # release. Incompatible changes will be applied to this version
14
- # without special notice. This version should be distributed as a
15
- # snapshot only.
16
- #
17
- # TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
18
- # release is a stable release.
19
-
20
- VERSION = '0.2.3'
21
- RELEASE_DATE = '2003-05-02'
12
+ GEMNAME = 'xmlscan'
13
+ VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
14
+ VERSION = open(VERSION_FILE).to_a*''.chop
15
+ RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
22
16
 
23
17
  end