xmlscan 0.2.3 → 0.3.0preb

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -122,20 +122,29 @@ module XMLScan
122
122
  self
123
123
  end
124
124
 
125
-
125
+ =begin
126
+ Managing source in a private array.
127
+ * tag oriented (?< and ?> are the key tokens)
128
+ * ?> that aren't followed by another ?< or ?> are stripped in splitting
129
+ =end
126
130
  def get
127
131
  pop or
128
132
  unless @eof then
129
133
  last = @last
130
134
  begin
131
- src = @src.gets
132
- unless src then
135
+ unless chunk = @src.gets then
133
136
  @eof = true
134
- unshift last
135
- last = nil
136
- break
137
+ @last = nil
138
+ return last
139
+ #unshift last # to be popped after reverse!
140
+ #last = nil
141
+ #break
137
142
  end
138
- a = src.split(/(?=<|>[<>])|>/, -1)
143
+ # positive lookahead: < or >< or >>
144
+ # so don't consume those (but split leaving them always at the
145
+ # start of the next chunk)
146
+ # consume (>) and split on >
147
+ a = chunk.split(/(?=<|>[<>])|>/, -1)
139
148
  if last then
140
149
  unless /\A[<>]/ =~ a.first then
141
150
  a[0] = last << (a.first || '')
@@ -143,6 +152,7 @@ module XMLScan
143
152
  push last
144
153
  end
145
154
  end
155
+ raise "size #{size}" if size > 1
146
156
  concat a
147
157
  last = pop
148
158
  end while empty?
@@ -223,7 +233,7 @@ module XMLScan
223
233
  last.push @last.inspect
224
234
  end
225
235
  a.push '#eof' if @eof
226
- "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
236
+ "((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
227
237
  end
228
238
 
229
239
  def each
@@ -354,72 +364,72 @@ module XMLScan
354
364
  end
355
365
  end
356
366
 
357
- def on_xmldecl_version(str)
358
- @visitor.on_xmldecl_version str
367
+ def on_xmldecl_version(str, *a)
368
+ @visitor.on_xmldecl_version str, *a
359
369
  end
360
370
 
361
- def on_xmldecl_encoding(str)
362
- @visitor.on_xmldecl_encoding str
371
+ def on_xmldecl_encoding(str, *a)
372
+ @visitor.on_xmldecl_encoding str, *a
363
373
  end
364
374
 
365
- def on_xmldecl_standalone(str)
366
- @visitor.on_xmldecl_standalone str
375
+ def on_xmldecl_standalone(str, *a)
376
+ @visitor.on_xmldecl_standalone str, *a
367
377
  end
368
378
 
369
- def on_xmldecl_other(name, value)
370
- @visitor.on_xmldecl_other name, value
379
+ def on_xmldecl_other(name, value, *a)
380
+ @visitor.on_xmldecl_other name, value, *a
371
381
  end
372
382
 
373
- def on_xmldecl_end
374
- @visitor.on_xmldecl_end
383
+ def on_xmldecl_end(*a)
384
+ @visitor.on_xmldecl_end *a
375
385
  end
376
386
 
377
- def on_doctype(root, pubid, sysid)
378
- @visitor.on_doctype root, pubid, sysid
387
+ def on_doctype(root, pubid, sysid, *a)
388
+ @visitor.on_doctype root, pubid, sysid, *a
379
389
  end
380
390
 
381
- def on_prolog_space(str)
382
- @visitor.on_prolog_space str
391
+ def on_prolog_space(str, *a)
392
+ @visitor.on_prolog_space str, *a
383
393
  end
384
394
 
385
- def on_comment(str)
386
- @visitor.on_comment str
395
+ def on_comment(str, *a)
396
+ @visitor.on_comment str, *a
387
397
  end
388
398
 
389
- def on_pi(target, pi)
390
- @visitor.on_pi target, pi
399
+ def on_pi(target, pi, *a)
400
+ @visitor.on_pi target, pi, *a
391
401
  end
392
402
 
393
- def on_chardata(str)
394
- @visitor.on_chardata str
403
+ def on_chardata(str, *a)
404
+ @visitor.on_chardata str, *a
395
405
  end
396
406
 
397
- def on_cdata(str)
398
- @visitor.on_cdata str
407
+ def on_cdata(str, *a)
408
+ @visitor.on_cdata str, *a
399
409
  end
400
410
 
401
- def on_etag(name)
402
- @visitor.on_etag name
411
+ def on_etag(name, *a)
412
+ @visitor.on_etag name, *a
403
413
  end
404
414
 
405
- def on_entityref(ref)
406
- @visitor.on_entityref ref
415
+ def on_entityref(ref, *a)
416
+ @visitor.on_entityref ref, *a
407
417
  end
408
418
 
409
- def on_charref(code)
410
- @visitor.on_charref code
419
+ def on_charref(code, *a)
420
+ @visitor.on_charref code, *a
411
421
  end
412
422
 
413
- def on_charref_hex(code)
414
- @visitor.on_charref_hex code
423
+ def on_charref_hex(code, *a)
424
+ @visitor.on_charref_hex code, *a
415
425
  end
416
426
 
417
- def on_start_document
418
- @visitor.on_start_document
427
+ def on_start_document(*a)
428
+ @visitor.on_start_document *a
419
429
  end
420
430
 
421
- def on_end_document
422
- @visitor.on_end_document
431
+ def on_end_document(*a)
432
+ @visitor.on_end_document *a
423
433
  end
424
434
 
425
435
 
@@ -444,50 +454,51 @@ module XMLScan
444
454
  #
445
455
  # A: on_chardata ('HOGE')
446
456
 
447
- def on_stag(name)
448
- @visitor.on_stag name
457
+ def on_stag(name, *a)
458
+ @visitor.on_stag name, *a
449
459
  end
450
460
 
451
- def on_attribute(name)
452
- @visitor.on_attribute name
461
+ def on_attribute(name, *a)
462
+ @visitor.on_attribute name, *a
453
463
  end
454
464
 
455
- def on_attr_value(str)
456
- @visitor.on_attr_value str
465
+ def on_attr_value(str, *a)
466
+ @visitor.on_attr_value str, *a
457
467
  end
458
468
 
459
- def on_attr_entityref(ref)
460
- @visitor.on_attr_entityref ref
469
+ def on_attr_entityref(ref, *a)
470
+ @visitor.on_attr_entityref ref, *a
461
471
  end
462
472
 
463
- def on_attr_charref(code)
464
- @visitor.on_attr_charref code
473
+ def on_attr_charref(code, *a)
474
+ @visitor.on_attr_charref code, *a
465
475
  end
466
476
 
467
- def on_attr_charref_hex(code)
468
- @visitor.on_attr_charref_hex code
477
+ def on_attr_charref_hex(code, *a)
478
+ @visitor.on_attr_charref_hex code, *a
469
479
  end
470
480
 
471
- def on_attribute_end(name)
472
- @visitor.on_attribute_end name
481
+ def on_attribute_end(name, *a)
482
+ @visitor.on_attribute_end name, *a, *a
473
483
  end
474
484
 
475
- def on_stag_end_empty(name)
476
- @visitor.on_stag_end_empty name
485
+ def on_stag_end_empty(name, *a)
486
+ @visitor.on_stag_end_empty name, *a
477
487
  end
478
488
 
479
- def on_stag_end(name)
480
- @visitor.on_stag_end name
489
+ def on_stag_end(name, *a)
490
+ #STDERR << "ose #{name}, #{a.inspect}\n"
491
+ @visitor.on_stag_end name, *a
481
492
  end
482
493
 
483
494
 
495
+ S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
496
+ E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
484
497
 
485
498
  private
486
499
 
487
500
  module OptRegexp
488
501
  UTFSTR = "é"
489
- S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
490
- E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
491
502
 
492
503
  RE_ENCODINGS = {
493
504
  :n=>/e/n.encoding,
@@ -525,6 +536,7 @@ module XMLScan
525
536
  else
526
537
  s = $`
527
538
  on_chardata s unless s.empty?
539
+ #orig = $'.sub(/(?=;).*$/,'')
528
540
  ref = nil
529
541
  $'.split('&', -1).each { |s|
530
542
  unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
@@ -533,18 +545,18 @@ module XMLScan
533
545
  parse_error "reference to `#{ref}' doesn't end with `;'"
534
546
  else
535
547
  parse_error "`&' is not used for entity/character references"
536
- on_chardata('&' << s)
548
+ on_chardata '&'+s
537
549
  next
538
550
  end
539
551
  end
540
- ref = $`
552
+ orig = ?& + (ref = $`) + ?;
541
553
  s = $'
542
554
  if /\A[^#]/ =~ ref then
543
- on_entityref ref
555
+ on_entityref ref, orig
544
556
  elsif /\A#(\d+)\z/ =~ ref then
545
- on_charref $1.to_i
557
+ on_charref $1.to_i, orig
546
558
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
547
- on_charref_hex $1.hex
559
+ on_charref_hex $1.hex, orig
548
560
  else
549
561
  parse_error "invalid character reference `#{ref}'"
550
562
  end
@@ -558,8 +570,9 @@ module XMLScan
558
570
  end
559
571
 
560
572
 
561
- def scan_attvalue(s) # almost a copy & paste of scan_chardata
573
+ def scan_attr_value(s) # almost a copy & paste of scan_chardata
562
574
  unless /&/ =~ s then
575
+ #STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
563
576
  on_attr_value s
564
577
  else
565
578
  s = $`
@@ -576,14 +589,14 @@ module XMLScan
576
589
  next
577
590
  end
578
591
  end
579
- ref = $`
592
+ orig = ?& + (ref = $`) + ?;
580
593
  s = $'
581
594
  if /\A[^#]/ =~ ref then
582
- on_attr_entityref ref
595
+ on_attr_entityref ref, orig
583
596
  elsif /\A#(\d+)\z/ =~ ref then
584
- on_attr_charref $1.to_i
597
+ on_attr_charref $1.to_i, orig
585
598
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
586
- on_attr_charref_hex $1.hex
599
+ on_attr_charref_hex $1.hex, orig
587
600
  else
588
601
  parse_error "invalid character reference `#{ref}'"
589
602
  end
@@ -682,6 +695,7 @@ module XMLScan
682
695
 
683
696
 
684
697
  def scan_etag(s)
698
+ orig="#{s}>"
685
699
  s[0,2] = '' # remove '</'
686
700
  if s.empty? then
687
701
  if @src.close_tag then # </>
@@ -689,14 +703,14 @@ module XMLScan
689
703
  else # </< or </[EOF]
690
704
  parse_error "parse error at `</'"
691
705
  s << '>' if @src.close_tag
692
- return on_chardata('</' << s)
706
+ return on_chardata '</' << s
693
707
  end
694
708
  elsif /[ \t\n\r]+/ =~ s then
695
709
  s1, s2 = $`, $'
696
710
  if s1.empty? then # </ tag
697
711
  parse_error "parse error at `</'"
698
712
  s << '>' if @src.close_tag
699
- return on_chardata('</' + s)
713
+ return on_chardata '</' + s
700
714
  elsif not s2.empty? then # </ta g
701
715
  parse_error "illegal whitespace is found within end tag `#{s1}'"
702
716
  while @src.get_tag
@@ -705,7 +719,7 @@ module XMLScan
705
719
  s = s1
706
720
  end
707
721
  found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
708
- on_etag s
722
+ on_etag s, orig
709
723
  end
710
724
 
711
725
 
@@ -745,6 +759,8 @@ module XMLScan
745
759
 
746
760
 
747
761
  def scan_stag(s)
762
+ hash = {}
763
+ orig = [s.dup]
748
764
  unless /(?=[\/ \t\n\r='"])/ =~ s then
749
765
  name = s
750
766
  name[0,1] = '' # remove `<'
@@ -753,54 +769,65 @@ module XMLScan
753
769
  return found_empty_stag
754
770
  else # << or <[EOF]
755
771
  parse_error "parse error at `<'"
756
- return on_chardata('<')
772
+ return on_chardata '<'
757
773
  end
758
774
  end
759
775
  on_stag name
760
776
  found_unclosed_stag name unless @src.close_tag
761
- on_stag_end name
777
+ on_stag_end name, orig*''+?>, {}
762
778
  else
779
+ k = nil
763
780
  name = $`
764
781
  s = $'
765
782
  name[0,1] = '' # remove `<'
766
783
  if name.empty? then # `< tag' or `<=`
767
784
  parse_error "parse error at `<'"
768
785
  s << '>' if @src.close_tag
769
- return on_chardata('<' << s)
786
+ return on_chardata '<' << s
770
787
  end
771
788
  on_stag name
772
789
  emptyelem = false
773
- key,val,error,qmark,c = nil
774
790
  begin
775
791
  continue = false
776
792
  s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
777
793
  ) { |key,val,error|
778
- if key then # key="value"
794
+ orig_val = []
795
+ if key then
779
796
  on_attribute key
797
+ k=key
798
+ orig_val << val
780
799
  qmark = val.slice!(0,1)
781
800
  if val[-1] == qmark[0] then
782
801
  val.chop!
783
- scan_attvalue val unless val.empty?
802
+ scan_attr_value val unless val.empty?
784
803
  else
785
- scan_attvalue val unless val.empty?
804
+ scan_attr_value val unless val.empty?
786
805
  begin
787
806
  s = @src.get
807
+ #STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
788
808
  unless s then
789
809
  parse_error "unterminated attribute `#{key}' meets EOF"
790
810
  break
791
811
  end
812
+ orig << s.dup
792
813
  c = s[0]
793
814
  val, s = s.split(qmark, 2)
815
+ orig_val << val
794
816
  if c == ?< then
795
817
  wellformed_error "`<' is found in attribute `#{key}'"
796
818
  elsif c != ?> then
797
- scan_attvalue '>'
819
+ #STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
820
+ orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
821
+ scan_attr_value ?>
798
822
  end
799
- scan_attvalue val if c
823
+ scan_attr_value val if c
800
824
  end until s
801
825
  continue = s # if eof then continue is false, else true.
802
826
  end
803
- on_attribute_end key
827
+ #STDERR << "attr:#{k}, #{orig_val}\n"
828
+ hash[k] = orig_val*''
829
+ #STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
830
+ on_attribute_end key #, orig_val*''
804
831
  elsif error then
805
832
  continue = s = found_stag_error(error)
806
833
  else
@@ -816,9 +843,11 @@ module XMLScan
816
843
  end
817
844
  end
818
845
  if emptyelem then
819
- on_stag_end_empty name
846
+ on_stag_end_empty name, orig*''+?>, hash
820
847
  else
821
- on_stag_end name
848
+ #STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
849
+ on_stag_end name, orig*''+?>, hash
850
+ #on_stag_end name, "<#{name}#{s}>", hash
822
851
  end
823
852
  end
824
853
  end
@@ -1067,10 +1096,10 @@ module XMLScan
1067
1096
 
1068
1097
 
1069
1098
  def scan_document
1070
- on_start_document
1099
+ on_start_document ''
1071
1100
  @src.prepare
1072
1101
  scan_prolog @src.get
1073
- on_end_document
1102
+ on_end_document ''
1074
1103
  end
1075
1104
 
1076
1105
 
@@ -9,15 +9,9 @@
9
9
 
10
10
  module XMLScan
11
11
 
12
- # The version like 'X.X.0' (TENNY is 0) means that this is an unstable
13
- # release. Incompatible changes will be applied to this version
14
- # without special notice. This version should be distributed as a
15
- # snapshot only.
16
- #
17
- # TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
18
- # release is a stable release.
19
-
20
- VERSION = '0.2.3'
21
- RELEASE_DATE = '2003-05-02'
12
+ GEMNAME = 'xmlscan'
13
+ VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
14
+ VERSION = open(VERSION_FILE).to_a*''.chop
15
+ RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
22
16
 
23
17
  end