RubyGems - xmlscan - Versions diffs - 0.2.3 → 0.3.0preb - Mend

xmlscan 0.2.3 → 0.3.0preb

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/lib/xmlscan/scanner.rb CHANGED

@@ -122,20 +122,29 @@ module XMLScan
       self
     end
+=begin
+  Managing source in a private array.
+  * tag oriented (?< and ?> are the key tokens
+  * ?> that aren't followed by another ?< or ?> are stripped in splitting
+=end
     def get
       pop or
         unless @eof then
           last = @last
           begin
-            src = @src.gets
-            unless src then
+            unless chunk = @src.gets then
               @eof = true
-              unshift last
-              last = nil
-              break
+              @last = nil
+              return last
+              #unshift last # to be popped after reverse!
+              #last = nil
+              #break
             end
-            a = src.split(/(?=<|>[<>])|>/, -1)
+            # negative lookahead: < or >< or >>
+            # so don't consume those (but split leaving them always at the
+            # end of chunks)
+            # consume (>) and split on >
+            a = chunk.split(/(?=<|>[<>])|>/, -1)
             if last then
               unless /\A[<>]/ =~ a.first then
                 a[0] = last << (a.first || '')
@@ -143,6 +152,7 @@ module XMLScan
                 push last
               end
             end
+            raise "size #{size}" if size > 1
             concat a
             last = pop
           end while empty?
@@ -223,7 +233,7 @@ module XMLScan
         last.push @last.inspect
       end
       a.push '#eof' if @eof
-      "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
+      "((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
     end
     def each
@@ -354,72 +364,72 @@ module XMLScan
       end
     end
-    def on_xmldecl_version(str)
-      @visitor.on_xmldecl_version str
+    def on_xmldecl_version(str, *a)
+      @visitor.on_xmldecl_version str, *a
     end
-    def on_xmldecl_encoding(str)
-      @visitor.on_xmldecl_encoding str
+    def on_xmldecl_encoding(str, *a)
+      @visitor.on_xmldecl_encoding str, *a
     end
-    def on_xmldecl_standalone(str)
-      @visitor.on_xmldecl_standalone str
+    def on_xmldecl_standalone(str, *a)
+      @visitor.on_xmldecl_standalone str, *a
     end
-    def on_xmldecl_other(name, value)
-      @visitor.on_xmldecl_other name, value
+    def on_xmldecl_other(name, value, *a)
+      @visitor.on_xmldecl_other name, value, *a
     end
-    def on_xmldecl_end
-      @visitor.on_xmldecl_end
+    def on_xmldecl_end(*a)
+      @visitor.on_xmldecl_end *a
     end
-    def on_doctype(root, pubid, sysid)
-      @visitor.on_doctype root, pubid, sysid
+    def on_doctype(root, pubid, sysid, *a)
+      @visitor.on_doctype root, pubid, sysid, *a
     end
-    def on_prolog_space(str)
-      @visitor.on_prolog_space str
+    def on_prolog_space(str, *a)
+      @visitor.on_prolog_space str, *a
     end
-    def on_comment(str)
-      @visitor.on_comment str
+    def on_comment(str, *a)
+      @visitor.on_comment str, *a
     end
-    def on_pi(target, pi)
-      @visitor.on_pi target, pi
+    def on_pi(target, pi, *a)
+      @visitor.on_pi target, pi, *a
     end
-    def on_chardata(str)
-      @visitor.on_chardata str
+    def on_chardata(str, *a)
+      @visitor.on_chardata str, *a
     end
-    def on_cdata(str)
-      @visitor.on_cdata str
+    def on_cdata(str, *a)
+      @visitor.on_cdata str, *a
     end
-    def on_etag(name)
-      @visitor.on_etag name
+    def on_etag(name, *a)
+      @visitor.on_etag name, *a
     end
-    def on_entityref(ref)
-      @visitor.on_entityref ref
+    def on_entityref(ref, *a)
+      @visitor.on_entityref ref, *a
     end
-    def on_charref(code)
-      @visitor.on_charref code
+    def on_charref(code, *a)
+      @visitor.on_charref code, *a
     end
-    def on_charref_hex(code)
-      @visitor.on_charref_hex code
+    def on_charref_hex(code, *a)
+      @visitor.on_charref_hex code, *a
     end
-    def on_start_document
-      @visitor.on_start_document
+    def on_start_document(*a)
+      @visitor.on_start_document *a
     end
-    def on_end_document
-      @visitor.on_end_document
+    def on_end_document(*a)
+      @visitor.on_end_document *a
     end
@@ -444,50 +454,51 @@ module XMLScan
     #
     #    A: on_chardata          ('HOGE')
-    def on_stag(name)
-      @visitor.on_stag name
+    def on_stag(name, *a)
+      @visitor.on_stag name, *a
     end
-    def on_attribute(name)
-      @visitor.on_attribute name
+    def on_attribute(name, *a)
+      @visitor.on_attribute name, *a
     end
-    def on_attr_value(str)
-      @visitor.on_attr_value str
+    def on_attr_value(str, *a)
+      @visitor.on_attr_value str, *a
     end
-    def on_attr_entityref(ref)
-      @visitor.on_attr_entityref ref
+    def on_attr_entityref(ref, *a)
+      @visitor.on_attr_entityref ref, *a
     end
-    def on_attr_charref(code)
-      @visitor.on_attr_charref code
+    def on_attr_charref(code, *a)
+      @visitor.on_attr_charref code, *a
     end
-    def on_attr_charref_hex(code)
-      @visitor.on_attr_charref_hex code
+    def on_attr_charref_hex(code, *a)
+      @visitor.on_attr_charref_hex code, *a
     end
-    def on_attribute_end(name)
-      @visitor.on_attribute_end name
+    def on_attribute_end(name, *a)
+      @visitor.on_attribute_end name, *a, *a
     end
-    def on_stag_end_empty(name)
-      @visitor.on_stag_end_empty name
+    def on_stag_end_empty(name, *a)
+      @visitor.on_stag_end_empty name, *a
     end
-    def on_stag_end(name)
-      @visitor.on_stag_end name
+    def on_stag_end(name, *a)
+      #STDERR << "ose #{name}, #{a.inspect}\n"
+      @visitor.on_stag_end name, *a
     end
+    S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
+    E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
     private
     module OptRegexp
       UTFSTR = "é"
-      S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
-      E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
       RE_ENCODINGS = {
         :n=>/e/n.encoding,
@@ -525,6 +536,7 @@ module XMLScan
         else
           s = $`
           on_chardata s unless s.empty?
+          #orig = $'.sub(/(?=;).*$/,'')
           ref = nil
           $'.split('&', -1).each { |s|
             unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
@@ -533,18 +545,18 @@ module XMLScan
                 parse_error "reference to `#{ref}' doesn't end with `;'"
               else
                 parse_error "`&' is not used for entity/character references"
-                on_chardata('&' << s)
+                on_chardata '&'+s
                 next
               end
             end
-            ref = $`
+            orig = ?& + (ref = $`) + ?;
             s = $'
             if /\A[^#]/ =~ ref then
-              on_entityref ref
+              on_entityref ref, orig
             elsif /\A#(\d+)\z/ =~ ref then
-              on_charref $1.to_i
+              on_charref $1.to_i, orig
             elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
-              on_charref_hex $1.hex
+              on_charref_hex $1.hex, orig
             else
               parse_error "invalid character reference `#{ref}'"
             end
@@ -558,8 +570,9 @@ module XMLScan
     end
-    def scan_attvalue(s)     # almostly copy & paste from scan_chardata
+    def scan_attr_value(s)     # almostly copy & paste from scan_chardata
       unless /&/ =~ s then
+        #STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
         on_attr_value s
       else
         s = $`
@@ -576,14 +589,14 @@ module XMLScan
               next
             end
           end
-          ref = $`
+          orig = ?& + (ref = $`) + ?;
           s = $'
           if /\A[^#]/ =~ ref then
-            on_attr_entityref ref
+            on_attr_entityref ref, orig
           elsif /\A#(\d+)\z/ =~ ref then
-            on_attr_charref $1.to_i
+            on_attr_charref $1.to_i, orig
           elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
-            on_attr_charref_hex $1.hex
+            on_attr_charref_hex $1.hex, orig
           else
             parse_error "invalid character reference `#{ref}'"
           end
@@ -682,6 +695,7 @@ module XMLScan
     def scan_etag(s)
+      orig="#{s}>"
       s[0,2] = ''  # remove '</'
       if s.empty? then
         if @src.close_tag then   # </>
@@ -689,14 +703,14 @@ module XMLScan
         else                     # </< or </[EOF]
           parse_error "parse error at `</'"
           s << '>' if @src.close_tag
-          return on_chardata('</' << s)
+          return on_chardata '</' << s
         end
       elsif /[ \t\n\r]+/ =~ s then
         s1, s2 = $`, $'
         if s1.empty? then                # </ tag
           parse_error "parse error at `</'"
           s << '>' if @src.close_tag
-          return on_chardata('</' + s)
+          return on_chardata '</' + s
         elsif not s2.empty? then         # </ta g
           parse_error "illegal whitespace is found within end tag `#{s1}'"
           while @src.get_tag
@@ -705,7 +719,7 @@ module XMLScan
         s = s1
       end
       found_unclosed_etag s unless @src.close_tag   # </tag< or </tag[EOF]
-      on_etag s
+      on_etag s, orig
     end
@@ -745,6 +759,8 @@ module XMLScan
     def scan_stag(s)
+      hash = {}
+      orig = [s.dup]
       unless /(?=[\/ \t\n\r='"])/ =~ s then
         name = s
         name[0,1] = ''        # remove `<'
@@ -753,54 +769,65 @@ module XMLScan
             return found_empty_stag
           else                     # << or <[EOF]
             parse_error "parse error at `<'"
-            return on_chardata('<')
+            return on_chardata '<'
           end
         end
         on_stag name
         found_unclosed_stag name unless @src.close_tag
-        on_stag_end name
+        on_stag_end name, orig*''+?>, {}
       else
+        k = nil
         name = $`
         s = $'
         name[0,1] = ''        # remove `<'
         if name.empty? then   # `< tag' or `<=`
           parse_error "parse error at `<'"
           s << '>' if @src.close_tag
-          return on_chardata('<' << s)
+          return on_chardata '<' << s
         end
         on_stag name
         emptyelem = false
-        key,val,error,qmark,c = nil
         begin
           continue = false
           s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
                  ) { |key,val,error|
-            if key then                # key="value"
+            orig_val = []
+            if key then
               on_attribute key
+              k=key
+              orig_val << val
               qmark = val.slice!(0,1)
               if val[-1] == qmark[0] then
                 val.chop!
-                scan_attvalue val unless val.empty?
+                scan_attr_value val unless val.empty?
               else
-                scan_attvalue val unless val.empty?
+                scan_attr_value val unless val.empty?
                 begin
                   s = @src.get
+                  #STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
                   unless s then
                     parse_error "unterminated attribute `#{key}' meets EOF"
                     break
                   end
+                  orig << s.dup
                   c = s[0]
                   val, s = s.split(qmark, 2)
+                  orig_val << val
                   if c == ?< then
                     wellformed_error "`<' is found in attribute `#{key}'"
                   elsif c != ?> then
-                    scan_attvalue '>'
+                    #STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
+                    orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
+                    scan_attr_value ?>
                   end
-                  scan_attvalue val if c
+                  scan_attr_value val if c
                 end until s
                 continue = s      # if eof then continue is false, else true.
               end
-              on_attribute_end key
+              #STDERR << "attr:#{k}, #{orig_val}\n"
+              hash[k] = orig_val*''
+              #STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
+              on_attribute_end key #, orig_val*''
             elsif error then
               continue = s = found_stag_error(error)
             else
@@ -816,9 +843,11 @@ module XMLScan
           end
         end
         if emptyelem then
-          on_stag_end_empty name
+          on_stag_end_empty name, orig*''+?>, hash
         else
-          on_stag_end name
+          #STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
+          on_stag_end name, orig*''+?>, hash
+          #on_stag_end name, "<#{name}#{s}>", hash
         end
       end
     end
@@ -1067,10 +1096,10 @@ module XMLScan
     def scan_document
-      on_start_document
+      on_start_document ''
       @src.prepare
       scan_prolog @src.get
-      on_end_document
+      on_end_document ''
     end

data/lib/xmlscan/version.rb CHANGED

@@ -9,15 +9,9 @@
 module XMLScan
-  # The version like 'X.X.0' (TENNY is 0) means that this is an unstable
-  # release. Incompatible changes will be applied to this version
-  # without special notice. This version should be distributed as a
-  # snapshot only.
-  #
-  # TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
-  # release is a stable release.
-  VERSION = '0.2.3'
-  RELEASE_DATE = '2003-05-02'
+  GEMNAME = 'xmlscan'
+  VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
+  VERSION = open(VERSION_FILE).to_a*''.chop
+  RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
 end