tsql_shparser 0.0.1

@@ -0,0 +1,375 @@
+ #!/usr/bin/env ruby
+ #
+ # tsql_tokenizer.rb: Tokenizer for T-SQL
+ # Copyright (c) 2005-2006 Shashank Date (shanko_date@yahoo.com)
+ #
+ # License: Ruby's
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ #
+
+
+ # Defines the Token type. A token is an array of three elements (a triplet):
+ #   [line_number, column_number, token_string]
+ # Some tokens may span multiple lines (e.g. quoted strings); in that case
+ # the line number and column number are those of the end of the token.
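+ #
+ # Illustrative example (not from the original source): the word SELECT
+ # ending at column 6 of line 3 would be carried as the triplet
+ #   [3, 6, "SELECT"]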
+ class Token
+   VERSION = "0.0.1"
+   DATA_TYPES = %w{
+     SQL_VARIANT DATETIME SMALLDATETIME FLOAT REAL DECIMAL MONEY
+     SMALLMONEY BIGINT INT SMALLINT TINYINT BIT NTEXT TEXT
+     IMAGE TIMESTAMP UNIQUEIDENTIFIER NVARCHAR NCHAR VARCHAR CHAR
+     VARBINARY BINARY
+   }
+
+   KEY_WORDS = %w{
+     ADD EXCEPT PERCENT
+     ALL EXEC PLAN
+     ALTER EXECUTE PRECISION
+     AND EXISTS PRIMARY
+     ANY EXIT PRINT
+     AS FETCH PROC
+     ASC FILE PROCEDURE
+     AUTHORIZATION FILLFACTOR PUBLIC
+     BACKUP FOR RAISERROR
+     BEGIN FOREIGN READ
+     BETWEEN FREETEXT READTEXT
+     BREAK FREETEXTTABLE RECONFIGURE
+     BROWSE FROM REFERENCES
+     BULK FULL REPLICATION
+     BY FUNCTION RESTORE
+     CASCADE GOTO RESTRICT
+     CASE GRANT RETURN
+     CHECK GROUP REVOKE
+     CHECKPOINT HAVING RIGHT
+     CLOSE HOLDLOCK ROLLBACK
+     CLUSTERED IDENTITY ROWCOUNT
+     COALESCE IDENTITY_INSERT ROWGUIDCOL
+     COLLATE IDENTITYCOL RULE
+     COLUMN IF SAVE
+     COMMIT IN SCHEMA
+     COMPUTE INDEX SELECT
+     CONSTRAINT INNER SESSION_USER
+     CONTAINS INSERT SET
+     CONTAINSTABLE INTERSECT SETUSER
+     CONTINUE INTO SHUTDOWN
+     CONVERT IS SOME
+     CREATE JOIN STATISTICS
+     CROSS KEY SYSTEM_USER
+     CURRENT KILL TABLE
+     CURRENT_DATE LEFT TEXTSIZE
+     CURRENT_TIME LIKE THEN TIES
+     CURRENT_TIMESTAMP LINENO TO
+     CURRENT_USER LOAD TOP
+     CURSOR NATIONAL TRAN
+     DATABASE NOCHECK TRANSACTION
+     DBCC NONCLUSTERED TRIGGER
+     DEALLOCATE NOT TRUNCATE
+     DECLARE NULL TSEQUAL
+     DEFAULT NULLIF UNION
+     DELETE OF UNIQUE
+     DENY OFF UPDATE
+     DESC OFFSETS UPDATETEXT
+     DISK ON USE
+     DISTINCT OPEN USER
+     DISTRIBUTED OPENDATASOURCE VALUES
+     DOUBLE OPENQUERY VARYING
+     DROP OPENROWSET VIEW
+     DUMMY OPENXML WAITFOR
+     DUMP OPTION WHEN
+     ELSE OR WHERE
+     END ORDER WHILE
+     ERRLVL OUTER WITH
+     ESCAPE OVER WRITETEXT
+   }
+
+   attr_reader :line
+   attr_reader :col
+   attr_reader :token_value
+
+   def initialize(line,col,val)
+     @line = line
+     @col = col
+     @token_value = val
+   end
+
+   def token_type
+     case
+     # when @token_value =~ /^#.*/ then :TempTable
+     when @token_value =~ /^[&|~^\/<>+%*!=-]/ then :Operator
+     when @token_value =~ /^@.*/ then :HostVariable
+     when @token_value =~ /^[.]$/ then :Dot
+     when @token_value =~ /^[()]$/ then :Paren
+     when @token_value =~ /^[;]$/ then :SemiColon
+     when @token_value =~ /^[,]$/ then :Comma
+     when @token_value =~ /.+:$/ then :Label
+     when @token_value =~ /^N?'/ then :String
+     when @token_value =~ /^([$])?\d+(\.\d+)?$/ then :Number
+     when DATA_TYPES.include?(@token_value) then :DataType
+     when KEY_WORDS.include?(@token_value) then :KeyWord
+     else :Id
+     end
+   end
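+
+   # Illustrative classifications (not from the original source):
+   #   Token.new(1, 6, "SELECT").token_type   #=> :KeyWord
+   #   Token.new(1, 10, "INT").token_type     #=> :DataType
+   #   Token.new(1, 13, "@x").token_type      #=> :HostVariable
+   #   Token.new(1, 21, "'abc'").token_type   #=> :String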
+
+   def ==(tok)
+     return false unless tok.is_a?(Token)
+
+     (self.line == tok.line) &&
+       (self.col == tok.col) &&
+       (self.token_value == tok.token_value)
+   end
+
+ end
+
+ # Class to tokenize a given string or file.
+ class Tokenizer
+   VERSION = "0.0.1"
+
+   def initialize(file=nil)
+     @input_file = file
+     @tokens = []
+     @position = 0
+   end
+
+   #------------------
+   # Private Methods:
+
+   ## Calculate the sum of the lengths of all elements of the array prior
+   ## to index n. Assumes that each element has the .length method defined.
+   def sum_prior_len(arr,n)
+     return 0 if n < 1
+     arr[n-1].length + sum_prior_len(arr,n-1)
+   end
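+   # Illustrative (not from the original source):
+   #   sum_prior_len(%w{a bb ccc}, 2)   #=> 3   ("a".length + "bb".length)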
+
+   private :sum_prior_len
+
+   ## Split the token string into its subparts and return an array of triplets
+   def tok_split(line,pos,tok_str,preserve_case=nil)
+
+     t = (preserve_case ? tok_str.strip : tok_str.strip.upcase)
+     arr = (t.length > 0) ? [[line,pos,t]] : []
+     tk = []
+     # Pad operators/punctuation with spaces, break qualified names
+     # (A.B, .B, A.) apart on the dot, then split on whitespace. Strings
+     # ending in a quote or bracket (quoted tokens) are left whole.
+     tk = (t.gsub(/([&|~^\/<>+%*,;!()=-])/){' ' + $1 + ' '}.
+            gsub(/(\W)(\d*?[.]\d*)/){$1 + ' ' + $2 + ' '}.
+            gsub(/([A-Za-z_]\w*)[.]([A-Za-z_]\w*)?/){ ($1 || '') + ' . ' + ($2 || '') }.
+            gsub(/^[.]([A-Za-z_]\w*)/){ ' . ' + ($1 || '') }
+          ).split unless t =~ /['"\]]$/ #'
+
+     if (tk.size > 1)
+       arr = []
+       tk.each_with_index{|e,i|
+         # Recover the ending column of each subpart from the ending column
+         # of the whole string and the lengths of the prior subparts.
+         end_pos = (pos - t.length + sum_prior_len(tk,i) + e.size)
+         arr << [line, end_pos, e]
+       }
+     end
+
+     arr
+   end
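+   # Illustrative (not from the original source): a compound string arriving
+   # as one token is split apart, each part keeping its own ending column:
+   #   tok_split(1, 5, "a,b")   #=> [[1, 3, "A"], [1, 4, ","], [1, 5, "B"]]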
+
+   private :tok_split
+
+   # ------------------
+   # Public Methods:
+
+   ## Split the string into its sub-strings and return an array of triplets.
+   ## Each triplet contains the ending line number, ending column number
+   ## and the sub-string (token string). A token string may spill over
+   ## multiple lines.
+   ##
+   def tokenize_string(str)
+     stream = str.split('')
+
+     slc = nil      # Single Line Comment indicator
+     mlc = nil      # Multi Line Comment indicator
+     sq  = nil      # Single Quote indicator
+     dq  = nil      # Double Quote indicator
+     bkt = nil      # Bracket indicator
+
+     qtok = ""      # accumulator for quoted-string tokens
+     atok = ""      # accumulator for all types of tokens except quoted strings
+     qstr = ""      # final quoted string
+     tok_arr = []   # token array
+
+     col = 0        # column number of the token
+     line = 1       # line number of the token
+     i = 1          # current position in the stream
+     prev_c = ''    # single-character look-behind
+
+     while stream != []
+       c = stream.shift
+       case
+       when c =~ /[ \t]/
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+       when ((prev_c == '-') and (c == '-'))
+         unless (slc or mlc or sq or dq or bkt)
+           slc = i
+           atok.chop!
+           #puts "starting a single-line comment @ #{i}"
+         end
+       when ((prev_c == '/') and (c == '*'))
+         unless (slc or mlc or sq or dq or bkt)
+           mlc = i
+           atok.chop!
+           #puts "starting a multi-line comment @ #{i}"
+         end
+       when ((prev_c == '*') and (c == '/'))
+         if (mlc and (mlc < (i-1)))
+           mlc = nil
+           c = ''
+           #puts " ending a multi-line comment @ #{i}"
+         end
+       when ((c == "\r") or (c == "\n"))
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+
+         (col = 0; line += 1) if (c == "\n")
+         if slc
+           slc = nil
+           c = ''
+           #puts " ending a single-line comment @ #{i}"
+         end
+       when (c == "'")
+         unless (slc or mlc or dq or bkt)
+           if sq
+             ### WARNING:
+             # This logic is wrong: it assumes the end of the single-quote
+             # token. But in case of an embedded/escaped single-quote the
+             # token has not yet ended. Needs to be fixed in a later version.
+             ###
+             sq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending single-quote @ #{i}"
+           else
+             sq = i
+
+             # An immediately preceding N marks a Unicode string constant:
+             # keep it as part of the quoted token.
+             if prev_c == 'N'
+               qtok = 'N'
+               atok.chop!
+               temp_pos = col-1
+             else
+               qtok = ""
+               temp_pos = col
+             end
+             tok_arr += tok_split(line,temp_pos,atok)
+             atok = ""
+             #puts " starting single-quote @ #{i}"
+           end
+         end
+       when (c == '"')
+         unless (slc or mlc or sq or bkt)
+           if dq
+             dq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending double-quote @ #{i}"
+           else
+             dq = i
+             qtok = ""
+             tok_arr += tok_split(line,col,atok)
+             atok = ""
+             #puts " starting double-quote @ #{i}"
+           end
+         end
+       when (c == '[')
+         unless (slc or mlc or sq or dq or bkt)
+           bkt = i
+           qtok = ""
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+           #puts " starting square-bracket @ #{i}"
+         end
+       when (c == ']')
+         if bkt
+           bkt = nil
+           qtok += c
+           c = ''
+           qstr = qtok
+           #puts " ending square-bracket @ #{i}"
+         end
+       end
+
+       qtok += c if (sq or dq or bkt)
+       atok += c unless (slc or mlc or sq or dq or bkt)
+
+       prev_c = c
+       col += 1
+       i += 1
+
+       # A completed quoted string is emitted as a single token, case preserved.
+       (tok_arr += tok_split(line,col,qstr,true); qstr = "") if qstr.size > 0
+
+     end
+
+     tok_arr += tok_split(line, col, atok)
+
+     raise "#{@input_file} Unmatched quoted string at (#{line},#{col})" if (sq or dq or bkt)
+     raise "#{@input_file} Incomplete comment at (#{line},#{col})" if mlc
+
+     @tokens = tok_arr
+
+   end
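+   # Illustrative (not from the original source):
+   #   Tokenizer.new.tokenize_string("select *\nfrom T")
+   #   #=> [[1, 6, "SELECT"], [1, 8, "*"], [2, 5, "FROM"], [2, 7, "T"]]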
+
+   ## Tokenize the given file and return the number of tokens found
+   ## (0 if no file was given or it could not be read).
+   def tokenize_file(file=nil)
+     @input_file ||= file
+     if @input_file
+       arr = IO.readlines(@input_file)
+       tokenize_string(arr.join)
+     end
+   rescue
+     puts $!.to_s
+   ensure
+     return @tokens.length
+   end
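+   # Illustrative (not from the original source; "query.sql" is a
+   # hypothetical file):
+   #   Tokenizer.new("query.sql").tokenize_file   #=> number of tokens read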
+
+   ## Return the token at the cursor as a Token object (or nil when
+   ## exhausted) and advance the cursor.
+   def get_next_token
+     tok = ((@position >= 0) ? @tokens[@position] : nil)
+     return tok unless tok
+
+     token = Token.new(*tok)
+     @position += 1
+     token
+   end
+
+   ## Return the token m positions behind the cursor, without moving it.
+   def look_back(m)
+     tok = ((@position >= m) ? @tokens[@position-m] : nil)
+     return tok unless tok
+     token = Token.new(*tok)
+     token
+   end
+
+   def current_token;  look_back(0); end
+   def previous_token; look_back(1); end
+
+   ## Step the cursor back one token; returns the new position.
+   def unget_token
+     @position -= 1 if (@position >= 0)
+     @position
+   end
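+
+   # Illustrative cursor usage (not from the original source):
+   #   t = Tokenizer.new
+   #   t.tokenize_string("select 10")
+   #   t.get_next_token.token_value   #=> "SELECT"
+   #   t.unget_token                  # push the cursor back
+   #   t.get_next_token.token_value   #=> "SELECT" (same token again)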
+
+ end
+
+ if $0 == __FILE__
+   t = Tokenizer.new("..\\..\\test\\tsql_scripts\\simple.sql")
+   t.tokenize_file
+   #t.tokenize_string("select 10 as ten , 'this is a word' as word, '[two\n lines \n, in \"between\" this ]' as two")
+   begin
+     s = t.get_next_token
+     p [s.line, s.col, s.token_value, s.token_type] if s
+   end while s
+ end
+
@@ -0,0 +1,211 @@
+ $LOAD_PATH.unshift "../lib" if File.basename(Dir.pwd) == "test"
+ require 'test/unit'
+ require 'tsql_shparser'
+
+ class Tst_Tokenizer < Test::Unit::TestCase
+
+   def setup
+     @t = Tokenizer.new
+   end
+
+   def teardown
+     @t = nil
+   end
+
+   def test00
+     assert ! @t.get_next_token
+     assert ! @t.look_back(10)
+     assert ! @t.current_token
+     assert ! @t.previous_token
+     assert @t.tokenize_file == 0
+     assert @t.unget_token == -1
+     assert @t.tokenize_string('') == []
+   end
+
+   def test01
+     arr = @t.tokenize_string("select")
+     assert arr.length == 1
+
+     arr = @t.tokenize_string("select *")
+     assert arr.length == 2
+
+     arr = @t.tokenize_string("select *\nfrom ")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select *\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select count(*) from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select count(*) from S.T")
+     assert arr.length == 9
+   end
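+
+   # The counts above follow from the splitting rules (note not from the
+   # original source): parentheses and dots become tokens of their own, so
+   # "select count(*) from S.T" yields the nine tokens
+   #   SELECT COUNT ( * ) FROM S . T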
+
+   def test010
+     arr = @t.tokenize_string("select count(*),10.5 from S.T")
+     assert arr.length == 11
+   end
+
+   def test02
+     arr = @t.tokenize_string("select * -- this is comment\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select *, -- this is a 'comment'\n cmnt from T")
+     assert arr.length == 6
+
+     arr = @t.tokenize_string("select *, '-- this is comment\n' cmnt from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select '/*' as startc, \n '*/' as endc")
+     assert arr.length == 8
+
+     arr = @t.tokenize_string("select /* as startc, \n */ as endc")
+     assert arr.length == 3
+   end
+
+   def test03
+     arr = @t.tokenize_string("select 10")
+     tk1 = @t.get_next_token
+     @t.unget_token
+     tk2 = @t.get_next_token
+     assert tk1 == tk2
+   end
+
+   def test04
+     # This is not being tokenized correctly yet: the embedded/escaped
+     # single quotes end the string token prematurely (see the WARNING
+     # in tokenize_string).
+     str = %Q{
+       select ' Code = ''' + B + '''' as code
+       from T
+     }
+     arr = @t.tokenize_string(str)
+     assert arr.length == 10
+   end
+
+   def test05
+     arr = @t.tokenize_string("select A.COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test050
+     arr = @t.tokenize_string("select A.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test051
+     arr = @t.tokenize_string("select A1.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test052
+     arr = @t.tokenize_string("select [A1].[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test053
+     arr = @t.tokenize_string("select [A1].COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test06
+     arr = @t.tokenize_string("select A.* from T")
+     assert arr.length == 6
+   end
+
+   def test060
+     arr = @t.tokenize_string("select A1.* from T")
+     assert arr.length == 6
+   end
+
+   def test061
+     arr = @t.tokenize_string("select [A].* from T")
+     assert arr.length == 6
+   end
+
+   def test07
+     assert_raise(RuntimeError) {@t.tokenize_string('/*/')}
+     assert_raise(RuntimeError) {@t.tokenize_string("select ' as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select \" as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select [ as done")}
+   end
+
+   def test08
+     arr = @t.tokenize_string("select 'Long \n multiline \n string constant' cstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select N'Long \n multiline \n Unicode string constant' ucstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select [Long \n multiline \n column name] as str")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string(%Q{select "Long \n multiline \n alias" = 'str'})
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select A.col1, [Long \n multiline \n column name] as str from T A")
+     assert arr.length == 11
+   end
+
+   def test09
+     arr = @t.tokenize_string("select case when A!=B then A else B end MAX_AB")
+     assert arr.length == 13
+   end
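+
+   # 13 tokens because every operator character is split out on its own
+   # (note not from the original source), so A!=B becomes A ! = B:
+   #   SELECT CASE WHEN A ! = B THEN A ELSE B END MAX_AB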
+
+   def test10
+     arr = @t.tokenize_string("select case when ((A=B)) then A else B end MAX_AB")
+     assert arr.length == 16
+   end
+
+   def test11
+     arr = @t.tokenize_string(%Q{select '+=-%/~`!@#$&*)"(-_][|:;,.<>?/\\{}^' AB})
+     assert arr.length == 3
+   end
+
+   def test12
+     arr = @t.tokenize_string("select 10.*5 from T")
+     assert arr.length == 6
+   end
+
+   def test13
+     arr = @t.tokenize_string("select 10.*\n.5 from T")
+     assert arr.length == 6
+   end
+
+   def test14
+     arr = @t.tokenize_string("select [Long's Island] from T")
+     assert arr.length == 4
+   end
+
+ end
+
+ __END__