tsql_shparser 0.0.1

@@ -0,0 +1,375 @@
+ #!/usr/bin/env ruby
+ #
+ # tsql_tokenizer.rb: Tokenizer for t-SQL
+ # Copyright (c) 2005-2006 Shashank Date (shanko_date@yahoo.com)
+ #
+ # License: Ruby's
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ #
+
+
+ # Defines the Token type. A token is an array of three elements (triplet):
+ #   [line_number, column_number, token_string]
+ # Some tokens may span multiple lines (e.g. quoted strings), in which case
+ # the line number and column number are those of the end of the token.
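+ # For example, under this convention a SELECT keyword ending at column 6
+ # of line 3 would be represented as the triplet [3, 6, "SELECT"].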
+ class Token
+   VERSION = "0.0.1"
+   DATA_TYPES = %w{
+     SQL_VARIANT DATETIME SMALLDATETIME FLOAT REAL DECIMAL MONEY
+     SMALLMONEY BIGINT INT SMALLINT TINYINT BIT NTEXT TEXT
+     IMAGE TIMESTAMP UNIQUEIDENTIFIER NVARCHAR NCHAR VARCHAR CHAR
+     VARBINARY BINARY
+   }
+
+   KEY_WORDS = %w{
+     ADD EXCEPT PERCENT
+     ALL EXEC PLAN
+     ALTER EXECUTE PRECISION
+     AND EXISTS PRIMARY
+     ANY EXIT PRINT
+     AS FETCH PROC
+     ASC FILE PROCEDURE
+     AUTHORIZATION FILLFACTOR PUBLIC
+     BACKUP FOR RAISERROR
+     BEGIN FOREIGN READ
+     BETWEEN FREETEXT READTEXT
+     BREAK FREETEXTTABLE RECONFIGURE
+     BROWSE FROM REFERENCES
+     BULK FULL REPLICATION
+     BY FUNCTION RESTORE
+     CASCADE GOTO RESTRICT
+     CASE GRANT RETURN
+     CHECK GROUP REVOKE
+     CHECKPOINT HAVING RIGHT
+     CLOSE HOLDLOCK ROLLBACK
+     CLUSTERED IDENTITY ROWCOUNT
+     COALESCE IDENTITY_INSERT ROWGUIDCOL
+     COLLATE IDENTITYCOL RULE
+     COLUMN IF SAVE
+     COMMIT IN SCHEMA
+     COMPUTE INDEX SELECT
+     CONSTRAINT INNER SESSION_USER
+     CONTAINS INSERT SET
+     CONTAINSTABLE INTERSECT SETUSER
+     CONTINUE INTO SHUTDOWN
+     CONVERT IS SOME
+     CREATE JOIN STATISTICS
+     CROSS KEY SYSTEM_USER
+     CURRENT KILL TABLE
+     CURRENT_DATE LEFT TEXTSIZE
+     CURRENT_TIME LIKE THEN TIES
+     CURRENT_TIMESTAMP LINENO TO
+     CURRENT_USER LOAD TOP
+     CURSOR NATIONAL TRAN
+     DATABASE NOCHECK TRANSACTION
+     DBCC NONCLUSTERED TRIGGER
+     DEALLOCATE NOT TRUNCATE
+     DECLARE NULL TSEQUAL
+     DEFAULT NULLIF UNION
+     DELETE OF UNIQUE
+     DENY OFF UPDATE
+     DESC OFFSETS UPDATETEXT
+     DISK ON USE
+     DISTINCT OPEN USER
+     DISTRIBUTED OPENDATASOURCE VALUES
+     DOUBLE OPENQUERY VARYING
+     DROP OPENROWSET VIEW
+     DUMMY OPENXML WAITFOR
+     DUMP OPTION WHEN
+     ELSE OR WHERE
+     END ORDER WHILE
+     ERRLVL OUTER WITH
+     ESCAPE OVER WRITETEXT
+   }
+
+   attr_reader :line
+   attr_reader :col
+   attr_reader :token_value
+
+   def initialize(line,col,val)
+     @line = line
+     @col = col
+     @token_value = val
+   end
+
+   def token_type
+     case
+     # when @token_value =~ /^#.*/              then :TempTable
+     when @token_value =~ /^[&|~^\/<>+%*!=-]/   then :Operator
+     when @token_value =~ /^@.*/                then :HostVariable
+     when @token_value =~ /^[.]$/               then :Dot
+     when @token_value =~ /^[()]$/              then :Paran
+     when @token_value =~ /^[;]$/               then :SemiColon
+     when @token_value =~ /^[,]$/               then :Comma
+     when @token_value =~ /.+:$/                then :Label
+     when @token_value =~ /^N?'/                then :String
+     when @token_value =~ /^([$])?\d+(\.\d+)?$/ then :Number
+     when DATA_TYPES.include?(@token_value)     then :DataType
+     when KEY_WORDS.include?(@token_value)      then :KeyWord
+     else :Id
+     end
+   end
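+   # Illustrative classifications (hypothetical tokens, assuming the upcased
+   # values produced by the tokenizer below):
+   #   Token.new(1, 6, "SELECT").token_type   #=> :KeyWord
+   #   Token.new(1, 9, "@total").token_type   #=> :HostVariable
+   #   Token.new(1, 12, "10.5").token_type    #=> :Number
+   #   Token.new(1, 20, "'abc'").token_type   #=> :String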
+
+   def ==(tok)
+     return false unless tok
+
+     (self.line == tok.line) &&
+     (self.col == tok.col) &&
+     (self.token_value == tok.token_value)
+   end
+
+ end
+
+ # Class to tokenize a given string or file.
+ class Tokenizer
+   VERSION = "0.0.1"
+
+   def initialize(file=nil)
+     @input_file = file
+     @tokens = []
+     @position = 0
+   end
+
+   #------------------
+   # Private Methods:
+
+   ## Calculate the sum of lengths of all the prior elements of the array
+   ## Assume that each element has the .length method defined
+   def sum_prior_len(arr,n)
+     return 0 if n < 1
+     arr[n-1].length + sum_prior_len(arr,n-1)
+   end
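+   # For illustration: sum_prior_len(["SELECT", "(", "*"], 2) #=> 7,
+   # the combined length of "SELECT" and "(".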
+
+   private :sum_prior_len
+
+   ## Split the token string into its subparts and return an array of triplets
+   def tok_split(line,pos,tok_str,preserve_case=nil)
+
+     t   = (preserve_case ? tok_str.strip : tok_str.strip.upcase)
+     arr = (t.length > 0) ? [[line,pos,t]] : []
+     tk  = []
+     tk  = (t.gsub(/([&|~^\/<>+%*,;!()=-])/ ){' ' + $1 + ' '}.
+              gsub(/(\W)(\d*?[.]\d*)/){$1 + ' ' + $2 + ' '}.
+              gsub(/([A-Za-z_]\w*)[.]([A-Za-z_]\w*)?/){ ($1 || '') + ' . ' + ($2 || '') }.
+              gsub(/^[.]([A-Za-z_]\w*)/){ ' . ' + ($1 || '') }
+           ).split unless t =~ /['"\]]$/ #'
+
+     if (tk.size > 1)
+       arr = []
+       tk.each_with_index{|e,i|
+         end_pos = (pos - t.length + sum_prior_len(tk,i) + e.size)
+         #p [e, "pos="+pos.to_s, "t.size="+t.size.to_s,"sum_prior(#{i})="+sum_prior_len(tk,i).to_s,'sz='+e.size.to_s]
+         arr << [line, end_pos, e]
+       }
+
+       #p arr;puts
+     end
+
+     arr
+   end
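+   # For illustration (hypothetical call): if "count(*)" ends at column 11,
+   # tok_split(1, 11, "count(*)") splits it into
+   #   [[1, 8, "COUNT"], [1, 9, "("], [1, 10, "*"], [1, 11, ")"]]
+   # where each column is again the position of the subpart's last character.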
+
+   private :tok_split
+
+   # ------------------
+   # Public Methods:
+
+   ## Split the string into its sub-strings and return an array of triplets.
+   ## Each triplet contains the ending line number, ending column number
+   ## and the sub-string (token string). A token string may spill over
+   ## multiple lines.
+   ##
+   def tokenize_string(str)
+     #puts str; puts
+     stream = str.split('')
+
+     slc = nil    # Single Line Comment indicator
+     mlc = nil    # Multi Line Comment indicator
+     sq  = nil    # Single Quote indicator
+     dq  = nil    # Double Quote indicator
+     bkt = nil    # Bracket indicator
+
+     qtok = ""    # accumulator for quoted-string token
+     atok = ""    # accumulator for all types of tokens except quoted-string
+     qstr = ""    # Final quoted string
+     tok_arr = [] # token array
+
+     col  = 0     # Column number of the token
+     line = 1     # Line number of the token
+     i    = 1     # Current Position in the stream
+     prev_c = ''  # Single character look behind
+
+     while stream != []
+       c = stream.shift
+       #puts c
+       case
+       when c =~ /[ \t]/
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+       when ((prev_c == '-') and (c == '-'))
+         unless (slc or mlc or sq or dq or bkt)
+           slc = i
+           atok.chop!
+           #puts "starting a single-line comment @ #{i}"
+         end
+       when ((prev_c == '/') and (c == '*'))
+         unless (slc or mlc or sq or dq or bkt)
+           mlc = i
+           atok.chop!
+           #puts "starting a multi-line comment @ #{i}"
+         end
+       when ((prev_c == '*') and (c == '/'))
+         if (mlc and (mlc < (i-1)))
+           mlc = nil
+           c = ''
+           #puts " ending a multi-line comment @ #{i}"
+         end
+       when ((c == "\r") or (c == "\n"))
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+
+         (col = 0; line += 1) if (c == "\n")
+         if slc
+           slc = nil
+           c = ''
+           #puts " ending a single-line comment @ #{i}"
+         end
+       when (c == "'")
+         unless (slc or mlc or dq or bkt)
+           if sq
+             ### WARNING:
+             # This logic is wrong: it assumes this is the end of the
+             # single-quoted token. But in the case of an embedded/escaped
+             # single-quote the token has not yet ended. Needs to be fixed
+             # in a later version.
+             ###
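+             # e.g. the literal 'it''s' (T-SQL for the string it's) currently
+             # comes out as two separate string tokens, 'it' and 's'.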
+             sq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending single-quote @ #{i}"
+           else
+             sq = i
+
+             if prev_c == 'N'
+               qtok = 'N'
+               atok.chop!
+               temp_pos = col-1
+             else
+               qtok = ""
+               temp_pos = col
+             end
+             tok_arr += tok_split(line,temp_pos,atok)
+             atok = ""
+             #puts " starting single-quote @ #{i}"
+           end
+         end
+       when (c == '"')
+         unless (slc or mlc or sq or bkt)
+           if dq
+             dq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending double-quote @ #{i}"
+           else
+             dq = i
+             qtok = ""
+             tok_arr += tok_split(line,col,atok)
+             atok = ""
+             #puts " starting double-quote @ #{i}"
+           end
+         end
+       when (c == '[')
+         unless (slc or mlc or sq or dq or bkt)
+           bkt = i
+           qtok = ""
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+           #puts " starting square-bracket @ #{i}"
+         end
+       when (c == ']')
+         if bkt
+           bkt = nil
+           qtok += c
+           c = ''
+           qstr = qtok
+           #puts " ending square-bracket @ #{i}"
+         end
+       end
+
+       qtok += c if (sq or dq or bkt)
+       atok += c unless (slc or mlc or sq or dq or bkt)
+
+       prev_c = c
+       col += 1
+       i += 1
+
+       (tok_arr += tok_split(line,col,qstr,true); qstr = "";) if qstr.size > 0
+
+     end
+
+     tok_arr += tok_split(line, col, atok)
+
+     raise "#{@input_file} Unmatched quoted string at (#{line},#{col})" if (sq or dq or bkt)
+     raise "#{@input_file} Incomplete Comment at (#{line},#{col})" if mlc
+
+     @tokens = tok_arr
+   end
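+   # For illustration: tokenize_string("select 10") returns
+   #   [[1, 6, "SELECT"], [1, 9, "10"]]
+   # two triplets whose columns mark the last character of each token.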
+
+   def tokenize_file(file=nil)
+     @input_file ||= file
+     if @input_file
+       arr = IO.readlines(@input_file)
+       tokenize_string(arr.join)
+     end
+   rescue
+     puts $!.to_s
+   ensure
+     return @tokens.length
+   end
+
+   def get_next_token
+     tok = ((@position >= 0) ? @tokens[@position] : nil)
+     return tok unless tok
+
+     token = Token.new(*tok)
+     @position += 1
+     token
+   end
+
+   def look_back(m)
+     tok = ((@position >= m) ? @tokens[@position-m] : nil)
+     return tok unless tok
+     token = Token.new(*tok)
+     token
+   end
+
+   def current_token;  look_back(0); end
+   def previous_token; look_back(1); end
+
+   def unget_token
+     @position -= 1 if (@position >= 0)
+     @position
+   end
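+   # Illustrative cursor protocol (hypothetical session):
+   #   t = Tokenizer.new
+   #   t.tokenize_string("select 10")
+   #   t.get_next_token.token_value   #=> "SELECT"  (advances the cursor)
+   #   t.current_token.token_value    #=> "10"      (token at the cursor)
+   #   t.unget_token                  # steps the cursor back one token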
+
+ end
+
+ if $0 == __FILE__
+   t = Tokenizer.new("..\\..\\test\\tsql_scripts\\simple.sql")
+   t.tokenize_file
+   #t.tokenize_string("select 10 as ten , 'this is a word' as word, '[two\n lines \n, in \"between\" this ]' as two")
+   begin
+     s = t.get_next_token
+     p [s.line, s.col, s.token_value, s.token_type] if s
+   end while s
+ end
+
@@ -0,0 +1,211 @@
+ $LOAD_PATH.unshift "../lib" if File.basename(Dir.pwd) == "test"
+ require 'test/unit'
+ require 'tsql_shparser'
+
+ class Tst_Tokenizer < Test::Unit::TestCase
+
+   def setup
+     @t = Tokenizer.new
+   end
+
+   def teardown
+     @t = nil
+   end
+
+   def test00
+     assert ! @t.get_next_token
+     assert ! @t.look_back(10)
+     assert ! @t.current_token
+     assert ! @t.previous_token
+     assert @t.tokenize_file == 0
+     assert @t.unget_token == -1
+     assert @t.tokenize_string('') == []
+   end
+
+   def test01
+     arr = @t.tokenize_string("select")
+     assert arr.length == 1
+
+     arr = @t.tokenize_string("select *")
+     assert arr.length == 2
+
+     arr = @t.tokenize_string("select *\nfrom ")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select *\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select count(*) from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select count(*) from S.T")
+     assert arr.length == 9
+   end
+
+   def test010
+     arr = @t.tokenize_string("select count(*),10.5 from S.T")
+     assert arr.length == 11
+   end
+
+   def test02
+     arr = @t.tokenize_string("select * -- this is comment\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select *, -- this is a 'comment'\n cmnt from T")
+     assert arr.length == 6
+
+     arr = @t.tokenize_string("select *, '-- this is comment\n' cmnt from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select '/*' as startc, \n '*/' as endc")
+     assert arr.length == 8
+
+     arr = @t.tokenize_string("select /* as startc, \n */ as endc")
+     assert arr.length == 3
+   end
+
+   def test03
+     arr = @t.tokenize_string("select 10")
+     tk1 = @t.get_next_token
+     @t.unget_token
+     tk2 = @t.get_next_token
+     assert tk1 == tk2
+   end
+
+   def test04
+     # This is not being tokenized correctly yet
+     str = %Q{
+ select ' Code = ''' + B + '''' as code
+ from T
+ }
+     arr = @t.tokenize_string(str)
+     assert arr.length == 10
+   end
+
+   def test05
+     arr = @t.tokenize_string("select A.COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test050
+     arr = @t.tokenize_string("select A.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test051
+     arr = @t.tokenize_string("select A1.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test052
+     arr = @t.tokenize_string("select [A1].[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test053
+     arr = @t.tokenize_string("select [A1].COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test06
+     arr = @t.tokenize_string("select A.* from T")
+     assert arr.length == 6
+   end
+
+   def test060
+     arr = @t.tokenize_string("select A1.* from T")
+     assert arr.length == 6
+   end
+
+   def test061
+     arr = @t.tokenize_string("select [A].* from T")
+     assert arr.length == 6
+   end
+
+   def test07
+     assert_raise(RuntimeError) {@t.tokenize_string('/*/')}
+     assert_raise(RuntimeError) {@t.tokenize_string("select ' as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select \" as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select [ as done")}
+   end
+
+   def test08
+     arr = @t.tokenize_string("select 'Long \n multiline \n string constant' cstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select N'Long \n multiline \n Unicode string constant' ucstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select [Long \n multiline \n column name] as str")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string(%Q{select "Long \n multiline \n alias" = 'str'})
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select A.col1, [Long \n multiline \n column name] as str from T A")
+     assert arr.length == 11
+   end
+
+   def test09
+     arr = @t.tokenize_string("select case when A!=B then A else B end MAX_AB")
+     assert arr.length == 13
+   end
+
+   def test10
+     arr = @t.tokenize_string("select case when ((A=B)) then A else B end MAX_AB")
+     assert arr.length == 16
+   end
+
+   def test11
+     arr = @t.tokenize_string(%Q{select '+=-%/~`!@#$&*)"(-_][|:;,.<>?/\\{}^' AB})
+     assert arr.length == 3
+   end
+
+   def test12
+     arr = @t.tokenize_string("select 10.*5 from T")
+     assert arr.length == 6
+   end
+
+   def test13
+     arr = @t.tokenize_string("select 10.*\n.5 from T")
+     assert arr.length == 6
+   end
+
+   def test14
+     arr = @t.tokenize_string("select [Long's Island] from T")
+     assert arr.length == 4
+   end
+
+ end
+
+ __END__