tsql_shparser 0.0.1

@@ -0,0 +1,375 @@
+ #!/usr/bin/env ruby
+ #
+ # tsql_tokenizer.rb: Tokenizer for T-SQL
+ # Copyright (c) 2005-2006 Shashank Date (shanko_date@yahoo.com)
+ #
+ # License: Ruby's
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ #
+
+
+ # Defines the Token type. A token is an array of three elements (a triplet):
+ #   [line_number, column_number, token_string]
+ # Some tokens may span multiple lines (e.g. quoted strings); in that case
+ # the line number and column number are those of the end of the token.
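+ #
+ # Illustrative example (not from the original source): the word SELECT
+ # ending at column 6 of line 3 would be carried as the triplet
+ #   [3, 6, "SELECT"]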
+ class Token
+   VERSION = "0.0.1"
+   DATA_TYPES = %w{
+     SQL_VARIANT DATETIME SMALLDATETIME FLOAT REAL DECIMAL MONEY
+     SMALLMONEY BIGINT INT SMALLINT TINYINT BIT NTEXT TEXT
+     IMAGE TIMESTAMP UNIQUEIDENTIFIER NVARCHAR NCHAR VARCHAR CHAR
+     VARBINARY BINARY
+   }
+
+   KEY_WORDS = %w{
+     ADD EXCEPT PERCENT
+     ALL EXEC PLAN
+     ALTER EXECUTE PRECISION
+     AND EXISTS PRIMARY
+     ANY EXIT PRINT
+     AS FETCH PROC
+     ASC FILE PROCEDURE
+     AUTHORIZATION FILLFACTOR PUBLIC
+     BACKUP FOR RAISERROR
+     BEGIN FOREIGN READ
+     BETWEEN FREETEXT READTEXT
+     BREAK FREETEXTTABLE RECONFIGURE
+     BROWSE FROM REFERENCES
+     BULK FULL REPLICATION
+     BY FUNCTION RESTORE
+     CASCADE GOTO RESTRICT
+     CASE GRANT RETURN
+     CHECK GROUP REVOKE
+     CHECKPOINT HAVING RIGHT
+     CLOSE HOLDLOCK ROLLBACK
+     CLUSTERED IDENTITY ROWCOUNT
+     COALESCE IDENTITY_INSERT ROWGUIDCOL
+     COLLATE IDENTITYCOL RULE
+     COLUMN IF SAVE
+     COMMIT IN SCHEMA
+     COMPUTE INDEX SELECT
+     CONSTRAINT INNER SESSION_USER
+     CONTAINS INSERT SET
+     CONTAINSTABLE INTERSECT SETUSER
+     CONTINUE INTO SHUTDOWN
+     CONVERT IS SOME
+     CREATE JOIN STATISTICS
+     CROSS KEY SYSTEM_USER
+     CURRENT KILL TABLE
+     CURRENT_DATE LEFT TEXTSIZE
+     CURRENT_TIME LIKE THEN TIES
+     CURRENT_TIMESTAMP LINENO TO
+     CURRENT_USER LOAD TOP
+     CURSOR NATIONAL TRAN
+     DATABASE NOCHECK TRANSACTION
+     DBCC NONCLUSTERED TRIGGER
+     DEALLOCATE NOT TRUNCATE
+     DECLARE NULL TSEQUAL
+     DEFAULT NULLIF UNION
+     DELETE OF UNIQUE
+     DENY OFF UPDATE
+     DESC OFFSETS UPDATETEXT
+     DISK ON USE
+     DISTINCT OPEN USER
+     DISTRIBUTED OPENDATASOURCE VALUES
+     DOUBLE OPENQUERY VARYING
+     DROP OPENROWSET VIEW
+     DUMMY OPENXML WAITFOR
+     DUMP OPTION WHEN
+     ELSE OR WHERE
+     END ORDER WHILE
+     ERRLVL OUTER WITH
+     ESCAPE OVER WRITETEXT
+   }
+
+   attr_reader :line
+   attr_reader :col
+   attr_reader :token_value
+
+   def initialize(line,col,val)
+     @line = line
+     @col = col
+     @token_value = val
+   end
+
+   def token_type
+     case
+     # when @token_value =~ /^#.*/ then :TempTable
+     when @token_value =~ /^[&|~^\/<>+%*!=-]/ then :Operator
+     when @token_value =~ /^@.*/ then :HostVariable
+     when @token_value =~ /^[.]$/ then :Dot
+     when @token_value =~ /^[()]$/ then :Paren
+     when @token_value =~ /^[;]$/ then :SemiColon
+     when @token_value =~ /^[,]$/ then :Comma
+     when @token_value =~ /.+:$/ then :Label
+     when @token_value =~ /^N?'/ then :String
+     when @token_value =~ /^([$])?\d+(\.\d+)?$/ then :Number
+     when DATA_TYPES.include?(@token_value) then :DataType
+     when KEY_WORDS.include?(@token_value) then :KeyWord
+     else :Id
+     end
+   end
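+
+   # Illustrative classifications (not from the original source):
+   #   Token.new(1, 6, "SELECT").token_type   #=> :KeyWord
+   #   Token.new(1, 10, "INT").token_type     #=> :DataType
+   #   Token.new(1, 13, "@x").token_type      #=> :HostVariable
+   #   Token.new(1, 21, "'abc'").token_type   #=> :String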
+
+   def ==(tok)
+     return false unless tok.is_a?(Token)
+
+     (self.line == tok.line) &&
+       (self.col == tok.col) &&
+       (self.token_value == tok.token_value)
+   end
+
+ end
+
+ # Class to tokenize a given string or file.
+ class Tokenizer
+   VERSION = "0.0.1"
+
+   def initialize(file=nil)
+     @input_file = file
+     @tokens = []
+     @position = 0
+   end
+
+   #------------------
+   # Private Methods:
+
+   ## Calculate the sum of the lengths of all elements of the array prior
+   ## to index n. Assumes that each element has the .length method defined.
+   def sum_prior_len(arr,n)
+     return 0 if n < 1
+     arr[n-1].length + sum_prior_len(arr,n-1)
+   end
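+   # Illustrative (not from the original source):
+   #   sum_prior_len(%w{a bb ccc}, 2)   #=> 3   ("a".length + "bb".length)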
+
+   private :sum_prior_len
+
+   ## Split the token string into its subparts and return an array of triplets
+   def tok_split(line,pos,tok_str,preserve_case=nil)
+
+     t = (preserve_case ? tok_str.strip : tok_str.strip.upcase)
+     arr = (t.length > 0) ? [[line,pos,t]] : []
+     tk = []
+     # Pad operators/punctuation with spaces, break qualified names
+     # (A.B, .B, A.) apart on the dot, then split on whitespace. Strings
+     # ending in a quote or bracket (quoted tokens) are left whole.
+     tk = (t.gsub(/([&|~^\/<>+%*,;!()=-])/){' ' + $1 + ' '}.
+            gsub(/(\W)(\d*?[.]\d*)/){$1 + ' ' + $2 + ' '}.
+            gsub(/([A-Za-z_]\w*)[.]([A-Za-z_]\w*)?/){ ($1 || '') + ' . ' + ($2 || '') }.
+            gsub(/^[.]([A-Za-z_]\w*)/){ ' . ' + ($1 || '') }
+          ).split unless t =~ /['"\]]$/ #'
+
+     if (tk.size > 1)
+       arr = []
+       tk.each_with_index{|e,i|
+         # Recover the ending column of each subpart from the ending column
+         # of the whole string and the lengths of the prior subparts.
+         end_pos = (pos - t.length + sum_prior_len(tk,i) + e.size)
+         arr << [line, end_pos, e]
+       }
+     end
+
+     arr
+   end
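+   # Illustrative (not from the original source): a compound string arriving
+   # as one token is split apart, each part keeping its own ending column:
+   #   tok_split(1, 5, "a,b")   #=> [[1, 3, "A"], [1, 4, ","], [1, 5, "B"]]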
+
+   private :tok_split
+
+   # ------------------
+   # Public Methods:
+
+   ## Split the string into its sub-strings and return an array of triplets.
+   ## Each triplet contains the ending line number, ending column number
+   ## and the sub-string (token string). A token string may spill over
+   ## multiple lines.
+   ##
+   def tokenize_string(str)
+     stream = str.split('')
+
+     slc = nil      # Single Line Comment indicator
+     mlc = nil      # Multi Line Comment indicator
+     sq  = nil      # Single Quote indicator
+     dq  = nil      # Double Quote indicator
+     bkt = nil      # Bracket indicator
+
+     qtok = ""      # accumulator for quoted-string tokens
+     atok = ""      # accumulator for all types of tokens except quoted strings
+     qstr = ""      # final quoted string
+     tok_arr = []   # token array
+
+     col = 0        # column number of the token
+     line = 1       # line number of the token
+     i = 1          # current position in the stream
+     prev_c = ''    # single-character look-behind
+
+     while stream != []
+       c = stream.shift
+       case
+       when c =~ /[ \t]/
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+       when ((prev_c == '-') and (c == '-'))
+         unless (slc or mlc or sq or dq or bkt)
+           slc = i
+           atok.chop!
+           #puts "starting a single-line comment @ #{i}"
+         end
+       when ((prev_c == '/') and (c == '*'))
+         unless (slc or mlc or sq or dq or bkt)
+           mlc = i
+           atok.chop!
+           #puts "starting a multi-line comment @ #{i}"
+         end
+       when ((prev_c == '*') and (c == '/'))
+         if (mlc and (mlc < (i-1)))
+           mlc = nil
+           c = ''
+           #puts " ending a multi-line comment @ #{i}"
+         end
+       when ((c == "\r") or (c == "\n"))
+         unless (slc or mlc or sq or dq or bkt)
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+         end
+
+         (col = 0; line += 1) if (c == "\n")
+         if slc
+           slc = nil
+           c = ''
+           #puts " ending a single-line comment @ #{i}"
+         end
+       when (c == "'")
+         unless (slc or mlc or dq or bkt)
+           if sq
+             ### WARNING:
+             # This logic is wrong: it assumes the end of the single-quote
+             # token. But in case of an embedded/escaped single-quote the
+             # token has not yet ended. Needs to be fixed in a later version.
+             ###
+             sq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending single-quote @ #{i}"
+           else
+             sq = i
+
+             # An immediately preceding N marks a Unicode string constant:
+             # keep it as part of the quoted token.
+             if prev_c == 'N'
+               qtok = 'N'
+               atok.chop!
+               temp_pos = col-1
+             else
+               qtok = ""
+               temp_pos = col
+             end
+             tok_arr += tok_split(line,temp_pos,atok)
+             atok = ""
+             #puts " starting single-quote @ #{i}"
+           end
+         end
+       when (c == '"')
+         unless (slc or mlc or sq or bkt)
+           if dq
+             dq = nil
+             qtok += c
+             c = ''
+             qstr = qtok
+             #puts " ending double-quote @ #{i}"
+           else
+             dq = i
+             qtok = ""
+             tok_arr += tok_split(line,col,atok)
+             atok = ""
+             #puts " starting double-quote @ #{i}"
+           end
+         end
+       when (c == '[')
+         unless (slc or mlc or sq or dq or bkt)
+           bkt = i
+           qtok = ""
+           tok_arr += tok_split(line,col,atok)
+           atok = ""
+           #puts " starting square-bracket @ #{i}"
+         end
+       when (c == ']')
+         if bkt
+           bkt = nil
+           qtok += c
+           c = ''
+           qstr = qtok
+           #puts " ending square-bracket @ #{i}"
+         end
+       end
+
+       qtok += c if (sq or dq or bkt)
+       atok += c unless (slc or mlc or sq or dq or bkt)
+
+       prev_c = c
+       col += 1
+       i += 1
+
+       # A completed quoted string is emitted as a single token, case preserved.
+       (tok_arr += tok_split(line,col,qstr,true); qstr = "") if qstr.size > 0
+
+     end
+
+     tok_arr += tok_split(line, col, atok)
+
+     raise "#{@input_file} Unmatched quoted string at (#{line},#{col})" if (sq or dq or bkt)
+     raise "#{@input_file} Incomplete comment at (#{line},#{col})" if mlc
+
+     @tokens = tok_arr
+
+   end
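+   # Illustrative (not from the original source):
+   #   Tokenizer.new.tokenize_string("select *\nfrom T")
+   #   #=> [[1, 6, "SELECT"], [1, 8, "*"], [2, 5, "FROM"], [2, 7, "T"]]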
+
+   ## Tokenize the given file and return the number of tokens found
+   ## (0 if no file was given or it could not be read).
+   def tokenize_file(file=nil)
+     @input_file ||= file
+     if @input_file
+       arr = IO.readlines(@input_file)
+       tokenize_string(arr.join)
+     end
+   rescue
+     puts $!.to_s
+   ensure
+     return @tokens.length
+   end
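+   # Illustrative (not from the original source; "query.sql" is a
+   # hypothetical file):
+   #   Tokenizer.new("query.sql").tokenize_file   #=> number of tokens read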
+
+   ## Return the token at the cursor as a Token object (or nil when
+   ## exhausted) and advance the cursor.
+   def get_next_token
+     tok = ((@position >= 0) ? @tokens[@position] : nil)
+     return tok unless tok
+
+     token = Token.new(*tok)
+     @position += 1
+     token
+   end
+
+   ## Return the token m positions behind the cursor, without moving it.
+   def look_back(m)
+     tok = ((@position >= m) ? @tokens[@position-m] : nil)
+     return tok unless tok
+     token = Token.new(*tok)
+     token
+   end
+
+   def current_token;  look_back(0); end
+   def previous_token; look_back(1); end
+
+   ## Step the cursor back one token; returns the new position.
+   def unget_token
+     @position -= 1 if (@position >= 0)
+     @position
+   end
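+
+   # Illustrative cursor usage (not from the original source):
+   #   t = Tokenizer.new
+   #   t.tokenize_string("select 10")
+   #   t.get_next_token.token_value   #=> "SELECT"
+   #   t.unget_token                  # push the cursor back
+   #   t.get_next_token.token_value   #=> "SELECT" (same token again)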
+
+ end
+
+ if $0 == __FILE__
+   t = Tokenizer.new("..\\..\\test\\tsql_scripts\\simple.sql")
+   t.tokenize_file
+   #t.tokenize_string("select 10 as ten , 'this is a word' as word, '[two\n lines \n, in \"between\" this ]' as two")
+   begin
+     s = t.get_next_token
+     p [s.line, s.col, s.token_value, s.token_type] if s
+   end while s
+ end
+
@@ -0,0 +1,211 @@
+ $LOAD_PATH.unshift "../lib" if File.basename(Dir.pwd) == "test"
+ require 'test/unit'
+ require 'tsql_shparser'
+
+ class Tst_Tokenizer < Test::Unit::TestCase
+
+   def setup
+     @t = Tokenizer.new
+   end
+
+   def teardown
+     @t = nil
+   end
+
+   def test00
+     assert ! @t.get_next_token
+     assert ! @t.look_back(10)
+     assert ! @t.current_token
+     assert ! @t.previous_token
+     assert @t.tokenize_file == 0
+     assert @t.unget_token == -1
+     assert @t.tokenize_string('') == []
+   end
+
+   def test01
+     arr = @t.tokenize_string("select")
+     assert arr.length == 1
+
+     arr = @t.tokenize_string("select *")
+     assert arr.length == 2
+
+     arr = @t.tokenize_string("select *\nfrom ")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select *\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select count(*) from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select count(*) from S.T")
+     assert arr.length == 9
+   end
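+
+   # The counts above follow from the splitting rules (note not from the
+   # original source): parentheses and dots become tokens of their own, so
+   # "select count(*) from S.T" yields the nine tokens
+   #   SELECT COUNT ( * ) FROM S . T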
+
+   def test010
+     arr = @t.tokenize_string("select count(*),10.5 from S.T")
+     assert arr.length == 11
+   end
+
+   def test02
+     arr = @t.tokenize_string("select * -- this is comment\nfrom T")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select *, -- this is a 'comment'\n cmnt from T")
+     assert arr.length == 6
+
+     arr = @t.tokenize_string("select *, '-- this is comment\n' cmnt from T")
+     assert arr.length == 7
+
+     arr = @t.tokenize_string("select '/*' as startc, \n '*/' as endc")
+     assert arr.length == 8
+
+     arr = @t.tokenize_string("select /* as startc, \n */ as endc")
+     assert arr.length == 3
+   end
+
+   def test03
+     arr = @t.tokenize_string("select 10")
+     tk1 = @t.get_next_token
+     @t.unget_token
+     tk2 = @t.get_next_token
+     assert tk1 == tk2
+   end
+
+   def test04
+     # This is not being tokenized correctly yet: the embedded/escaped
+     # single quotes end the string token prematurely (see the WARNING
+     # in tokenize_string).
+     str = %Q{
+       select ' Code = ''' + B + '''' as code
+       from T
+     }
+     arr = @t.tokenize_string(str)
+     assert arr.length == 10
+   end
+
+   def test05
+     arr = @t.tokenize_string("select A.COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test050
+     arr = @t.tokenize_string("select A.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test051
+     arr = @t.tokenize_string("select A1.[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test052
+     arr = @t.tokenize_string("select [A1].[COL1] from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test053
+     arr = @t.tokenize_string("select [A1].COL1 from S.T as A")
+     assert arr.length == 10
+   end
+
+   def test06
+     arr = @t.tokenize_string("select A.* from T")
+     assert arr.length == 6
+   end
+
+   def test060
+     arr = @t.tokenize_string("select A1.* from T")
+     assert arr.length == 6
+   end
+
+   def test061
+     arr = @t.tokenize_string("select [A].* from T")
+     assert arr.length == 6
+   end
+
+   def test07
+     assert_raise(RuntimeError) {@t.tokenize_string('/*/')}
+     assert_raise(RuntimeError) {@t.tokenize_string("select ' as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select \" as done")}
+     assert_raise(RuntimeError) {@t.tokenize_string("select [ as done")}
+   end
+
+   def test08
+     arr = @t.tokenize_string("select 'Long \n multiline \n string constant' cstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select N'Long \n multiline \n Unicode string constant' ucstr")
+     assert arr.length == 3
+
+     arr = @t.tokenize_string("select [Long \n multiline \n column name] as str")
+     assert arr.length == 4
+
+     arr = @t.tokenize_string(%Q{select "Long \n multiline \n alias" = 'str'})
+     assert arr.length == 4
+
+     arr = @t.tokenize_string("select A.col1, [Long \n multiline \n column name] as str from T A")
+     assert arr.length == 11
+   end
+
+   def test09
+     arr = @t.tokenize_string("select case when A!=B then A else B end MAX_AB")
+     assert arr.length == 13
+   end
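+
+   # 13 tokens because every operator character is split out on its own
+   # (note not from the original source), so A!=B becomes A ! = B:
+   #   SELECT CASE WHEN A ! = B THEN A ELSE B END MAX_AB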
+
+   def test10
+     arr = @t.tokenize_string("select case when ((A=B)) then A else B end MAX_AB")
+     assert arr.length == 16
+   end
+
+   def test11
+     arr = @t.tokenize_string(%Q{select '+=-%/~`!@#$&*)"(-_][|:;,.<>?/\\{}^' AB})
+     assert arr.length == 3
+   end
+
+   def test12
+     arr = @t.tokenize_string("select 10.*5 from T")
+     assert arr.length == 6
+   end
+
+   def test13
+     arr = @t.tokenize_string("select 10.*\n.5 from T")
+     assert arr.length == 6
+   end
+
+   def test14
+     arr = @t.tokenize_string("select [Long's Island] from T")
+     assert arr.length == 4
+   end
+
+ end
+
+ __END__