sqlh 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlh/core/helper.py ADDED
@@ -0,0 +1,444 @@
1
+ """
2
+ SQL parser with token-based analysis.
3
+
4
+ This module provides SQL parsing functionality using token-based analysis:
5
+ |- Splitting multi-statement SQL by semicolons
6
+ |- Removing SQL comments (single-line and multi-line)
7
+ |- Extracting source and target tables
8
+ |- Handling CTE (Common Table Expression) identification
9
+
10
+ The parser uses keyword-based tokenization rather than full AST parsing,
11
+ making it lightweight and fast for simple table/field extraction tasks.
12
+ """
13
+
14
+ from .keywords import KeyWords
15
+
16
+
17
class ParseException(Exception):
    """Error signalling that a SQL text could not be parsed as requested.

    Within this module it is raised by ``get_source_target_tables`` when the
    input turns out to contain more than one statement.
    """
20
+
21
+
22
+ # ============================================================================
23
+ # Module-level functions (replacing SqlHelper class)
24
+ # ============================================================================
25
+
26
+
27
def split_sql(sql: str) -> list[str]:
    """
    Split multi-statement SQL by semicolons, handling comments and quotes.

    A semicolon separates statements only when it is outside single/double
    quotes, outside a ``--`` line comment and outside every ``/* ... */``
    block comment (block comments may be nested and may span lines).
    Comment text is kept inside the returned statements, except that lines
    consisting solely of a ``--`` comment are dropped. Quote state is tracked
    per line; block-comment depth is tracked across lines.

    Args:
        sql: SQL statement string

    Returns:
        List of individual SQL statements, without the separating semicolon.
        Empty statements (e.g. produced by ``;;``) are omitted.

    Raises:
        AssertionError: If block comment markers are unbalanced.

    Note:
        NOTE(review): when the input does not end with ";" one is appended to
        the text; if the last line ends in a ``--`` comment that semicolon
        lands inside the comment and the final statement is not emitted.

    Example:
        >>> split_sql("SELECT 1; SELECT 2;")
        ['SELECT 1', ' SELECT 2']
    """
    statements: list[str] = []
    # Nesting depth of /* ... */ comments; carried over from line to line.
    depth = 0
    # Text of the current statement collected from previous lines.
    prefix = ""
    if not sql.strip().endswith(";"):
        sql += ";"

    for line in sql.splitlines():
        # A whole-line "--" comment contributes nothing, unless the line is
        # actually the interior of a block comment.
        if depth <= 0 and line.strip().startswith("--"):
            line = ""

        quote = ""   # the quote character currently open, "" when none
        prev = ""    # previous character if it may start a 2-char marker
        start = 0    # offset just after the last separator on this line

        if prefix:
            prefix += "\n"

        for pos, char in enumerate(line):
            if quote:
                # Inside a string literal only the closing quote matters.
                if char == quote:
                    quote = ""
                continue

            if char == "/":
                if prev == "*":
                    depth -= 1
                    prev = ""
                else:
                    prev = "/"
            elif char == "*":
                if prev == "/":
                    depth += 1
                    prev = ""
                else:
                    prev = "*"
            elif depth > 0:
                # Quotes, dashes and semicolons are inert in a block comment.
                prev = ""
            elif char in ("'", '"'):
                quote = char
                prev = ""
            elif char == "-":
                if prev == "-":
                    # Rest of the line is a comment: nothing can split here.
                    break
                prev = "-"
            else:
                if char == ";" and depth == 0:
                    statements.append(prefix + line[start:pos])
                    prefix = ""
                    start = pos + 1
                prev = ""

        # Whatever follows the last separator belongs to the next statement.
        prefix += line[start:]

    # Raised explicitly (not via ``assert``) so the check survives ``-O``.
    if depth != 0:
        raise AssertionError(
            f"The number of nested levels of sql multi-line comments is not equal to 0: {depth}"
        )
    return [stmt for stmt in statements if stmt != ""]
122
+
123
+
124
def trim_comment(sql: str) -> str:
    """
    Remove single-line and multi-line comments from SQL.

    Line comments are removed first by ``_trim_single_line_comment``. Block
    comments ``/* ... */`` (possibly nested, possibly spanning lines) are then
    cut out of the text. A top-level comment that starts with ``/*+`` is
    treated as a SQL hint and is kept. Comment markers inside single- or
    double-quoted text are ignored, and quote characters inside a comment or
    hint do not open a string.

    Args:
        sql: SQL statement string

    Returns:
        SQL string with comments removed. An unterminated block comment is
        left in place.
    """
    # 1. Remove line comments.
    sql = _trim_single_line_comment(sql=sql)

    # 2. Locate top-level block comments. The text is scanned as-is: newlines
    #    need no special encoding, so backslash sequences in the SQL survive.
    spans: list[tuple[int, int]] = []
    depth = 0          # nesting depth of the block comment being scanned
    in_hint = False    # inside a /*+ ... */ hint
    quote = ""         # the quote character currently open, "" when none
    # Previous significant character: "/" or "*" when it may start a
    # two-character marker, "open" right after a top-level "/*".
    prev = ""
    start = 0

    for pos, char in enumerate(sql):
        if quote:
            if char == quote:
                quote = ""
            continue

        if in_hint:
            # A hint is kept verbatim; only its terminator is of interest.
            if char == "/" and prev == "*":
                in_hint = False
                prev = ""
            else:
                prev = "*" if char == "*" else ""
            continue

        if depth > 0:
            if char == "+" and prev == "open":
                # "/*+" directly at top level: a hint, not a comment.
                in_hint = True
                depth = 0
                prev = ""
            elif char == "*" and prev == "/":
                depth += 1
                prev = ""
            elif char == "/" and prev == "*":
                depth -= 1
                prev = ""
                if depth == 0:
                    spans.append((start, pos + 1))
            else:
                prev = char if char in ("/", "*") else ""
            continue

        if char in ("'", '"'):
            quote = char
            prev = ""
        elif char == "*" and prev == "/":
            depth = 1
            start = pos - 1
            prev = "open"
        else:
            prev = "/" if char == "/" else ""

    # 3. Cut the comments out back to front so earlier offsets stay valid.
    for begin, end in reversed(spans):
        sql = sql[:begin] + sql[end:]
    return sql
207
+
208
+
209
def get_source_target_tables(sql: str) -> dict[str, list[str]] | None:
    """
    Extract source and target tables from a single SQL statement.

    This method uses token-based parsing to identify table dependencies:
    the name following INSERT/CREATE (after any keywords) or MERGE is taken
    as a target, names following FROM/JOIN or MERGE ... USING are taken as
    sources. Names introduced by CTEs (as reported by
    ``_get_cte_mid_tables``) are filtered out of the sources.

    Args:
        sql: Single SQL statement string

    Returns:
        Dictionary with keys:
            - "target_tables": list of target table names
            - "source_tables": list of distinct source table names, in order
              of first appearance
        Returns None when no source table remains after CTE filtering,
        even if a target table was found.

    Raises:
        ParseException: If SQL contains multiple statements

    Note:
        TODO: planned result shape carrying a position per table:
            {
                "source_tables": [(t1, 1), (t2, 2), (t3, 3)],
                "target_tables": [(t4, 1)]
            }
    """
    # Pre-processing: remove block and line comments.
    sql = trim_comment(sql).strip()
    # Drop one trailing ";".
    sql = sql[:-1] if sql.endswith(";") else sql

    # Validate the argument: only a single statement is supported.
    if len(split_sql(sql)) > 1:
        raise ParseException("sql脚本为多条SQL语句,需传入单条SQL语句.")

    was_pre_insert = False          # an INSERT/CREATE still awaits its table
    was_pre_from = False            # currently inside a FROM/JOIN relation list
    was_pre_as = False              # previous token was AS (next is an alias)
    was_merge = False               # inside MERGE, before its source is read
    was_using = False               # MERGE ... USING seen
    was_pre_table_name = False      # a table of the current relation was read
    was_pre_table_function = False  # current relation is a table function
    target_tables: list[str] = []
    source_tables: list[str] = []

    for line in sql.splitlines():
        # Make brackets and commas stand-alone tokens.
        line = line.replace("(", " ( ").replace(")", " ) ").replace(",", " , ")

        # Split on any whitespace so tab-separated tokens are separated too.
        for token in line.split():
            upper = token.upper()
            is_keyword = upper in KeyWords.keywords

            if upper == "AS":
                was_pre_as = True
                continue

            if upper in KeyWords.insert_keywords:
                was_pre_insert = True
                was_pre_from = False
                continue

            if upper == "MERGE":
                was_merge = True
                continue

            if upper == "USING":
                was_using = True
                continue

            if upper in KeyWords.from_keywords:
                was_pre_from = True
                was_pre_insert = False
                was_pre_table_name = False
                # A new relation starts: a table function seen in an earlier
                # FROM/JOIN must not hide the tables that follow this one.
                was_pre_table_function = False
                continue

            if was_pre_as and not is_keyword:
                # The token right after AS is an alias, never a table.
                was_pre_as = False
                was_pre_table_name = False
                continue

            if is_keyword:
                if was_pre_insert or was_pre_from:
                    was_pre_from = False
                continue

            # From here on the token is not a keyword.
            if was_pre_insert:
                target_tables.append(token)
                was_pre_insert = False
                was_pre_from = False
                continue

            if upper in KeyWords.table_function_keywords and was_pre_from:
                was_pre_table_function = True
                continue

            # MERGE INTO <target>
            if was_merge and not was_using and not target_tables:
                target_tables.append(token)
                continue

            # MERGE ... USING <source>; a "(" means the source is a subquery,
            # whose tables are picked up by the FROM/JOIN handling.
            if was_merge and was_using:
                if token != "(":
                    source_tables.append(token)
                was_using = False
                was_merge = False
                continue

            if was_pre_from:
                if (
                    not was_pre_table_name
                    and token not in (",", "(")
                    and not was_pre_table_function
                ):
                    source_tables.append(token)
                    was_pre_table_name = True
                if token == ",":
                    # Comma-separated relation list: expect another table.
                    was_pre_table_name = False

    cte_names = set(_get_cte_mid_tables(sql))
    # De-duplicate while keeping first-seen order so the result is stable.
    source_tables = [name for name in dict.fromkeys(source_tables) if name not in cte_names]
    if not source_tables:
        return None
    return {"target_tables": target_tables, "source_tables": source_tables}
346
+
347
+
348
+ # ============================================================================
349
+ # Private helper functions
350
+ # ============================================================================
351
+
352
+
353
+ def _trim_single_line_comment(sql: str) -> str:
354
+ """删除单行注释"""
355
+ result = []
356
+ for line in sql.splitlines():
357
+ line = line.strip()
358
+ line = "" if line.startswith("--") else line
359
+ line = "" if line.startswith("#") else line
360
+ if len(line) == 0:
361
+ continue
362
+
363
+ # 标记是否以双引号结尾
364
+ has_terminated_double_quote = True
365
+ # 标记是否以单引号结尾
366
+ has_terminated_single_quote = True
367
+ # 标记前一个字符是否是短横行 "-"
368
+ was_pre_dash = False
369
+ index = 0
370
+
371
+ for char in line:
372
+ index += 1
373
+ match char:
374
+ case "'":
375
+ if has_terminated_double_quote:
376
+ has_terminated_single_quote = not has_terminated_single_quote
377
+ case '"':
378
+ if has_terminated_single_quote:
379
+ has_terminated_double_quote = not has_terminated_double_quote
380
+ case "-":
381
+ if has_terminated_double_quote and has_terminated_single_quote:
382
+ if was_pre_dash:
383
+ line = line[: index - 2]
384
+ continue
385
+ was_pre_dash = True
386
+ case "#":
387
+ if has_terminated_double_quote and has_terminated_single_quote:
388
+ line = line[: index - 1]
389
+ continue
390
+ case _:
391
+ was_pre_dash = False
392
+
393
+ result.append(line)
394
+ return "\n".join(result)
395
+
396
+
397
def _get_cte_mid_tables(sql: str) -> list:
    """Collect the intermediate table names defined by CTE (``WITH``) clauses.

    Comments are removed with ``trim_comment`` and the text is scanned token
    by token. The first non-keyword token after ``WITH`` is taken as a CTE
    name; while the CTE list is still open, every further non-keyword token
    found at bracket depth 0 (other than ",", "(" and ")") is taken as a
    name as well. The CTE list is considered closed by the first keyword
    other than ``AS`` that appears at depth 0 after a closing bracket.

    Args:
        sql: SQL statement string.

    Returns:
        The collected names in order of appearance.
    """
    # Current bracket nesting depth.
    bracket_level = 0
    # WITH was seen and its first name has not been read yet.
    was_pre_with = False
    # Still inside the list of CTE definitions.
    is_cte = False
    # At least one ")" has been seen so far (never reset).
    was_pre_right_bracket = False
    result = []

    # Pre-processing: remove block and line comments.
    sql = trim_comment(sql)

    for line in sql.splitlines():
        # Make brackets and commas stand-alone tokens.
        line = line.replace("(", " ( ").replace(")", " ) ").replace(",", " , ")

        # Split on any whitespace so tab-separated tokens are separated too.
        for token in line.split():
            upper = token.upper()

            if upper == "(":
                bracket_level += 1
            if upper == ")":
                bracket_level -= 1
                was_pre_right_bracket = True
            if upper == "WITH":
                was_pre_with = True
                is_cte = True
                continue

            if upper in KeyWords.keywords:
                # e.g. the SELECT following the last "name AS ( ... )".
                if was_pre_right_bracket and is_cte and bracket_level == 0 and upper != "AS":
                    is_cte = False
            else:
                if was_pre_with:
                    result.append(token)
                if is_cte and bracket_level == 0 and not was_pre_with and token not in (",", "(", ")"):
                    result.append(token)
                was_pre_with = False

    return result
sqlh/core/keywords.py ADDED
@@ -0,0 +1,62 @@
1
class KeyWords:
    """Upper-case keyword tables consulted by the token-based SQL helper.

    All attributes are plain lists of upper-case strings; callers compare
    ``token.upper()`` against them.
    """

    # Words treated as SQL keywords, i.e. never as table names or aliases.
    keywords = (
        "SELECT INSERT DELETE UPDATE UPSERT REPLACE DROP CREATE ALTER TRUNCATE "
        "WHERE FROM INNER JOIN AND ON OR LIKE IN SET BY GROUP ORDER "
        "LEFT OUTER FULL RIGHT IF END THEN AS ELSE CASE WHEN "
        "DISTINCT OVERWRITE TABLE OVER INTO VIEW NOT EXISTS EXTERNAL WITH "
        "DATABASE TEMPORARY MERGE"
    ).split()

    # Keywords after which the helper expects a target table.
    insert_keywords = "INSERT CREATE".split()

    # Keywords after which the helper expects a source table.
    from_keywords = "FROM JOIN".split()

    # Names that introduce a table function rather than a table after FROM/JOIN.
    table_function_keywords = "UNNEST LATERAL GENERATE_SERIES SEQUENCE".split()