tdfs4ds 0.2.5.4__py3-none-any.whl → 0.2.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
1
+ import tdfs4ds
2
+ from tdfs4ds import logger_safe
3
+
4
+ import re
5
+ from typing import Dict, List, Set, Iterable
6
+
7
+
8
+ _COMMENT_SINGLE = re.compile(r"--[^\n]*")
9
+ _COMMENT_MULTI = re.compile(r"/\*.*?\*/", re.DOTALL | re.MULTILINE)
10
+
11
+ # --- add near the top with other regexes ---
12
+
13
+ _RE_CREATE_TABLE_AS = re.compile(
14
+ r"""
15
+ \bCREATE\s+
16
+ (?:MULTISET\s+|SET\s+|VOLATILE\s+|TEMP(?:ORARY)?\s+)? # qualifiers
17
+ TABLE\s+
18
+ (?P<name>[\w".]+)
19
+ \s+AS\s*\( # <-- require AS (
20
+ """,
21
+ re.IGNORECASE | re.VERBOSE,
22
+ )
23
+
24
+
25
+ _RE_SINGLE_QUOTED_STRING = re.compile(
26
+ r"""
27
+ '
28
+ (?:''|[^'])* # escaped quote or any non-quote char
29
+ '
30
+ """,
31
+ re.VERBOSE | re.DOTALL,
32
+ )
33
+
34
+ def _mask_single_quoted_strings(sql: str) -> str:
35
+ """
36
+ Replace all single-quoted SQL string literals with empty string ''.
37
+ Preserves SQL structure while removing misleading keywords.
38
+ """
39
+ return _RE_SINGLE_QUOTED_STRING.sub("''", sql)
40
+
41
+ _RE_REPLACE_VIEW_AS = re.compile(
42
+ r"""
43
+ \bREPLACE\s+VIEW\s+
44
+ (?P<name>[\w".]+)
45
+ \s+AS\b
46
+ """,
47
+ re.IGNORECASE | re.VERBOSE,
48
+ )
49
+
50
+ _RE_CREATE_OR_REPLACE_VIEW_AS = re.compile(
51
+ r"""
52
+ \b(?:CREATE|REPLACE)\s+VIEW\s+
53
+ (?P<name>[\w".]+)
54
+ \s+AS\b
55
+ """,
56
+ re.IGNORECASE | re.VERBOSE,
57
+ )
58
+
59
+ _RE_MERGE_INTO = re.compile(
60
+ r"""
61
+ \bMERGE\s+INTO\s+
62
+ (?P<name>[\w".]+)
63
+ \b
64
+ """,
65
+ re.IGNORECASE | re.VERBOSE,
66
+ )
67
+
68
+ _RE_UPDATE = re.compile(
69
+ r"""
70
+ \bUPDATE\s+
71
+ (?!SET\b) # <-- prevents UPDATE SET from matching
72
+ (?P<name>[\w".]+)
73
+ \b
74
+ """,
75
+ re.IGNORECASE | re.VERBOSE,
76
+ )
77
+
78
+
79
+ _RE_USING = re.compile(
80
+ r"""
81
+ \bUSING\s+(?P<name>[\w".]+)\b
82
+ """,
83
+ re.IGNORECASE | re.VERBOSE,
84
+ )
85
+
86
+ _RE_DELETE_FROM = re.compile(
87
+ r"""
88
+ \bDELETE\s+FROM\s+
89
+ (?P<name>[\w".]+)
90
+ \b
91
+ """,
92
+ re.IGNORECASE | re.VERBOSE,
93
+ )
94
+
95
+ # Capture ON <identifier> and ON (<subquery>) inside TD_* table functions
96
+ _RE_TD_FUNC_CALL = re.compile(
97
+ r"""\bTD_[A-Z_0-9]+\s*\(""",
98
+ re.IGNORECASE,
99
+ )
100
+
101
+ _RE_TD_ON_CLAUSE = re.compile(
102
+ r"""
103
+ \bON\b\s+
104
+ (?P<arg>
105
+ \([^\)]*\) # naive: one-level parens (good enough for your test cases)
106
+ |
107
+ [\w".]+ # identifier
108
+ )
109
+ """,
110
+ re.IGNORECASE | re.VERBOSE | re.DOTALL,
111
+ )
112
+
113
+
114
+ _RE_INSERT_INTO = re.compile(
115
+ r"""
116
+ \bINSERT\s+INTO\s+
117
+ (?P<name>[\w".]+)
118
+ \b
119
+ """,
120
+ re.IGNORECASE | re.VERBOSE,
121
+ )
122
+
123
+ _RE_CREATE_VIEW_AS = re.compile(
124
+ r"""
125
+ \b(?:CREATE|REPLACE)\s+VIEW\s+
126
+ (?P<name>[\w".]+)
127
+ \s+AS\b
128
+ """,
129
+ re.IGNORECASE | re.VERBOSE,
130
+ )
131
+
132
+ _RE_FROM_JOIN = re.compile(
133
+ r"""
134
+ \bFROM\b
135
+ \s+(?P<name>[\w".]+)
136
+ |
137
+ \b(?:INNER\s+JOIN|LEFT\s+JOIN|RIGHT\s+JOIN|FULL\s+OUTER\s+JOIN|CROSS\s+JOIN|JOIN)\b
138
+ \s+(?P<join_name>[\w".]+)
139
+ """,
140
+ re.IGNORECASE | re.VERBOSE,
141
+ )
142
+
143
+ _RE_CTE_NAMES = re.compile(
144
+ r"""
145
+ \bWITH\b\s*(?P<first>\w+)\s+AS\s*\(
146
+ |,\s*(?P<next>\w+)\s+AS\s*\(
147
+ """,
148
+ re.IGNORECASE | re.VERBOSE,
149
+ )
150
+
151
+ _RE_EXTRACT_FROM = re.compile(
152
+ r"""\bEXTRACT\s*\(\s*[^()]*?\bFROM\b[^()]*?\)""",
153
+ re.IGNORECASE | re.DOTALL,
154
+ )
155
+
156
+ _RE_USING_TABLE = re.compile(
157
+ r"""
158
+ \bUSING\s+(?P<name>[\w".]+)\b
159
+ """,
160
+ re.IGNORECASE | re.VERBOSE,
161
+ )
162
+
163
+ def _extract_using_tables(sql: str) -> List[str]:
164
+ out: List[str] = []
165
+ for m in _RE_USING_TABLE.finditer(sql):
166
+ name = m.group("name")
167
+ # Look ahead: if next non-space char is '(' then it's a TD_* USING option, not a table
168
+ j = m.end()
169
+ while j < len(sql) and sql[j].isspace():
170
+ j += 1
171
+ if j < len(sql) and sql[j] == "(":
172
+ continue
173
+ out.append(name)
174
+ return out
175
+
176
+
177
# NOTE: both this pattern and the function below are defined again further
# down in this module; the later definitions replace these at import time.
_RE_TD_FUNC_CALL = re.compile(r"\bTD_[A-Z_0-9]+\s*\(", flags=re.IGNORECASE)


def _extract_td_function_inputs(sql: str) -> List[str]:
    """Collect the inputs named in ON clauses of TD_* function calls.

    For every TD_* call the text between its parentheses is scanned for
    ``ON <arg>`` clauses.  Trailing ``,``, ``)`` and ``;`` characters are
    stripped from the argument.  An argument that still starts with ``(`` and
    ends with ``)`` is searched for FROM/JOIN table names; any other argument
    is returned as-is.

    Args:
        sql: SQL text to scan.

    Returns:
        Input names in order of appearance (duplicates are kept).
    """
    found: List[str] = []
    for call in _RE_TD_FUNC_CALL.finditer(sql):
        # call.end() - 1 is the index of the opening "(" of the call.
        body = _extract_balanced_parens(sql, call.end() - 1)[1:-1]
        for clause in _RE_TD_ON_CLAUSE.finditer(body):
            operand = clause.group("arg").strip().rstrip(",);")
            if not (operand.startswith("(") and operand.endswith(")")):
                found.append(operand)
                continue
            for ref in _RE_FROM_JOIN.finditer(operand[1:-1]):
                table = ref.group("name") or ref.group("join_name")
                if table:
                    found.append(table)
    return found
197
+
198
+
199
+
200
+ def _mask_extract_from(sql: str) -> str:
201
+ # Replace EXTRACT( ... FROM ... ) with a placeholder that contains no FROM keyword
202
+ return _RE_EXTRACT_FROM.sub("EXTRACT(/*masked*/)", sql)
203
+
204
+ _DEFAULT_DB = "<DEFAULT_DATABASE>"
205
+
206
+ def _is_placeholder(part: str) -> bool:
207
+ return part.strip().upper() == _DEFAULT_DB.upper()
208
+
209
# NOTE: _quote_identifier is defined again further down in this module; the
# later definition replaces this one at import time.
def _quote_identifier(name: str) -> str:
    """Return *name* with each dot-separated part wrapped in double quotes.

    Any alias after the name is dropped first.  A name that already contains
    a double quote is returned unchanged, and the default-database
    placeholder is never quoted.
    """
    bare = _strip_alias(name)

    # Already (partly) quoted: do not touch.
    if '"' in bare:
        return bare

    return ".".join(
        part if part == _DEFAULT_DB else f'"{part}"'
        for part in bare.split(".")
    )
224
+
225
+ def _extract_balanced_parens(sql: str, open_paren_index: int) -> str:
226
+ """
227
+ Return the substring starting at open_paren_index (which must point to '(')
228
+ up to and including the matching closing ')'.
229
+ Falls back to end-of-string if unbalanced.
230
+ """
231
+ depth = 0
232
+ for i in range(open_paren_index, len(sql)):
233
+ ch = sql[i]
234
+ if ch == "(":
235
+ depth += 1
236
+ elif ch == ")":
237
+ depth -= 1
238
+ if depth == 0:
239
+ return sql[open_paren_index : i + 1]
240
+ return sql[open_paren_index:]
241
+
242
+
243
def _qualify_default_db(name: str, default_db: str) -> str:
    """Prefix an unqualified object name with *default_db*.

    Any alias after the name is dropped first.  The name is returned
    unchanged when it contains a dot, starts with a double quote, or already
    starts with *default_db*.

    Args:
        name: Object name, optionally followed by an alias.
        default_db: Database prefix to apply.

    Returns:
        ``"<default_db>.<name>"`` for a bare name, otherwise the name itself.
    """
    bare = _strip_alias(name)
    needs_no_prefix = "." in bare or bare.startswith(('"', default_db))
    return bare if needs_no_prefix else f"{default_db}.{bare}"
248
+
249
+
250
+
251
+ def _remove_sql_comments(sql: str) -> str:
252
+ sql = _COMMENT_SINGLE.sub("", sql)
253
+ sql = _COMMENT_MULTI.sub("", sql)
254
+ return sql
255
+
256
+
257
def _extract_cte_names(sql: str) -> Set[str]:
    """Return the lower-cased names of the CTEs declared in *sql*.

    Names come from the ``first`` and ``next`` groups of ``_RE_CTE_NAMES``.
    """
    declared = (
        hit.group("first") or hit.group("next")
        for hit in _RE_CTE_NAMES.finditer(sql)
    )
    return {cte.lower() for cte in declared if cte}
264
+
265
+
266
+ def _strip_alias(name: str) -> str:
267
+ return name.strip().split()[0]
268
+
269
+
270
# Placeholder used in place of a database name for unqualified objects.
_DEFAULT_DB = "<DEFAULT_DATABASE>"


def _quote_identifier(name: str) -> str:
    """Wrap every dot-separated part of *name* in double quotes.

    Any alias after the name is dropped first.  A name that already contains
    a double quote is returned as-is, and a part equal to the
    default-database placeholder is left unquoted.

    Args:
        name: Object name, optionally followed by an alias.

    Returns:
        The quoted, dot-joined identifier.
    """
    bare = _strip_alias(name)
    if '"' in bare:
        return bare

    def _quote(part: str) -> str:
        # keep placeholder unquoted
        return part if part == _DEFAULT_DB else f'"{part}"'

    return ".".join(map(_quote, bare.split(".")))
285
+
286
+
287
+
288
+ def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
289
+ seen: Set[str] = set()
290
+ out: List[str] = []
291
+ for x in items:
292
+ if x not in seen:
293
+ out.append(x)
294
+ seen.add(x)
295
+ return out
296
+
297
# A TD_* function name followed by its opening parenthesis.
_RE_TD_FUNC_CALL = re.compile(r"\bTD_[A-Z_0-9]+\s*\(", re.IGNORECASE)


def _extract_td_function_inputs(sql: str) -> List[str]:
    """Collect the input tables named in ON clauses of TD_* function calls.

    For every TD_* call the text between its parentheses is scanned for
    ``ON <arg>`` clauses:

    * ``ON <identifier>`` contributes the identifier;
    * ``ON (<subquery>)`` contributes every FROM/JOIN table name found in the
      subquery.

    The subquery is delimited by parenthesis balancing rather than by the
    first ``)``.  Stopping at the first ``)`` cut subqueries containing a
    nested call such as ``COUNT(*)`` short, and a join condition further on
    (``... JOIN b ON a.id = b.id``) was then reported as an input table.
    A TD_* call nested inside a subquery is not reported as a table; its own
    ON clauses are handled when the scan reaches that call.

    Args:
        sql: SQL text, expected to be already stripped of comments and
            string literals by the caller.

    Returns:
        Input names in order of appearance (duplicates are kept).
    """
    inputs: List[str] = []
    for call in _RE_TD_FUNC_CALL.finditer(sql):
        # call.end() - 1 is the index of the opening "(" of the call.
        parens = _extract_balanced_parens(sql, call.end() - 1)
        # On the unbalanced fallback there is no closing ")" to strip.
        balanced = parens.count("(") == parens.count(")")
        inner = parens[1:-1] if balanced else parens[1:]

        pos = 0
        while True:
            clause = _RE_TD_ON_CLAUSE.search(inner, pos)
            if clause is None:
                break

            arg_start = clause.start("arg")
            if inner[arg_start] != "(":
                inputs.append(clause.group("arg"))
                pos = clause.end()
                continue

            subquery = _extract_balanced_parens(inner, arg_start)
            for ref in _RE_FROM_JOIN.finditer(subquery):
                table = ref.group("name") or ref.group("join_name")
                if not table:
                    continue
                # Skip "FROM TD_xxx (" - a nested function call, not a table.
                after = ref.end()
                while after < len(subquery) and subquery[after].isspace():
                    after += 1
                is_call = after < len(subquery) and subquery[after] == "("
                if is_call and table.upper().startswith("TD_"):
                    continue
                inputs.append(table)
            # Resume after the whole subquery so that ON keywords inside it
            # (join conditions) are not read as TD_* ON clauses.
            pos = arg_start + len(subquery)
    return inputs
316
+
317
+
318
+
319
def analyze_sql_query(sql_query: str, *, debug: bool = False) -> Dict[str, List[str]]:
    """Extract the source and target tables/views referenced by a SQL query.

    The query is first stripped of comments, string literals and
    ``EXTRACT(... FROM ...)`` calls.  Targets are the objects of UPDATE,
    CREATE TABLE ... AS, INSERT INTO, CREATE/REPLACE VIEW, MERGE INTO and
    DELETE FROM.  Sources are USING tables, UPDATE targets, every FROM/JOIN
    reference (which includes the table of ``DELETE FROM``) and the ON inputs
    of TD_* functions.  CTE names are excluded from both lists.

    Args:
        sql_query: SQL text to analyse.
        debug: When true, intermediate and final results are logged at debug
            level through ``logger_safe``.

    Returns:
        A dict with the keys ``"source"`` and ``"target"``.  Each value is a
        de-duplicated list, in first-seen order, of double-quoted
        identifiers; names without a database are prefixed with the
        ``<DEFAULT_DATABASE>`` placeholder.
    """
    # Comments first, then literals, then EXTRACT(... FROM ...) calls.
    cleaned = _mask_extract_from(
        _mask_single_quoted_strings(_remove_sql_comments(sql_query))
    )
    cte_names = _extract_cte_names(cleaned)
    total = len(cleaned)

    def _names(pattern) -> List[str]:
        """All ``name`` captures of *pattern* in the cleaned SQL."""
        return [hit.group("name") for hit in pattern.finditer(cleaned)]

    # UPDATE targets are both written and read.
    # _RE_UPDATE must reject "UPDATE SET" so MERGE clauses are not matched.
    updated = _names(_RE_UPDATE)

    # --- sources: USING tables, then UPDATE targets ---
    sources_raw: List[str] = []
    sources_raw.extend(_extract_using_tables(cleaned))
    sources_raw.extend(updated)

    # --- targets ---
    targets_raw: List[str] = list(updated)
    for target_pattern in (
        _RE_CREATE_TABLE_AS,
        _RE_INSERT_INTO,
        _RE_CREATE_OR_REPLACE_VIEW_AS,
        _RE_REPLACE_VIEW_AS,
        _RE_MERGE_INTO,
        _RE_DELETE_FROM,
    ):
        targets_raw.extend(_names(target_pattern))

    # --- sources: FROM/JOIN ---
    for hit in _RE_FROM_JOIN.finditer(cleaned):
        ref = (hit.group("name") or hit.group("join_name") or "").strip()
        if not ref:
            continue

        # Find the next non-space character after the reference.
        nxt = hit.end()
        while nxt < total and cleaned[nxt].isspace():
            nxt += 1
        followed_by_paren = nxt < total and cleaned[nxt] == "("

        # A TD_* function call (TD_UNPIVOT(...)) is not a table.
        if followed_by_paren and ref.upper().startswith("TD_"):
            continue

        # Skip derived tables like FROM (SELECT ...) alias
        if ref.startswith("("):
            continue

        sources_raw.append(ref)

    # --- sources: TD_* ON inputs ---
    sources_raw.extend(_extract_td_function_inputs(cleaned))

    if debug:
        logger_safe("debug", "CTE names detected: %s", sorted(cte_names))
        logger_safe("debug", "Raw target matches: %s", targets_raw)
        logger_safe("debug", "Raw source matches: %s", sources_raw)

    def normalize_and_filter(names: Iterable[str], *, qualify_default: bool) -> List[str]:
        """Clean raw matches: drop aliases, punctuation and CTEs, then quote."""
        kept: List[str] = []
        for raw in names:
            ident = _strip_alias(raw).rstrip(",);")
            # Nothing left, or a CTE rather than a real object.
            if not ident or ident.lower() in cte_names:
                continue
            if qualify_default:
                ident = _qualify_default_db(ident, _DEFAULT_DB)
            kept.append(_quote_identifier(ident))
        return _dedupe_preserve_order(kept)

    result = {
        "source": normalize_and_filter(sources_raw, qualify_default=True),
        "target": normalize_and_filter(targets_raw, qualify_default=True),
    }

    if debug:
        logger_safe("debug", "Final parsed result: %s", result)

    return result
408
+
409
+