tdfs4ds 0.2.5.4__py3-none-any.whl → 0.2.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +1 -1
- tdfs4ds/feature_store/feature_data_processing.py +7 -7
- tdfs4ds/lineage/__init__.py +21 -0
- tdfs4ds/lineage/indexing.py +501 -0
- tdfs4ds/lineage/lineage.py +409 -0
- tdfs4ds/lineage/network.py +446 -0
- tdfs4ds/utils/lineage.py +2 -0
- tdfs4ds/utils/query_management.py +13 -6
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/METADATA +10 -3
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/RECORD +12 -8
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/WHEEL +1 -1
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
import tdfs4ds
|
|
2
|
+
from tdfs4ds import logger_safe
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Dict, List, Set, Iterable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
_COMMENT_SINGLE = re.compile(r"--[^\n]*")
|
|
9
|
+
_COMMENT_MULTI = re.compile(r"/\*.*?\*/", re.DOTALL | re.MULTILINE)
|
|
10
|
+
|
|
11
|
+
# --- add near the top with other regexes ---
|
|
12
|
+
|
|
13
|
+
_RE_CREATE_TABLE_AS = re.compile(
|
|
14
|
+
r"""
|
|
15
|
+
\bCREATE\s+
|
|
16
|
+
(?:MULTISET\s+|SET\s+|VOLATILE\s+|TEMP(?:ORARY)?\s+)? # qualifiers
|
|
17
|
+
TABLE\s+
|
|
18
|
+
(?P<name>[\w".]+)
|
|
19
|
+
\s+AS\s*\( # <-- require AS (
|
|
20
|
+
""",
|
|
21
|
+
re.IGNORECASE | re.VERBOSE,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_RE_SINGLE_QUOTED_STRING = re.compile(
|
|
26
|
+
r"""
|
|
27
|
+
'
|
|
28
|
+
(?:''|[^'])* # escaped quote or any non-quote char
|
|
29
|
+
'
|
|
30
|
+
""",
|
|
31
|
+
re.VERBOSE | re.DOTALL,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
def _mask_single_quoted_strings(sql: str) -> str:
|
|
35
|
+
"""
|
|
36
|
+
Replace all single-quoted SQL string literals with empty string ''.
|
|
37
|
+
Preserves SQL structure while removing misleading keywords.
|
|
38
|
+
"""
|
|
39
|
+
return _RE_SINGLE_QUOTED_STRING.sub("''", sql)
|
|
40
|
+
|
|
41
|
+
_RE_REPLACE_VIEW_AS = re.compile(
|
|
42
|
+
r"""
|
|
43
|
+
\bREPLACE\s+VIEW\s+
|
|
44
|
+
(?P<name>[\w".]+)
|
|
45
|
+
\s+AS\b
|
|
46
|
+
""",
|
|
47
|
+
re.IGNORECASE | re.VERBOSE,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
_RE_CREATE_OR_REPLACE_VIEW_AS = re.compile(
|
|
51
|
+
r"""
|
|
52
|
+
\b(?:CREATE|REPLACE)\s+VIEW\s+
|
|
53
|
+
(?P<name>[\w".]+)
|
|
54
|
+
\s+AS\b
|
|
55
|
+
""",
|
|
56
|
+
re.IGNORECASE | re.VERBOSE,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
_RE_MERGE_INTO = re.compile(
|
|
60
|
+
r"""
|
|
61
|
+
\bMERGE\s+INTO\s+
|
|
62
|
+
(?P<name>[\w".]+)
|
|
63
|
+
\b
|
|
64
|
+
""",
|
|
65
|
+
re.IGNORECASE | re.VERBOSE,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
_RE_UPDATE = re.compile(
|
|
69
|
+
r"""
|
|
70
|
+
\bUPDATE\s+
|
|
71
|
+
(?!SET\b) # <-- prevents UPDATE SET from matching
|
|
72
|
+
(?P<name>[\w".]+)
|
|
73
|
+
\b
|
|
74
|
+
""",
|
|
75
|
+
re.IGNORECASE | re.VERBOSE,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
_RE_USING = re.compile(
|
|
80
|
+
r"""
|
|
81
|
+
\bUSING\s+(?P<name>[\w".]+)\b
|
|
82
|
+
""",
|
|
83
|
+
re.IGNORECASE | re.VERBOSE,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
_RE_DELETE_FROM = re.compile(
|
|
87
|
+
r"""
|
|
88
|
+
\bDELETE\s+FROM\s+
|
|
89
|
+
(?P<name>[\w".]+)
|
|
90
|
+
\b
|
|
91
|
+
""",
|
|
92
|
+
re.IGNORECASE | re.VERBOSE,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Capture ON <identifier> and ON (<subquery>) inside TD_* table functions
|
|
96
|
+
_RE_TD_FUNC_CALL = re.compile(
|
|
97
|
+
r"""\bTD_[A-Z_0-9]+\s*\(""",
|
|
98
|
+
re.IGNORECASE,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
_RE_TD_ON_CLAUSE = re.compile(
|
|
102
|
+
r"""
|
|
103
|
+
\bON\b\s+
|
|
104
|
+
(?P<arg>
|
|
105
|
+
\([^\)]*\) # naive: one-level parens (good enough for your test cases)
|
|
106
|
+
|
|
|
107
|
+
[\w".]+ # identifier
|
|
108
|
+
)
|
|
109
|
+
""",
|
|
110
|
+
re.IGNORECASE | re.VERBOSE | re.DOTALL,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
_RE_INSERT_INTO = re.compile(
|
|
115
|
+
r"""
|
|
116
|
+
\bINSERT\s+INTO\s+
|
|
117
|
+
(?P<name>[\w".]+)
|
|
118
|
+
\b
|
|
119
|
+
""",
|
|
120
|
+
re.IGNORECASE | re.VERBOSE,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
_RE_CREATE_VIEW_AS = re.compile(
|
|
124
|
+
r"""
|
|
125
|
+
\b(?:CREATE|REPLACE)\s+VIEW\s+
|
|
126
|
+
(?P<name>[\w".]+)
|
|
127
|
+
\s+AS\b
|
|
128
|
+
""",
|
|
129
|
+
re.IGNORECASE | re.VERBOSE,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
_RE_FROM_JOIN = re.compile(
|
|
133
|
+
r"""
|
|
134
|
+
\bFROM\b
|
|
135
|
+
\s+(?P<name>[\w".]+)
|
|
136
|
+
|
|
|
137
|
+
\b(?:INNER\s+JOIN|LEFT\s+JOIN|RIGHT\s+JOIN|FULL\s+OUTER\s+JOIN|CROSS\s+JOIN|JOIN)\b
|
|
138
|
+
\s+(?P<join_name>[\w".]+)
|
|
139
|
+
""",
|
|
140
|
+
re.IGNORECASE | re.VERBOSE,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
_RE_CTE_NAMES = re.compile(
|
|
144
|
+
r"""
|
|
145
|
+
\bWITH\b\s*(?P<first>\w+)\s+AS\s*\(
|
|
146
|
+
|,\s*(?P<next>\w+)\s+AS\s*\(
|
|
147
|
+
""",
|
|
148
|
+
re.IGNORECASE | re.VERBOSE,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
_RE_EXTRACT_FROM = re.compile(
|
|
152
|
+
r"""\bEXTRACT\s*\(\s*[^()]*?\bFROM\b[^()]*?\)""",
|
|
153
|
+
re.IGNORECASE | re.DOTALL,
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
_RE_USING_TABLE = re.compile(
|
|
157
|
+
r"""
|
|
158
|
+
\bUSING\s+(?P<name>[\w".]+)\b
|
|
159
|
+
""",
|
|
160
|
+
re.IGNORECASE | re.VERBOSE,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
def _extract_using_tables(sql: str) -> List[str]:
|
|
164
|
+
out: List[str] = []
|
|
165
|
+
for m in _RE_USING_TABLE.finditer(sql):
|
|
166
|
+
name = m.group("name")
|
|
167
|
+
# Look ahead: if next non-space char is '(' then it's a TD_* USING option, not a table
|
|
168
|
+
j = m.end()
|
|
169
|
+
while j < len(sql) and sql[j].isspace():
|
|
170
|
+
j += 1
|
|
171
|
+
if j < len(sql) and sql[j] == "(":
|
|
172
|
+
continue
|
|
173
|
+
out.append(name)
|
|
174
|
+
return out
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
_RE_TD_FUNC_CALL = re.compile(r"\bTD_[A-Z_0-9]+\s*\(", re.IGNORECASE)


def _extract_td_function_inputs(sql: str) -> List[str]:
    """
    Collect the relations passed through ON clauses to TD_* table functions.

    For every ``TD_xxx( ... )`` call, each ``ON <identifier>`` contributes the
    identifier, and each ``ON (<subquery>)`` contributes the tables named by
    FROM/JOIN inside the subquery.

    NOTE: this name is defined again further down in this module; at import
    time the later definition replaces this one.
    """
    inputs: List[str] = []
    for call in _RE_TD_FUNC_CALL.finditer(sql):
        # call.end() - 1 is the index of the call's opening "(".
        inner = _extract_balanced_parens(sql, call.end() - 1)[1:-1]

        for on_match in _RE_TD_ON_CLAUSE.finditer(inner):
            # Do not strip trailing ")" here: that would remove the closing
            # paren of "(subquery)" and the subquery branch would never run.
            arg = on_match.group("arg").strip()

            if arg.startswith("(") and arg.endswith(")"):
                for ref in _RE_FROM_JOIN.finditer(arg[1:-1]):
                    table = ref.group("name") or ref.group("join_name")
                    if table:
                        inputs.append(table)
            else:
                inputs.append(arg)
    return inputs
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _mask_extract_from(sql: str) -> str:
    """
    Hide the FROM keyword that belongs to ``EXTRACT(<field> FROM <expr>)``.

    Every such expression is swapped for a placeholder containing no FROM
    keyword, so it cannot be read as the start of a FROM clause.
    """
    placeholder = "EXTRACT(/*masked*/)"
    return _RE_EXTRACT_FROM.sub(placeholder, sql)
|
|
203
|
+
|
|
204
|
+
_DEFAULT_DB = "<DEFAULT_DATABASE>"
|
|
205
|
+
|
|
206
|
+
def _is_placeholder(part: str) -> bool:
|
|
207
|
+
return part.strip().upper() == _DEFAULT_DB.upper()
|
|
208
|
+
|
|
209
|
+
def _quote_identifier(name: str) -> str:
    """
    Return ``name`` with every dot-separated part wrapped in double quotes.

    Any alias is dropped first. A name that already contains a double quote
    is returned unchanged, and the default-database placeholder is never
    quoted.
    """
    bare = _strip_alias(name)

    # already quoted identifier: leave as-is
    if '"' in bare:
        return bare

    return ".".join(
        part if part == _DEFAULT_DB else f'"{part}"'  # do NOT quote placeholder
        for part in bare.split(".")
    )
|
|
224
|
+
|
|
225
|
+
def _extract_balanced_parens(sql: str, open_paren_index: int) -> str:
|
|
226
|
+
"""
|
|
227
|
+
Return the substring starting at open_paren_index (which must point to '(')
|
|
228
|
+
up to and including the matching closing ')'.
|
|
229
|
+
Falls back to end-of-string if unbalanced.
|
|
230
|
+
"""
|
|
231
|
+
depth = 0
|
|
232
|
+
for i in range(open_paren_index, len(sql)):
|
|
233
|
+
ch = sql[i]
|
|
234
|
+
if ch == "(":
|
|
235
|
+
depth += 1
|
|
236
|
+
elif ch == ")":
|
|
237
|
+
depth -= 1
|
|
238
|
+
if depth == 0:
|
|
239
|
+
return sql[open_paren_index : i + 1]
|
|
240
|
+
return sql[open_paren_index:]
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _qualify_default_db(name: str, default_db: str) -> str:
    """
    Prefix an unqualified object name with ``default_db``.

    Any alias is dropped first. The name is returned as-is when it already
    contains a dot, starts with a double quote, or starts with
    ``default_db``; otherwise ``"<default_db>.<name>"`` is returned.
    """
    bare = _strip_alias(name)
    needs_prefix = not (
        "." in bare or bare.startswith('"') or bare.startswith(default_db)
    )
    if needs_prefix:
        return f"{default_db}.{bare}"
    return bare
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _remove_sql_comments(sql: str) -> str:
|
|
252
|
+
sql = _COMMENT_SINGLE.sub("", sql)
|
|
253
|
+
sql = _COMMENT_MULTI.sub("", sql)
|
|
254
|
+
return sql
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _extract_cte_names(sql: str) -> Set[str]:
    """
    Return the lower-cased names of the CTEs declared in ``sql``.

    Names come from the "first" and "next" groups of ``_RE_CTE_NAMES``.
    """
    captured = (
        match.group("first") or match.group("next")
        for match in _RE_CTE_NAMES.finditer(sql)
    )
    return {cte.lower() for cte in captured if cte}
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _strip_alias(name: str) -> str:
|
|
267
|
+
return name.strip().split()[0]
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# Placeholder database name prepended to unqualified object names.
_DEFAULT_DB = "<DEFAULT_DATABASE>"


def _quote_identifier(name: str) -> str:
    """
    Double-quote each dot-separated part of ``name``.

    The alias, if any, is removed first. Names that already contain a double
    quote are returned untouched; the default-database placeholder is kept
    unquoted.
    """
    identifier = _strip_alias(name)
    if identifier.find('"') != -1:
        return identifier

    def _wrap(segment: str) -> str:
        # keep placeholder unquoted
        return segment if segment == _DEFAULT_DB else f'"{segment}"'

    return ".".join(map(_wrap, identifier.split(".")))
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
|
|
289
|
+
seen: Set[str] = set()
|
|
290
|
+
out: List[str] = []
|
|
291
|
+
for x in items:
|
|
292
|
+
if x not in seen:
|
|
293
|
+
out.append(x)
|
|
294
|
+
seen.add(x)
|
|
295
|
+
return out
|
|
296
|
+
|
|
297
|
+
_RE_TD_FUNC_CALL = re.compile(r"\bTD_[A-Z_0-9]+\s*\(", re.IGNORECASE)

# Start of an ON argument, and the identifier form of that argument.
_RE_TD_ON_START = re.compile(r"\bON\b\s+", re.IGNORECASE)
_RE_TD_ON_IDENTIFIER = re.compile(r'[\w".]+')


def _extract_td_function_inputs(sql: str) -> List[str]:
    """
    Collect the relations passed through ON clauses to TD_* table functions.

    For every ``TD_xxx( ... )`` call found in ``sql``:

    * ``ON <identifier>`` contributes the identifier;
    * ``ON (<subquery>)`` contributes every table named by FROM/JOIN inside
      the subquery.

    The subquery is delimited by balanced-parenthesis matching, so nested
    parentheses (``COUNT(*)``, nested selects) do not truncate it, and a join
    condition inside it (``JOIN b ON a.id = b.id``) is skipped along with the
    rest of the subquery instead of being reported as an input table.

    Returns:
        Raw names in order of appearance; duplicates are kept.
    """
    inputs: List[str] = []
    for call in _RE_TD_FUNC_CALL.finditer(sql):
        # call.end() - 1 is the index of the call's opening "(".
        inner = _extract_balanced_parens(sql, call.end() - 1)[1:-1]

        pos = 0
        while True:
            on_match = _RE_TD_ON_START.search(inner, pos)
            if on_match is None:
                break
            pos = on_match.end()

            if inner.startswith("(", pos):
                subquery = _extract_balanced_parens(inner, pos)
                # Resume after the whole subquery so its inner ON keywords
                # are not scanned again as TD_* inputs.
                pos += len(subquery)
                body = subquery[1:-1] if subquery.endswith(")") else subquery[1:]
                for ref in _RE_FROM_JOIN.finditer(body):
                    table = ref.group("name") or ref.group("join_name")
                    if table:
                        inputs.append(table)
                continue

            ident = _RE_TD_ON_IDENTIFIER.match(inner, pos)
            if ident is not None:
                inputs.append(ident.group())
                pos = ident.end()
    return inputs
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def analyze_sql_query(sql_query: str, *, debug: bool = False) -> Dict[str, List[str]]:
    """
    Extracts source and target tables/views from a SQL query.

    The query is first cleaned (comments removed, string literals masked,
    ``EXTRACT(... FROM ...)`` masked). Targets and sources are then collected
    with the module's regular expressions, stripped of aliases, filtered
    against CTE names, qualified with the default-database placeholder,
    quoted and de-duplicated.

    Args:
        sql_query: SQL text to analyse.
        debug: When True, intermediate and final results are sent to
            ``logger_safe`` at "debug" level.

    Returns:
        A dict with two keys, ``"source"`` and ``"target"``, each mapping to
        a list of quoted names in order of first appearance.
    """
    # Cleaning order: comments, then string literals, then EXTRACT(... FROM ...).
    sql = _mask_extract_from(
        _mask_single_quoted_strings(_remove_sql_comments(sql_query))
    )
    cte_names = _extract_cte_names(sql)

    def _names(pattern) -> List[str]:
        # Every "name" capture of ``pattern`` in the cleaned SQL.
        return [hit.group("name") for hit in pattern.finditer(sql)]

    # --- sources: USING <table> (DELETE ... USING, MERGE ... USING) ---
    sources_raw: List[str] = list(_extract_using_tables(sql))

    # --- UPDATE targets are recorded as both target and source ---
    # _RE_UPDATE carries (?!SET\b) so MERGE's "UPDATE SET" is not matched.
    update_targets = _names(_RE_UPDATE)
    targets_raw: List[str] = list(update_targets)
    sources_raw.extend(update_targets)

    # --- remaining targets, in a fixed order ---
    for pattern in (
        _RE_CREATE_TABLE_AS,
        _RE_INSERT_INTO,
        _RE_CREATE_OR_REPLACE_VIEW_AS,
        _RE_REPLACE_VIEW_AS,
        _RE_MERGE_INTO,
        _RE_DELETE_FROM,
    ):
        targets_raw.extend(_names(pattern))

    # --- sources: FROM/JOIN ---
    sql_length = len(sql)
    for hit in _RE_FROM_JOIN.finditer(sql):
        name = (hit.group("name") or hit.group("join_name") or "").strip()
        if not name:
            continue

        # Find the first non-whitespace character after the match.
        cursor = hit.end()
        while cursor < sql_length and sql[cursor].isspace():
            cursor += 1
        followed_by_paren = cursor < sql_length and sql[cursor] == "("

        # FROM TD_xxx( ... ) is a table-function call, not a table.
        if name.upper().startswith("TD_") and followed_by_paren:
            continue

        # Skip derived tables like FROM (SELECT ...) alias
        if name.startswith("("):
            continue

        sources_raw.append(name)

    # --- sources: TD_* ON inputs ---
    sources_raw.extend(_extract_td_function_inputs(sql))

    if debug:
        logger_safe("debug", "CTE names detected: %s", sorted(cte_names))
        logger_safe("debug", "Raw target matches: %s", targets_raw)
        logger_safe("debug", "Raw source matches: %s", sources_raw)

    def normalize_and_filter(names: Iterable[str], *, qualify_default: bool) -> List[str]:
        # Clean, filter, qualify and quote raw names, then de-duplicate.
        cleaned: List[str] = []
        for raw in names:
            candidate = _strip_alias(raw).rstrip(",);")

            # Drop empty names and references to CTEs.
            if not candidate or candidate.lower() in cte_names:
                continue

            if qualify_default:
                candidate = _qualify_default_db(candidate, _DEFAULT_DB)

            cleaned.append(_quote_identifier(candidate))
        return _dedupe_preserve_order(cleaned)

    result = {
        "source": normalize_and_filter(sources_raw, qualify_default=True),
        "target": normalize_and_filter(targets_raw, qualify_default=True),
    }

    if debug:
        logger_safe("debug", "Final parsed result: %s", result)

    return result
|
|
408
|
+
|
|
409
|
+
|