tdfs4ds-0.2.5.4-py3-none-any.whl → tdfs4ds-0.2.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +1 -1
- tdfs4ds/feature_store/feature_data_processing.py +7 -7
- tdfs4ds/lineage/__init__.py +21 -0
- tdfs4ds/lineage/indexing.py +501 -0
- tdfs4ds/lineage/lineage.py +409 -0
- tdfs4ds/lineage/network.py +446 -0
- tdfs4ds/utils/lineage.py +2 -0
- tdfs4ds/utils/query_management.py +13 -6
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/METADATA +10 -3
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/RECORD +12 -8
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/WHEEL +1 -1
- {tdfs4ds-0.2.5.4.dist-info → tdfs4ds-0.2.5.5.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -231,7 +231,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     list_entity_id = [entity_id]
 
     # Character set handling / pass-through
-    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).splitlines()}
     var_temp2 = []
     for k, v in res.items():
         if 'UNICODE' in v:
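The new line in this hunk splits the rendered tdtypes output line by line to build a {column: type} mapping. A minimal standalone sketch of that comprehension, using a hypothetical tdtypes rendering rather than a live teradataml DataFrame:

    # Hypothetical string mimicking str(df[feature_names].tdtypes); for illustration only.
    tdtypes_str = (
        "customer_id INTEGER\n"
        "name VARCHAR(length=50, charset='UNICODE')"
    )
    res = {x.split()[0]: ''.join(x.split()[1::]) for x in tdtypes_str.splitlines()}
    # res == {'customer_id': 'INTEGER',
    #         'name': "VARCHAR(length=50,charset='UNICODE')"}
    # The context lines that follow then check each value for 'UNICODE'
    # to decide on character-set handling.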
@@ -303,7 +303,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         tdml.execute_sql(query_create_volatile)
         logger_safe('info', 'results calculated and materialized in a volatile table')
     except Exception as e:
-        logger_safe('error', f"query execution failed : {str(e).
+        logger_safe('error', f"query execution failed : {str(e).splitlines()[0]}")
         raise
 
 
@@ -334,7 +334,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         # else: no duplicates
         # logger_safe("info", "No duplicate found.") # optional
     except Exception as e:
-        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).splitlines()[0])
         raise
 
     if getattr(tdfs4ds, "DEBUG_MODE", False):
@@ -783,7 +783,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     try:
         display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
     except Exception as e:
-        logger_safe("warning", "display_table failed: %s", str(e).
+        logger_safe("warning", "display_table failed: %s", str(e).splitlines()[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
@@ -870,7 +870,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
 
     for q in queries:
         if getattr(tdfs4ds, "DEBUG_MODE", False):
-            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.
+            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.splitlines()[0:3]))
         execute_query(q)
 
     elapsed_time = time.time() - start_time
@@ -881,7 +881,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
             formatted_elapsed_time, elapsed_time
         )
     except Exception as e:
-        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).splitlines()[0])
         raise
 
     return count_features.NB_ROWS.values[0]
@@ -1028,7 +1028,7 @@ def prepare_feature_ingestion_tdstone2(df, entity_id):
         tdml.execute_sql(query)
     except Exception as e:
         if tdfs4ds.DISPLAY_LOGS:
-            logger_safe('debug',str(e).
+            logger_safe('debug',str(e).splitlines()[0])
         tdml.execute_sql(f'DELETE {volatile_table_name}')
 
     # Optionally print the query if the display flag is set.
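The logging changes in this file all share one pattern: reduce a potentially multi-line string to a bounded excerpt before logging, via splitlines(). A minimal standalone sketch of the two variants used above (first line of an exception message, first three lines of a merge query); the sample strings are invented:

    # First line of an exception message (the error/warning/exception logs above).
    try:
        raise ValueError("syntax error at position 42\nfull statement follows:\nSELECT ...")
    except Exception as e:
        first_line = str(e).splitlines()[0]   # 'syntax error at position 42'

    # First three lines of a query (the debug log above).
    q = "MERGE INTO tgt\nUSING src\nON tgt.id = src.id\nWHEN MATCHED THEN UPDATE SET ..."
    head = "\n".join(q.splitlines()[0:3])     # 'MERGE INTO tgt\nUSING src\nON tgt.id = src.id'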
tdfs4ds/lineage/__init__.py
ADDED
@@ -0,0 +1,21 @@
+from .lineage import (
+    analyze_sql_query
+)
+
+from .indexing import (
+    analyze_teradata_ddl,
+)
+
+from .network import (
+    build_teradata_dependency_graph,
+    plot_lineage_sankey,
+    show_plotly_robust
+)
+
+__all__ = [
+    "analyze_sql_query",
+    "analyze_teradata_ddl",
+    "build_teradata_dependency_graph",
+    "plot_lineage_sankey",
+    "show_plotly_robust"
+]
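The new subpackage re-exports one set of public entry points per module (lineage.py, indexing.py, network.py). A minimal import sketch, assuming tdfs4ds 0.2.5.5 is installed; the call signatures are not visible in this diff, so only the imports are shown:

    # Public surface of the new tdfs4ds.lineage subpackage.
    from tdfs4ds.lineage import (
        analyze_sql_query,                # from lineage.py
        analyze_teradata_ddl,             # from indexing.py
        build_teradata_dependency_graph,  # from network.py
        plot_lineage_sankey,              # from network.py
        show_plotly_robust,               # from network.py
    )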
tdfs4ds/lineage/indexing.py
ADDED
@@ -0,0 +1,501 @@
+import re
+from typing import List, Dict, Any
+
+
+def _strip_sql_comments(sql: str) -> str:
+    """
+    Remove Teradata-style comments:
+    - /* ... */ block comments
+    - -- ... end-of-line comments
+    BUT do not treat comment markers inside single-quoted literals or double-quoted identifiers as comments.
+    """
+    out = []
+    i = 0
+    n = len(sql)
+
+    in_squote = False # '...'
+    in_dquote = False # "..."
+
+    while i < n:
+        ch = sql[i]
+
+        # Toggle double-quoted identifiers
+        if not in_squote and ch == '"':
+            in_dquote = not in_dquote
+            out.append(ch)
+            i += 1
+            continue
+
+        # Handle single-quoted literals with doubled quotes '' escape
+        if not in_dquote and ch == "'":
+            if in_squote:
+                # if this is an escaped single quote inside a literal: ''
+                if i + 1 < n and sql[i + 1] == "'":
+                    out.append("''")
+                    i += 2
+                    continue
+                # end literal
+                in_squote = False
+                out.append(ch)
+                i += 1
+                continue
+            else:
+                in_squote = True
+                out.append(ch)
+                i += 1
+                continue
+
+        # If not inside quotes, detect comments
+        if not in_squote and not in_dquote:
+            # Line comment --
+            if ch == "-" and i + 1 < n and sql[i + 1] == "-":
+                # skip until newline (but keep newline if present)
+                i += 2
+                while i < n and sql[i] != "\n":
+                    i += 1
+                # keep the newline if any
+                if i < n and sql[i] == "\n":
+                    out.append("\n")
+                    i += 1
+                continue
+
+            # Block comment /* ... */
+            if ch == "/" and i + 1 < n and sql[i + 1] == "*":
+                i += 2
+                while i + 1 < n and not (sql[i] == "*" and sql[i + 1] == "/"):
+                    i += 1
+                i += 2 if i + 1 < n else 0
+                out.append(" ")
+                continue
+
+        # normal character
+        out.append(ch)
+        i += 1
+
+    return "".join(out)
+
+
+
+def _compress_whitespace(sql: str) -> str:
+    return re.sub(r"\s+", " ", sql).strip()
+
+
+def _extract_parenthesized_list(text: str, start_idx: int):
+    """
+    Given text and index pointing at an opening '(',
+    return (content_inside_parens, index_after_closing_paren).
+    """
+    if start_idx >= len(text) or text[start_idx] != "(":
+        raise ValueError("start_idx must point to '('")
+
+    depth = 1
+    i = start_idx + 1
+    content_chars = []
+
+    while i < len(text) and depth > 0:
+        ch = text[i]
+        if ch == "(":
+            depth += 1
+            content_chars.append(ch)
+        elif ch == ")":
+            depth -= 1
+            if depth > 0:
+                content_chars.append(ch)
+        else:
+            content_chars.append(ch)
+        i += 1
+
+    return "".join(content_chars).strip(), i
+
+
+def _split_top_level_commas(expr: str) -> List[str]:
+    """Split by commas that are not inside parentheses."""
+    parts, buf = [], []
+    depth = 0
+    for ch in expr:
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth = max(0, depth - 1)
+
+        if ch == "," and depth == 0:
+            part = "".join(buf).strip()
+            if part:
+                parts.append(part)
+            buf = []
+        else:
+            buf.append(ch)
+
+    tail = "".join(buf).strip()
+    if tail:
+        parts.append(tail)
+    return parts
+
+
+def _normalize_identifier(ident: str) -> str:
+    ident = ident.strip()
+    if "." in ident:
+        ident = ident.split(".")[-1].strip()
+    if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
+        ident = ident[1:-1]
+    return ident.strip()
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so that:
+    - keywords inside literals can't be detected
+    - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
+
+
+# --- Partition parsing helpers (expects literals already masked in the input) ---
+
+_PARTITION_STOPWORDS = {
+    "range_n", "case_n", "columnar","case",
+    "between", "and", "or", "not", "is", "null", "no", "range",
+    "in", "like", "exists", "distinct",
+    "each", "interval", "day", "month", "year", "from", "to", "every",
+    "cast", "extract", "coalesce", "nullif", "trim", "substr", "substring",
+    "current_date", "current_timestamp",
+    "date", "timestamp", "integer", "smallint", "bigint", "byteint", "decimal", "float",
+    "when", "then", "else", "end",
+    "format", "zone", "as",
+}
+
+
+def _find_identifiers(s: str) -> List[str]:
+    # Collapse quoted qualifiers: "db"."table".col -> col
+    s = re.sub(r'"[^"]+"\s*\.\s*', '', s)
+
+    toks = re.findall(
+        r'"[^"]+"|[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*',
+        s
+    )
+    out = [_normalize_identifier(t) for t in toks]
+    return [x for x in out if x]
+
+
+
+def _columns_from_chunk(masked_chunk: str) -> List[str]:
+    cols = []
+    for c in _find_identifiers(masked_chunk):
+        if c.lower() not in _PARTITION_STOPWORDS:
+            cols.append(c)
+
+    seen, out = set(), []
+    for c in cols:
+        k = c.lower()
+        if k not in seen:
+            seen.add(k)
+            out.append(c)
+    return out
+
+
+def _parse_partition_elements(partition_expr_masked: str) -> List[Dict[str, Any]]:
+    """
+    Parse a (masked) PARTITION BY expression into ordered levels.
+    Each level: {level, kind, columns, raw}
+    """
+    expr = partition_expr_masked.strip()
+    elements = _split_top_level_commas(expr) if expr else []
+
+    levels: List[Dict[str, Any]] = []
+    for level_idx, elem in enumerate(elements if elements else [expr], 1):
+        e = (elem or "").strip()
+        kind = "UNKNOWN"
+        cols: List[str] = []
+
+        m = re.search(r"\bRANGE_N\s*\(\s*(.*?)\s+BETWEEN\b", e, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            kind = "RANGE_N"
+            cols = _columns_from_chunk(m.group(1))
+
+        elif re.search(r"\bCASE_N\s*\(", e, flags=re.IGNORECASE):
+            kind = "CASE_N"
+            m2 = re.search(r"\bCASE_N\s*\(\s*(.*)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m2.group(1) if m2 else e
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                if re.search(r"\bNO\s+RANGE\b", p, flags=re.IGNORECASE):
+                    continue
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        elif re.search(r"\bCOLUMNAR\s*\(", e, flags=re.IGNORECASE):
+            kind = "COLUMNAR"
+            m3 = re.search(r"\bCOLUMNAR\s*\(\s*(.*?)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m3.group(1) if m3 else ""
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        else:
+            cols = _columns_from_chunk(e)
+
+        levels.append(
+            {
+                "level": level_idx,
+                "kind": kind,
+                "columns": cols,
+                "raw": e,
+            }
+        )
+
+    # If expr was empty, return empty list
+    if expr == "":
+        return []
+    return levels
+
+
+def _partitioning_by_column(levels: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+    out: Dict[str, List[Dict[str, Any]]] = {}
+    for lvl in levels:
+        for c in lvl.get("columns", []):
+            out.setdefault(c, []).append({"level": lvl["level"], "kind": lvl["kind"]})
+    return out
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    # Find CREATE ... TABLE
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    # Find first '(' after TABLE keyword (should be the column list opener)
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    # escaped quote?
+                    if i + 1 < n and ddl_clean[i + 1] == "'":
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1 # consume closing "
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1 # position after the matching ')'
+
+        i += 1
+
+    # If we get here, parentheses didn't balance; fall back to 0 (whole string)
+    return 0
+
+
+
+import re
+from typing import List, Dict, Any
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so:
+    - keywords inside literals can't be detected
+    - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    if i + 1 < n and ddl_clean[i + 1] == "'": # escaped quote
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1
+
+        i += 1
+
+    # Fallback if unbalanced
+    return 0
+
+
+def analyze_teradata_ddl(ddl: str) -> Dict[str, Any]:
+    """
+    Analyse Teradata CREATE TABLE DDL and return:
+    {
+        'primary_index_columns': [...],
+        'partition_columns': [...],
+        'partitioning_levels': [...],
+        'partitioning_by_column': {...}
+    }
+
+    Critical behavior:
+    - Ignores keyword-like text inside single-quoted literals
+    - Searches PRIMARY INDEX / PARTITION BY only in the table-options tail
+      (after the column-definition block).
+    """
+    ddl_clean = _compress_whitespace(_strip_sql_comments(ddl))
+    ddl_masked = _mask_single_quoted_literals_same_len(ddl_clean)
+
+    # Compute tail start from CLEAN (robust scanner ignores strings/quoted identifiers)
+    tail_start = _find_create_table_columns_block_end(ddl_clean)
+
+    # Tail slices (use masked for searching, clean for extracting)
+    ddl_tail_clean = ddl_clean[tail_start:]
+    ddl_tail_masked = ddl_masked[tail_start:]
+    ddl_tail_masked_upper = ddl_tail_masked.upper()
+
+    # -------- Primary Index --------
+    primary_index_columns: List[str] = []
+
+    if "NO PRIMARY INDEX" in ddl_tail_masked_upper:
+        primary_index_columns = []
+    else:
+        m = re.search(r"\b(?:UNIQUE\s+)?PRIMARY\s+INDEX\b", ddl_tail_masked_upper)
+        if m:
+            idx_rel = ddl_tail_masked.find("(", m.end())
+            if idx_rel != -1:
+                inside, _ = _extract_parenthesized_list(ddl_tail_clean, idx_rel)
+                items = _split_top_level_commas(inside)
+                primary_index_columns = [_normalize_identifier(x) for x in items if x.strip()]
+
+    # -------- Partition By --------
+    partition_columns: List[str] = []
+    partitioning_levels: List[Dict[str, Any]] = []
+    partitioning_by_column: Dict[str, List[Dict[str, Any]]] = {}
+
+    # IMPORTANT: search ONLY in tail (prevents matching PARTITION BY inside DEFAULT literals)
+    m2 = re.search(r"\bPARTITION\s+BY\b", ddl_tail_masked_upper)
+    if m2:
+        after_masked = ddl_tail_masked[m2.end():].lstrip()
+        start_after_rel = m2.end() + (len(ddl_tail_masked[m2.end():]) - len(after_masked))
+
+        if after_masked.startswith("("):
+            inside_masked, end_idx_rel = _extract_parenthesized_list(after_masked, 0)
+            partition_expr_masked = inside_masked
+
+            raw_after_clean = ddl_tail_clean[start_after_rel : start_after_rel + end_idx_rel]
+            partition_expr_raw = raw_after_clean[1:-1].strip() if raw_after_clean.startswith("(") else raw_after_clean.strip()
+        else:
+            stop = re.search(
+                r"\b(?:PRIMARY\s+INDEX|UNIQUE\s+PRIMARY\s+INDEX|INDEX|UNIQUE|WITH|NO\s+FALLBACK|FALLBACK|"
+                r"JOURNAL|CHECKSUM|MERGEBLOCKRATIO|MAP|DEFAULT\s+MERGEBLOCKRATIO|DATABLOCKSIZE)\b",
+                after_masked,
+                flags=re.IGNORECASE,
+            )
+
+            partition_expr_masked = after_masked[: stop.start()].strip() if stop else after_masked.strip()
+            partition_expr_masked = partition_expr_masked.rstrip(";").strip()
+
+            after_clean = ddl_tail_clean[start_after_rel:].lstrip()
+            partition_expr_raw = after_clean[: stop.start()].strip() if stop else after_clean.strip()
+            partition_expr_raw = partition_expr_raw.rstrip(";").strip()
+
+        # Parse levels from masked expression (literals already neutralized)
+        partitioning_levels = _parse_partition_elements(partition_expr_masked)
+
+        # Overwrite raw with original (unmasked) top-level pieces when possible
+        raw_elements = _split_top_level_commas(partition_expr_raw) if partition_expr_raw else []
+        if raw_elements and len(raw_elements) == len(partitioning_levels):
+            for i in range(len(partitioning_levels)):
+                partitioning_levels[i]["raw"] = raw_elements[i].strip()
+
+        partitioning_by_column = _partitioning_by_column(partitioning_levels)
+
+    # Flat list of partition columns (dedup in first-seen order)
+    seen = set()
+    flat: List[str] = []
+    for lvl in partitioning_levels:
+        for c in lvl.get("columns", []):
+            k = c.lower()
+            if k not in seen:
+                seen.add(k)
+                flat.append(c)
+    partition_columns = flat
+
+    return {
+        "primary_index_columns": primary_index_columns,
+        "partition_columns": partition_columns,
+        "partitioning_levels": partitioning_levels,
+        "partitioning_by_column": partitioning_by_column,
+    }
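Since analyze_teradata_ddl is shown in full above, its output for a small DDL can be traced directly from the code. A minimal usage sketch with an invented table (tdfs4ds 0.2.5.5 assumed installed; the expected values below follow from hand-tracing the parsing logic, not from a recorded run):

    from tdfs4ds.lineage import analyze_teradata_ddl

    ddl = """
    CREATE TABLE mydb.sales_fact (
        sale_id INTEGER,
        sale_date DATE FORMAT 'YYYY-MM-DD',
        amount DECIMAL(12, 2)
    )
    PRIMARY INDEX (sale_id)
    PARTITION BY RANGE_N(sale_date BETWEEN DATE '2020-01-01'
                         AND DATE '2030-12-31' EACH INTERVAL '1' MONTH);
    """

    info = analyze_teradata_ddl(ddl)
    # info['primary_index_columns']   -> ['sale_id']
    # info['partition_columns']       -> ['sale_date']
    # info['partitioning_levels']     -> [{'level': 1, 'kind': 'RANGE_N',
    #                                      'columns': ['sale_date'], 'raw': ...}]
    # info['partitioning_by_column']  -> {'sale_date': [{'level': 1, 'kind': 'RANGE_N'}]}
    # Single-quoted literals are masked to spaces before keyword searches, so
    # the DATE/FORMAT literals cannot be mistaken for partition columns.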