tdfs4ds-0.2.5.4-py3-none-any.whl → tdfs4ds-0.2.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.5.4'
+__version__ = '0.2.5.5'
 import difflib
 import logging
 import json
@@ -231,7 +231,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     list_entity_id = [entity_id]
 
     # Character set handling / pass-through
-    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).splitlines()}
     var_temp2 = []
     for k, v in res.items():
         if 'UNICODE' in v:
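Besides consistency with the f-string fixes later in this diff, `.splitlines()` also drops the empty trailing chunk that `.split('\n')` produces on newline-terminated text, and that empty chunk would crash `x.split()[0]` with an IndexError. A minimal sketch with a made-up tdtypes-style string (the exact teradataml rendering is assumed, not quoted from its docs):

    text = "col1    VARCHAR(10)\ncol2    INTEGER\n"
    text.split('\n')   # ['col1    VARCHAR(10)', 'col2    INTEGER', ''] <- trailing '' breaks x.split()[0]
    text.splitlines()  # ['col1    VARCHAR(10)', 'col2    INTEGER']
    {x.split()[0]: ''.join(x.split()[1:]) for x in text.splitlines()}
    # {'col1': 'VARCHAR(10)', 'col2': 'INTEGER'}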
@@ -303,7 +303,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         tdml.execute_sql(query_create_volatile)
         logger_safe('info', 'results calculated and materialized in a volatile table')
     except Exception as e:
-        logger_safe('error', f"query execution failed : {str(e).split('\n')[0]}")
+        logger_safe('error', f"query execution failed : {str(e).splitlines()[0]}")
         raise
 
 
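This and the following hunks share one likely motivation beyond style: before Python 3.12 (PEP 701), a backslash inside an f-string expression is a SyntaxError, so an f-string containing `.split('\n')` cannot even compile on older interpreters. A minimal sketch of the corrected pattern:

    try:
        raise RuntimeError("first line\nsecond line")
    except Exception as e:
        # splitlines() keeps the f-string expression backslash-free
        print(f"query execution failed : {str(e).splitlines()[0]}")
    # query execution failed : first line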
@@ -334,7 +334,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         # else: no duplicates
         # logger_safe("info", "No duplicate found.")  # optional
     except Exception as e:
-        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).splitlines()[0])
         raise
 
     if getattr(tdfs4ds, "DEBUG_MODE", False):
@@ -783,7 +783,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     try:
         display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
     except Exception as e:
-        logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
+        logger_safe("warning", "display_table failed: %s", str(e).splitlines()[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
@@ -870,7 +870,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
 
     for q in queries:
         if getattr(tdfs4ds, "DEBUG_MODE", False):
-            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
+            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.splitlines()[0:3]))
         execute_query(q)
 
     elapsed_time = time.time() - start_time
@@ -881,7 +881,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
             formatted_elapsed_time, elapsed_time
         )
     except Exception as e:
-        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).splitlines()[0])
         raise
 
     return count_features.NB_ROWS.values[0]
@@ -1028,7 +1028,7 @@ def prepare_feature_ingestion_tdstone2(df, entity_id):
             tdml.execute_sql(query)
         except Exception as e:
             if tdfs4ds.DISPLAY_LOGS:
-                logger_safe('debug',str(e).split('\n')[0])
+                logger_safe('debug',str(e).splitlines()[0])
             tdml.execute_sql(f'DELETE {volatile_table_name}')
 
     # Optionally print the query if the display flag is set.
@@ -0,0 +1,21 @@
+from .lineage import (
+    analyze_sql_query
+)
+
+from .indexing import (
+    analyze_teradata_ddl,
+)
+
+from .network import (
+    build_teradata_dependency_graph,
+    plot_lineage_sankey,
+    show_plotly_robust
+)
+
+__all__ = [
+    "analyze_sql_query",
+    "analyze_teradata_ddl",
+    "build_teradata_dependency_graph",
+    "plot_lineage_sankey",
+    "show_plotly_robust"
+]
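A minimal usage sketch of the surface re-exported here. The diff does not show the subpackage's own path, so `tdfs4ds.<subpackage>` below is a placeholder; per the imports above, `analyze_teradata_ddl` lives in the subpackage's `indexing` module (listed next in this diff):

    # placeholder import path; substitute the real subpackage name from the wheel
    from tdfs4ds.<subpackage> import analyze_teradata_ddl

    info = analyze_teradata_ddl("CREATE TABLE db.t (a INTEGER, b DATE) PRIMARY INDEX (a);")
    info["primary_index_columns"]  # ['a']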
@@ -0,0 +1,501 @@
+import re
+from typing import List, Dict, Any
+
+
+def _strip_sql_comments(sql: str) -> str:
+    """
+    Remove Teradata-style comments:
+      - /* ... */ block comments
+      - -- ... end-of-line comments
+    BUT do not treat comment markers inside single-quoted literals or double-quoted identifiers as comments.
+    """
+    out = []
+    i = 0
+    n = len(sql)
+
+    in_squote = False  # '...'
+    in_dquote = False  # "..."
+
+    while i < n:
+        ch = sql[i]
+
+        # Toggle double-quoted identifiers
+        if not in_squote and ch == '"':
+            in_dquote = not in_dquote
+            out.append(ch)
+            i += 1
+            continue
+
+        # Handle single-quoted literals with doubled quotes '' escape
+        if not in_dquote and ch == "'":
+            if in_squote:
+                # if this is an escaped single quote inside a literal: ''
+                if i + 1 < n and sql[i + 1] == "'":
+                    out.append("''")
+                    i += 2
+                    continue
+                # end literal
+                in_squote = False
+                out.append(ch)
+                i += 1
+                continue
+            else:
+                in_squote = True
+                out.append(ch)
+                i += 1
+                continue
+
+        # If not inside quotes, detect comments
+        if not in_squote and not in_dquote:
+            # Line comment --
+            if ch == "-" and i + 1 < n and sql[i + 1] == "-":
+                # skip until newline (but keep newline if present)
+                i += 2
+                while i < n and sql[i] != "\n":
+                    i += 1
+                # keep the newline if any
+                if i < n and sql[i] == "\n":
+                    out.append("\n")
+                    i += 1
+                continue
+
+            # Block comment /* ... */
+            if ch == "/" and i + 1 < n and sql[i + 1] == "*":
+                i += 2
+                while i + 1 < n and not (sql[i] == "*" and sql[i + 1] == "/"):
+                    i += 1
+                i += 2 if i + 1 < n else 0
+                out.append(" ")
+                continue
+
+        # normal character
+        out.append(ch)
+        i += 1
+
+    return "".join(out)
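A quick sketch of the stripping behavior on an invented sample (per the code above: a line comment keeps its newline, a block comment collapses to a single space, and comment markers inside literals survive):

    sql = "SELECT '--not a comment' /* drop me */ AS c -- trailing\nFROM t"
    _strip_sql_comments(sql)
    # "SELECT '--not a comment'   AS c \nFROM t"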
+
+
+
+def _compress_whitespace(sql: str) -> str:
+    return re.sub(r"\s+", " ", sql).strip()
+
+
+def _extract_parenthesized_list(text: str, start_idx: int):
+    """
+    Given text and index pointing at an opening '(',
+    return (content_inside_parens, index_after_closing_paren).
+    """
+    if start_idx >= len(text) or text[start_idx] != "(":
+        raise ValueError("start_idx must point to '('")
+
+    depth = 1
+    i = start_idx + 1
+    content_chars = []
+
+    while i < len(text) and depth > 0:
+        ch = text[i]
+        if ch == "(":
+            depth += 1
+            content_chars.append(ch)
+        elif ch == ")":
+            depth -= 1
+            if depth > 0:
+                content_chars.append(ch)
+        else:
+            content_chars.append(ch)
+        i += 1
+
+    return "".join(content_chars).strip(), i
+
+
+def _split_top_level_commas(expr: str) -> List[str]:
+    """Split by commas that are not inside parentheses."""
+    parts, buf = [], []
+    depth = 0
+    for ch in expr:
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth = max(0, depth - 1)
+
+        if ch == "," and depth == 0:
+            part = "".join(buf).strip()
+            if part:
+                parts.append(part)
+            buf = []
+        else:
+            buf.append(ch)
+
+    tail = "".join(buf).strip()
+    if tail:
+        parts.append(tail)
+    return parts
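A small sketch of the top-level split on an invented expression; commas nested inside parentheses are left alone:

    _split_top_level_commas("a, RANGE_N(b BETWEEN 1 AND 10 EACH 1), c")
    # ['a', 'RANGE_N(b BETWEEN 1 AND 10 EACH 1)', 'c']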
+
+
+def _normalize_identifier(ident: str) -> str:
+    ident = ident.strip()
+    if "." in ident:
+        ident = ident.split(".")[-1].strip()
+    if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
+        ident = ident[1:-1]
+    return ident.strip()
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so that:
+      - keywords inside literals can't be detected
+      - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
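Because the replacement is space-for-space, offsets computed on the masked string remain valid on the clean string, which the tail-slicing in `analyze_teradata_ddl` later in this file depends on. A quick check on an invented literal:

    s = "DEFAULT 'It''s -- not a comment' , x INT"
    masked = _mask_single_quoted_literals_same_len(s)
    len(masked) == len(s)  # True
    masked                 # "DEFAULT " + " " * 24 + " , x INT"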
+
+
+# --- Partition parsing helpers (expects literals already masked in the input) ---
+
+_PARTITION_STOPWORDS = {
+    "range_n", "case_n", "columnar", "case",
+    "between", "and", "or", "not", "is", "null", "no", "range",
+    "in", "like", "exists", "distinct",
+    "each", "interval", "day", "month", "year", "from", "to", "every",
+    "cast", "extract", "coalesce", "nullif", "trim", "substr", "substring",
+    "current_date", "current_timestamp",
+    "date", "timestamp", "integer", "smallint", "bigint", "byteint", "decimal", "float",
+    "when", "then", "else", "end",
+    "format", "zone", "as",
+}
+
+
+def _find_identifiers(s: str) -> List[str]:
+    # Collapse quoted qualifiers: "db"."table".col -> col
+    s = re.sub(r'"[^"]+"\s*\.\s*', '', s)
+
+    toks = re.findall(
+        r'"[^"]+"|[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*',
+        s
+    )
+    out = [_normalize_identifier(t) for t in toks]
+    return [x for x in out if x]
+
+
+
+def _columns_from_chunk(masked_chunk: str) -> List[str]:
+    cols = []
+    for c in _find_identifiers(masked_chunk):
+        if c.lower() not in _PARTITION_STOPWORDS:
+            cols.append(c)
+
+    seen, out = set(), []
+    for c in cols:
+        k = c.lower()
+        if k not in seen:
+            seen.add(k)
+            out.append(c)
+    return out
+
+
+def _parse_partition_elements(partition_expr_masked: str) -> List[Dict[str, Any]]:
+    """
+    Parse a (masked) PARTITION BY expression into ordered levels.
+    Each level: {level, kind, columns, raw}
+    """
+    expr = partition_expr_masked.strip()
+    elements = _split_top_level_commas(expr) if expr else []
+
+    levels: List[Dict[str, Any]] = []
+    for level_idx, elem in enumerate(elements if elements else [expr], 1):
+        e = (elem or "").strip()
+        kind = "UNKNOWN"
+        cols: List[str] = []
+
+        m = re.search(r"\bRANGE_N\s*\(\s*(.*?)\s+BETWEEN\b", e, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            kind = "RANGE_N"
+            cols = _columns_from_chunk(m.group(1))
+
+        elif re.search(r"\bCASE_N\s*\(", e, flags=re.IGNORECASE):
+            kind = "CASE_N"
+            m2 = re.search(r"\bCASE_N\s*\(\s*(.*)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m2.group(1) if m2 else e
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                if re.search(r"\bNO\s+RANGE\b", p, flags=re.IGNORECASE):
+                    continue
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        elif re.search(r"\bCOLUMNAR\s*\(", e, flags=re.IGNORECASE):
+            kind = "COLUMNAR"
+            m3 = re.search(r"\bCOLUMNAR\s*\(\s*(.*?)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m3.group(1) if m3 else ""
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        else:
+            cols = _columns_from_chunk(e)
+
+        levels.append(
+            {
+                "level": level_idx,
+                "kind": kind,
+                "columns": cols,
+                "raw": e,
+            }
+        )
+
+    # If expr was empty, return empty list
+    if expr == "":
+        return []
+    return levels
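A sketch of level parsing on an invented expression. The input must already be masked, otherwise the DATE literals would contribute spurious tokens:

    expr = _mask_single_quoted_literals_same_len(
        "RANGE_N(sale_date BETWEEN DATE '2020-01-01' AND DATE '2024-12-31' EACH INTERVAL '1' MONTH), region_id"
    )
    for lvl in _parse_partition_elements(expr):
        print(lvl["level"], lvl["kind"], lvl["columns"])
    # 1 RANGE_N ['sale_date']
    # 2 UNKNOWN ['region_id']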
+
+
+def _partitioning_by_column(levels: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+    out: Dict[str, List[Dict[str, Any]]] = {}
+    for lvl in levels:
+        for c in lvl.get("columns", []):
+            out.setdefault(c, []).append({"level": lvl["level"], "kind": lvl["kind"]})
+    return out
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    # Find CREATE ... TABLE
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    # Find first '(' after TABLE keyword (should be the column list opener)
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    # escaped quote?
+                    if i + 1 < n and ddl_clean[i + 1] == "'":
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1  # consume closing "
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1  # position after the matching ')'
+
+        i += 1
+
+    # If we get here, parentheses didn't balance; fall back to 0 (whole string)
+    return 0
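A quick check of the scanner on an invented, already-compressed DDL; everything after the returned index is the table-options tail:

    ddl = "CREATE TABLE db.t (a INTEGER, b VARCHAR(10)) PRIMARY INDEX (a)"
    ddl[_find_create_table_columns_block_end(ddl):]
    # " PRIMARY INDEX (a)"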
+
+
+
+import re
+from typing import List, Dict, Any
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so:
+      - keywords inside literals can't be detected
+      - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    if i + 1 < n and ddl_clean[i + 1] == "'":  # escaped quote
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1
+
+        i += 1
+
+    # Fallback if unbalanced
+    return 0
+
+
+def analyze_teradata_ddl(ddl: str) -> Dict[str, Any]:
+    """
+    Analyse Teradata CREATE TABLE DDL and return:
+        {
+            'primary_index_columns': [...],
+            'partition_columns': [...],
+            'partitioning_levels': [...],
+            'partitioning_by_column': {...}
+        }
+
+    Critical behavior:
+      - Ignores keyword-like text inside single-quoted literals
+      - Searches PRIMARY INDEX / PARTITION BY only in the table-options tail
+        (after the column-definition block).
+    """
+    ddl_clean = _compress_whitespace(_strip_sql_comments(ddl))
+    ddl_masked = _mask_single_quoted_literals_same_len(ddl_clean)
+
+    # Compute tail start from CLEAN (robust scanner ignores strings/quoted identifiers)
+    tail_start = _find_create_table_columns_block_end(ddl_clean)
+
+    # Tail slices (use masked for searching, clean for extracting)
+    ddl_tail_clean = ddl_clean[tail_start:]
+    ddl_tail_masked = ddl_masked[tail_start:]
+    ddl_tail_masked_upper = ddl_tail_masked.upper()
+
+    # -------- Primary Index --------
+    primary_index_columns: List[str] = []
+
+    if "NO PRIMARY INDEX" in ddl_tail_masked_upper:
+        primary_index_columns = []
+    else:
+        m = re.search(r"\b(?:UNIQUE\s+)?PRIMARY\s+INDEX\b", ddl_tail_masked_upper)
+        if m:
+            idx_rel = ddl_tail_masked.find("(", m.end())
+            if idx_rel != -1:
+                inside, _ = _extract_parenthesized_list(ddl_tail_clean, idx_rel)
+                items = _split_top_level_commas(inside)
+                primary_index_columns = [_normalize_identifier(x) for x in items if x.strip()]
+
+    # -------- Partition By --------
+    partition_columns: List[str] = []
+    partitioning_levels: List[Dict[str, Any]] = []
+    partitioning_by_column: Dict[str, List[Dict[str, Any]]] = {}
+
+    # IMPORTANT: search ONLY in tail (prevents matching PARTITION BY inside DEFAULT literals)
+    m2 = re.search(r"\bPARTITION\s+BY\b", ddl_tail_masked_upper)
+    if m2:
+        after_masked = ddl_tail_masked[m2.end():].lstrip()
+        start_after_rel = m2.end() + (len(ddl_tail_masked[m2.end():]) - len(after_masked))
+
+        if after_masked.startswith("("):
+            inside_masked, end_idx_rel = _extract_parenthesized_list(after_masked, 0)
+            partition_expr_masked = inside_masked
+
+            raw_after_clean = ddl_tail_clean[start_after_rel : start_after_rel + end_idx_rel]
+            partition_expr_raw = raw_after_clean[1:-1].strip() if raw_after_clean.startswith("(") else raw_after_clean.strip()
+        else:
+            stop = re.search(
+                r"\b(?:PRIMARY\s+INDEX|UNIQUE\s+PRIMARY\s+INDEX|INDEX|UNIQUE|WITH|NO\s+FALLBACK|FALLBACK|"
+                r"JOURNAL|CHECKSUM|MERGEBLOCKRATIO|MAP|DEFAULT\s+MERGEBLOCKRATIO|DATABLOCKSIZE)\b",
+                after_masked,
+                flags=re.IGNORECASE,
+            )
+
+            partition_expr_masked = after_masked[: stop.start()].strip() if stop else after_masked.strip()
+            partition_expr_masked = partition_expr_masked.rstrip(";").strip()
+
+            after_clean = ddl_tail_clean[start_after_rel:].lstrip()
+            partition_expr_raw = after_clean[: stop.start()].strip() if stop else after_clean.strip()
+            partition_expr_raw = partition_expr_raw.rstrip(";").strip()
+
+        # Parse levels from masked expression (literals already neutralized)
+        partitioning_levels = _parse_partition_elements(partition_expr_masked)
+
+        # Overwrite raw with original (unmasked) top-level pieces when possible
+        raw_elements = _split_top_level_commas(partition_expr_raw) if partition_expr_raw else []
+        if raw_elements and len(raw_elements) == len(partitioning_levels):
+            for i in range(len(partitioning_levels)):
+                partitioning_levels[i]["raw"] = raw_elements[i].strip()
+
+        partitioning_by_column = _partitioning_by_column(partitioning_levels)
+
+    # Flat list of partition columns (dedup in first-seen order)
+    seen = set()
+    flat: List[str] = []
+    for lvl in partitioning_levels:
+        for c in lvl.get("columns", []):
+            k = c.lower()
+            if k not in seen:
+                seen.add(k)
+                flat.append(c)
+    partition_columns = flat
+
+    return {
+        "primary_index_columns": primary_index_columns,
+        "partition_columns": partition_columns,
+        "partitioning_levels": partitioning_levels,
+        "partitioning_by_column": partitioning_by_column,
+    }
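An end-to-end sketch on an invented CREATE TABLE statement (not taken from the package's own tests):

    ddl = '''
    CREATE MULTISET TABLE db.sales (
        sale_id BIGINT,
        sale_date DATE FORMAT 'YYYY-MM-DD',
        region_id INTEGER
    )
    PRIMARY INDEX (sale_id)
    PARTITION BY RANGE_N(sale_date BETWEEN DATE '2020-01-01'
        AND DATE '2024-12-31' EACH INTERVAL '1' MONTH);
    '''
    info = analyze_teradata_ddl(ddl)
    info["primary_index_columns"]           # ['sale_id']
    info["partition_columns"]               # ['sale_date']
    info["partitioning_levels"][0]["kind"]  # 'RANGE_N'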