tdfs4ds 0.2.5.3__py3-none-any.whl → 0.2.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,501 @@
1
+ import re
2
+ from typing import List, Dict, Any
3
+
4
+
5
+ def _strip_sql_comments(sql: str) -> str:
6
+ """
7
+ Remove Teradata-style comments:
8
+ - /* ... */ block comments
9
+ - -- ... end-of-line comments
10
+ BUT do not treat comment markers inside single-quoted literals or double-quoted identifiers as comments.
11
+ """
12
+ out = []
13
+ i = 0
14
+ n = len(sql)
15
+
16
+ in_squote = False # '...'
17
+ in_dquote = False # "..."
18
+
19
+ while i < n:
20
+ ch = sql[i]
21
+
22
+ # Toggle double-quoted identifiers
23
+ if not in_squote and ch == '"':
24
+ in_dquote = not in_dquote
25
+ out.append(ch)
26
+ i += 1
27
+ continue
28
+
29
+ # Handle single-quoted literals with doubled quotes '' escape
30
+ if not in_dquote and ch == "'":
31
+ if in_squote:
32
+ # if this is an escaped single quote inside a literal: ''
33
+ if i + 1 < n and sql[i + 1] == "'":
34
+ out.append("''")
35
+ i += 2
36
+ continue
37
+ # end literal
38
+ in_squote = False
39
+ out.append(ch)
40
+ i += 1
41
+ continue
42
+ else:
43
+ in_squote = True
44
+ out.append(ch)
45
+ i += 1
46
+ continue
47
+
48
+ # If not inside quotes, detect comments
49
+ if not in_squote and not in_dquote:
50
+ # Line comment --
51
+ if ch == "-" and i + 1 < n and sql[i + 1] == "-":
52
+ # skip until newline (but keep newline if present)
53
+ i += 2
54
+ while i < n and sql[i] != "\n":
55
+ i += 1
56
+ # keep the newline if any
57
+ if i < n and sql[i] == "\n":
58
+ out.append("\n")
59
+ i += 1
60
+ continue
61
+
62
+ # Block comment /* ... */
63
+ if ch == "/" and i + 1 < n and sql[i + 1] == "*":
64
+ i += 2
65
+ while i + 1 < n and not (sql[i] == "*" and sql[i + 1] == "/"):
66
+ i += 1
67
+ i += 2 if i + 1 < n else 0
68
+ out.append(" ")
69
+ continue
70
+
71
+ # normal character
72
+ out.append(ch)
73
+ i += 1
74
+
75
+ return "".join(out)
76
+
77
+
78
+
79
+ def _compress_whitespace(sql: str) -> str:
80
+ return re.sub(r"\s+", " ", sql).strip()
81
+
82
+
83
+ def _extract_parenthesized_list(text: str, start_idx: int):
84
+ """
85
+ Given text and index pointing at an opening '(',
86
+ return (content_inside_parens, index_after_closing_paren).
87
+ """
88
+ if start_idx >= len(text) or text[start_idx] != "(":
89
+ raise ValueError("start_idx must point to '('")
90
+
91
+ depth = 1
92
+ i = start_idx + 1
93
+ content_chars = []
94
+
95
+ while i < len(text) and depth > 0:
96
+ ch = text[i]
97
+ if ch == "(":
98
+ depth += 1
99
+ content_chars.append(ch)
100
+ elif ch == ")":
101
+ depth -= 1
102
+ if depth > 0:
103
+ content_chars.append(ch)
104
+ else:
105
+ content_chars.append(ch)
106
+ i += 1
107
+
108
+ return "".join(content_chars).strip(), i
109
+
110
+
111
+ def _split_top_level_commas(expr: str) -> List[str]:
112
+ """Split by commas that are not inside parentheses."""
113
+ parts, buf = [], []
114
+ depth = 0
115
+ for ch in expr:
116
+ if ch == "(":
117
+ depth += 1
118
+ elif ch == ")":
119
+ depth = max(0, depth - 1)
120
+
121
+ if ch == "," and depth == 0:
122
+ part = "".join(buf).strip()
123
+ if part:
124
+ parts.append(part)
125
+ buf = []
126
+ else:
127
+ buf.append(ch)
128
+
129
+ tail = "".join(buf).strip()
130
+ if tail:
131
+ parts.append(tail)
132
+ return parts
133
+
134
+
135
+ def _normalize_identifier(ident: str) -> str:
136
+ ident = ident.strip()
137
+ if "." in ident:
138
+ ident = ident.split(".")[-1].strip()
139
+ if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
140
+ ident = ident[1:-1]
141
+ return ident.strip()
142
+
143
+
144
+ def _mask_single_quoted_literals_same_len(sql: str) -> str:
145
+ """
146
+ Replace each single-quoted literal with spaces of the same length so that:
147
+ - keywords inside literals can't be detected
148
+ - string length stays identical (indexes still align)
149
+ Handles escaped quotes like 'It''s'.
150
+ """
151
+ return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
152
+
153
+
154
+ # --- Partition parsing helpers (expects literals already masked in the input) ---
155
+
156
# Lower-case tokens that may appear inside a PARTITION BY expression but are
# never column names: partitioning functions (range_n/case_n/columnar), SQL
# operators and keywords, common scalar functions, type names, and
# format/clause keywords.  _columns_from_chunk tests candidate identifiers
# against this set after lower-casing them.
_PARTITION_STOPWORDS = {
    "range_n", "case_n", "columnar","case",
    "between", "and", "or", "not", "is", "null", "no", "range",
    "in", "like", "exists", "distinct",
    "each", "interval", "day", "month", "year", "from", "to", "every",
    "cast", "extract", "coalesce", "nullif", "trim", "substr", "substring",
    "current_date", "current_timestamp",
    "date", "timestamp", "integer", "smallint", "bigint", "byteint", "decimal", "float",
    "when", "then", "else", "end",
    "format", "zone", "as",
}
167
+
168
+
169
def _find_identifiers(s: str) -> List[str]:
    """
    Extract candidate identifiers from *s*: quoted names ("My Col") and
    plain/dotted names (db.tbl.col), each normalized to its bare form.
    Empty results of normalization are dropped.
    """
    # Collapse quoted qualifiers: "db"."table".col -> col
    stripped = re.sub(r'"[^"]+"\s*\.\s*', '', s)

    pattern = r'"[^"]+"|[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*'
    normalized = (_normalize_identifier(tok) for tok in re.findall(pattern, stripped))
    return [ident for ident in normalized if ident]
179
+
180
+
181
+
182
def _columns_from_chunk(masked_chunk: str) -> List[str]:
    """
    Return the probable column names in *masked_chunk*: identifiers that are
    not partition-expression stopwords, deduplicated case-insensitively in
    first-seen order.
    """
    result: List[str] = []
    seen = set()
    for ident in _find_identifiers(masked_chunk):
        key = ident.lower()
        # Skip SQL keywords/functions and case-insensitive repeats in one pass.
        if key in _PARTITION_STOPWORDS or key in seen:
            continue
        seen.add(key)
        result.append(ident)
    return result
195
+
196
+
197
def _dedup_case_insensitive(names: List[str]) -> List[str]:
    """Drop case-insensitive duplicates from *names*, first occurrence wins."""
    seen = set()
    unique: List[str] = []
    for name in names:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            unique.append(name)
    return unique


def _parse_partition_elements(partition_expr_masked: str) -> List[Dict[str, Any]]:
    """
    Parse a (masked) PARTITION BY expression into ordered levels.

    Each level is a dict {level, kind, columns, raw} where kind is one of
    RANGE_N / CASE_N / COLUMNAR / UNKNOWN.  Returns [] for an empty
    expression.

    Cleanup vs. the previous version: the old code built a throwaway dummy
    level for empty input and discarded it at the very end, and it repeated
    the dedup loop verbatim in two branches; both are factored out now.
    """
    expr = partition_expr_masked.strip()
    if not expr:
        return []

    levels: List[Dict[str, Any]] = []
    for level_idx, elem in enumerate(_split_top_level_commas(expr), 1):
        e = elem.strip()
        kind = "UNKNOWN"
        cols: List[str] = []

        # RANGE_N(<test expr> BETWEEN ...): columns come from the test expr.
        m = re.search(r"\bRANGE_N\s*\(\s*(.*?)\s+BETWEEN\b", e, flags=re.IGNORECASE | re.DOTALL)
        if m:
            kind = "RANGE_N"
            cols = _columns_from_chunk(m.group(1))

        elif re.search(r"\bCASE_N\s*\(", e, flags=re.IGNORECASE):
            kind = "CASE_N"
            m2 = re.search(r"\bCASE_N\s*\(\s*(.*)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
            inner = m2.group(1) if m2 else e
            tmp: List[str] = []
            for p in _split_top_level_commas(inner):
                # "NO RANGE" arms carry no column reference.
                if re.search(r"\bNO\s+RANGE\b", p, flags=re.IGNORECASE):
                    continue
                tmp.extend(_columns_from_chunk(p))
            cols = _dedup_case_insensitive(tmp)

        elif re.search(r"\bCOLUMNAR\s*\(", e, flags=re.IGNORECASE):
            kind = "COLUMNAR"
            m3 = re.search(r"\bCOLUMNAR\s*\(\s*(.*?)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
            inner = m3.group(1) if m3 else ""
            tmp = []
            for p in _split_top_level_commas(inner):
                tmp.extend(_columns_from_chunk(p))
            cols = _dedup_case_insensitive(tmp)

        else:
            # Plain column (or unrecognized) partitioning expression.
            cols = _columns_from_chunk(e)

        levels.append(
            {
                "level": level_idx,
                "kind": kind,
                "columns": cols,
                "raw": e,
            }
        )

    return levels
266
+
267
+
268
+ def _partitioning_by_column(levels: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
269
+ out: Dict[str, List[Dict[str, Any]]] = {}
270
+ for lvl in levels:
271
+ for c in lvl.get("columns", []):
272
+ out.setdefault(c, []).append({"level": lvl["level"], "kind": lvl["kind"]})
273
+ return out
274
+
275
+
276
def _find_create_table_columns_block_end(ddl_clean: str) -> int:
    """
    Find index just after the closing ')' of the CREATE TABLE column-definition block.
    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
    Assumes ddl_clean is comment-stripped + whitespace-compressed.

    Returns 0 when no '(' follows the CREATE ... TABLE keywords or when the
    parentheses never balance; callers then treat the whole string as the
    search space.
    """
    # Find CREATE ... TABLE
    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
    start_search = m.end() if m else 0

    # Find first '(' after TABLE keyword (should be the column list opener)
    # NOTE(review): if options appear between TABLE and the column list that
    # themselves contain '(' this would latch onto the wrong paren — assumes
    # the column list opens the first group. TODO confirm against real DDL.
    open_idx = ddl_clean.find("(", start_search)
    if open_idx == -1:
        return 0

    depth = 0  # first iteration sees the '(' at open_idx and bumps this to 1
    i = open_idx
    n = len(ddl_clean)

    while i < n:
        ch = ddl_clean[i]

        # Skip single-quoted literals: '...''...'
        if ch == "'":
            i += 1
            while i < n:
                if ddl_clean[i] == "'":
                    # escaped quote?
                    if i + 1 < n and ddl_clean[i + 1] == "'":
                        i += 2
                        continue
                    i += 1
                    break
                i += 1
            continue

        # Skip double-quoted identifiers: "My Col"
        if ch == '"':
            i += 1
            while i < n and ddl_clean[i] != '"':
                i += 1
            i += 1  # consume closing "
            continue

        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                return i + 1  # position after the matching ')'

        i += 1

    # If we get here, parentheses didn't balance; fall back to 0 (whole string)
    return 0
331
+
332
+
333
+
334
+ import re
335
+ from typing import List, Dict, Any
336
+
337
+
338
+ def _mask_single_quoted_literals_same_len(sql: str) -> str:
339
+ """
340
+ Replace each single-quoted literal with spaces of the same length so:
341
+ - keywords inside literals can't be detected
342
+ - string length stays identical (indexes still align)
343
+ Handles escaped quotes like 'It''s'.
344
+ """
345
+ return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
346
+
347
+
348
+ def _find_create_table_columns_block_end(ddl_clean: str) -> int:
349
+ """
350
+ Find index just after the closing ')' of the CREATE TABLE column-definition block.
351
+ Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
352
+ Assumes ddl_clean is comment-stripped + whitespace-compressed.
353
+ """
354
+ m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
355
+ start_search = m.end() if m else 0
356
+
357
+ open_idx = ddl_clean.find("(", start_search)
358
+ if open_idx == -1:
359
+ return 0
360
+
361
+ depth = 0
362
+ i = open_idx
363
+ n = len(ddl_clean)
364
+
365
+ while i < n:
366
+ ch = ddl_clean[i]
367
+
368
+ # Skip single-quoted literals: '...''...'
369
+ if ch == "'":
370
+ i += 1
371
+ while i < n:
372
+ if ddl_clean[i] == "'":
373
+ if i + 1 < n and ddl_clean[i + 1] == "'": # escaped quote
374
+ i += 2
375
+ continue
376
+ i += 1
377
+ break
378
+ i += 1
379
+ continue
380
+
381
+ # Skip double-quoted identifiers: "My Col"
382
+ if ch == '"':
383
+ i += 1
384
+ while i < n and ddl_clean[i] != '"':
385
+ i += 1
386
+ i += 1
387
+ continue
388
+
389
+ if ch == "(":
390
+ depth += 1
391
+ elif ch == ")":
392
+ depth -= 1
393
+ if depth == 0:
394
+ return i + 1
395
+
396
+ i += 1
397
+
398
+ # Fallback if unbalanced
399
+ return 0
400
+
401
+
402
def analyze_teradata_ddl(ddl: str) -> Dict[str, Any]:
    """
    Analyse Teradata CREATE TABLE DDL and return:
        {
          'primary_index_columns': [...],
          'partition_columns': [...],
          'partitioning_levels': [...],
          'partitioning_by_column': {...}
        }

    Critical behavior:
      - Ignores keyword-like text inside single-quoted literals
      - Searches PRIMARY INDEX / PARTITION BY only in the table-options tail
        (after the column-definition block).

    Implementation note: the masking step preserves string length, so the
    same integer offsets are valid in both ddl_clean and ddl_masked; the
    code searches in the masked text but extracts from the clean text.
    """
    ddl_clean = _compress_whitespace(_strip_sql_comments(ddl))
    ddl_masked = _mask_single_quoted_literals_same_len(ddl_clean)

    # Compute tail start from CLEAN (robust scanner ignores strings/quoted identifiers)
    tail_start = _find_create_table_columns_block_end(ddl_clean)

    # Tail slices (use masked for searching, clean for extracting)
    ddl_tail_clean = ddl_clean[tail_start:]
    ddl_tail_masked = ddl_masked[tail_start:]
    ddl_tail_masked_upper = ddl_tail_masked.upper()

    # -------- Primary Index --------
    primary_index_columns: List[str] = []

    if "NO PRIMARY INDEX" in ddl_tail_masked_upper:
        primary_index_columns = []
    else:
        m = re.search(r"\b(?:UNIQUE\s+)?PRIMARY\s+INDEX\b", ddl_tail_masked_upper)
        if m:
            # NOTE(review): this takes the first '(' anywhere after the
            # PRIMARY INDEX keywords — assumes the column list follows
            # (possibly after an index name). TODO confirm for index-less tails.
            idx_rel = ddl_tail_masked.find("(", m.end())
            if idx_rel != -1:
                inside, _ = _extract_parenthesized_list(ddl_tail_clean, idx_rel)
                items = _split_top_level_commas(inside)
                primary_index_columns = [_normalize_identifier(x) for x in items if x.strip()]

    # -------- Partition By --------
    partition_columns: List[str] = []
    partitioning_levels: List[Dict[str, Any]] = []
    partitioning_by_column: Dict[str, List[Dict[str, Any]]] = {}

    # IMPORTANT: search ONLY in tail (prevents matching PARTITION BY inside DEFAULT literals)
    m2 = re.search(r"\bPARTITION\s+BY\b", ddl_tail_masked_upper)
    if m2:
        # start_after_rel = offset of the first non-space char after "PARTITION BY",
        # computed from how much lstrip() removed (masked/clean offsets align).
        after_masked = ddl_tail_masked[m2.end():].lstrip()
        start_after_rel = m2.end() + (len(ddl_tail_masked[m2.end():]) - len(after_masked))

        if after_masked.startswith("("):
            # Parenthesized form: PARTITION BY ( ... )
            inside_masked, end_idx_rel = _extract_parenthesized_list(after_masked, 0)
            partition_expr_masked = inside_masked

            # Same span in the clean text, parens stripped, for raw output.
            raw_after_clean = ddl_tail_clean[start_after_rel : start_after_rel + end_idx_rel]
            partition_expr_raw = raw_after_clean[1:-1].strip() if raw_after_clean.startswith("(") else raw_after_clean.strip()
        else:
            # Bare form: the expression runs until the next table-option keyword.
            stop = re.search(
                r"\b(?:PRIMARY\s+INDEX|UNIQUE\s+PRIMARY\s+INDEX|INDEX|UNIQUE|WITH|NO\s+FALLBACK|FALLBACK|"
                r"JOURNAL|CHECKSUM|MERGEBLOCKRATIO|MAP|DEFAULT\s+MERGEBLOCKRATIO|DATABLOCKSIZE)\b",
                after_masked,
                flags=re.IGNORECASE,
            )

            partition_expr_masked = after_masked[: stop.start()].strip() if stop else after_masked.strip()
            partition_expr_masked = partition_expr_masked.rstrip(";").strip()

            # NOTE(review): stop.start() is an offset into after_masked; applying
            # it to a separately lstrip()ed clean slice assumes both strips
            # removed the same amount — true unless a masked literal sits
            # immediately after PARTITION BY. TODO confirm.
            after_clean = ddl_tail_clean[start_after_rel:].lstrip()
            partition_expr_raw = after_clean[: stop.start()].strip() if stop else after_clean.strip()
            partition_expr_raw = partition_expr_raw.rstrip(";").strip()

        # Parse levels from masked expression (literals already neutralized)
        partitioning_levels = _parse_partition_elements(partition_expr_masked)

        # Overwrite raw with original (unmasked) top-level pieces when possible
        raw_elements = _split_top_level_commas(partition_expr_raw) if partition_expr_raw else []
        if raw_elements and len(raw_elements) == len(partitioning_levels):
            for i in range(len(partitioning_levels)):
                partitioning_levels[i]["raw"] = raw_elements[i].strip()

        partitioning_by_column = _partitioning_by_column(partitioning_levels)

    # Flat list of partition columns (dedup in first-seen order)
    seen = set()
    flat: List[str] = []
    for lvl in partitioning_levels:
        for c in lvl.get("columns", []):
            k = c.lower()
            if k not in seen:
                seen.add(k)
                flat.append(c)
    partition_columns = flat

    return {
        "primary_index_columns": primary_index_columns,
        "partition_columns": partition_columns,
        "partitioning_levels": partitioning_levels,
        "partitioning_by_column": partitioning_by_column,
    }