tdfs4ds-0.2.5.4-py3-none-any.whl → tdfs4ds-0.2.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.5.4'
+__version__ = '0.2.5.5'
 import difflib
 import logging
 import json
@@ -231,7 +231,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     list_entity_id = [entity_id]
 
     # Character set handling / pass-through
-    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).splitlines()}
     var_temp2 = []
     for k, v in res.items():
         if 'UNICODE' in v:
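Besides consistency with the f-string fixes later in this diff, `.splitlines()` also drops the empty trailing chunk that `.split('\n')` produces on newline-terminated text, and that empty chunk would crash `x.split()[0]` with an IndexError. A minimal sketch with a made-up tdtypes-style string (the exact teradataml rendering is assumed, not quoted from its docs):

    text = "col1    VARCHAR(10)\ncol2    INTEGER\n"
    text.split('\n')   # ['col1    VARCHAR(10)', 'col2    INTEGER', ''] <- trailing '' breaks x.split()[0]
    text.splitlines()  # ['col1    VARCHAR(10)', 'col2    INTEGER']
    {x.split()[0]: ''.join(x.split()[1:]) for x in text.splitlines()}
    # {'col1': 'VARCHAR(10)', 'col2': 'INTEGER'}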
@@ -303,7 +303,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         tdml.execute_sql(query_create_volatile)
         logger_safe('info', 'results calculated and materialized in a volatile table')
     except Exception as e:
-        logger_safe('error', f"query execution failed : {str(e).split('\n')[0]}")
+        logger_safe('error', f"query execution failed : {str(e).splitlines()[0]}")
         raise
 
 
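This and the following hunks share one likely motivation beyond style: before Python 3.12 (PEP 701), a backslash inside an f-string expression is a SyntaxError, so an f-string containing `.split('\n')` cannot even compile on older interpreters. A minimal sketch of the corrected pattern:

    try:
        raise RuntimeError("first line\nsecond line")
    except Exception as e:
        # splitlines() keeps the f-string expression backslash-free
        print(f"query execution failed : {str(e).splitlines()[0]}")
    # query execution failed : first line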
@@ -334,7 +334,7 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
         # else: no duplicates
         # logger_safe("info", "No duplicate found.")  # optional
     except Exception as e:
-        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).splitlines()[0])
         raise
 
     if getattr(tdfs4ds, "DEBUG_MODE", False):
@@ -783,7 +783,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     try:
         display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
     except Exception as e:
-        logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
+        logger_safe("warning", "display_table failed: %s", str(e).splitlines()[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
@@ -870,7 +870,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
 
     for q in queries:
         if getattr(tdfs4ds, "DEBUG_MODE", False):
-            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
+            logger_safe("debug", "Executing merge (head): %s", "\n".join(q.splitlines()[0:3]))
         execute_query(q)
 
     elapsed_time = time.time() - start_time
@@ -881,7 +881,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
             formatted_elapsed_time, elapsed_time
         )
     except Exception as e:
-        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).splitlines()[0])
         raise
 
     return count_features.NB_ROWS.values[0]
@@ -1028,7 +1028,7 @@ def prepare_feature_ingestion_tdstone2(df, entity_id):
             tdml.execute_sql(query)
         except Exception as e:
             if tdfs4ds.DISPLAY_LOGS:
-                logger_safe('debug',str(e).split('\n')[0])
+                logger_safe('debug',str(e).splitlines()[0])
             tdml.execute_sql(f'DELETE {volatile_table_name}')
 
     # Optionally print the query if the display flag is set.
@@ -0,0 +1,21 @@
+from .lineage import (
+    analyze_sql_query
+)
+
+from .indexing import (
+    analyze_teradata_ddl,
+)
+
+from .network import (
+    build_teradata_dependency_graph,
+    plot_lineage_sankey,
+    show_plotly_robust
+)
+
+__all__ = [
+    "analyze_sql_query",
+    "analyze_teradata_ddl",
+    "build_teradata_dependency_graph",
+    "plot_lineage_sankey",
+    "show_plotly_robust"
+]
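A minimal usage sketch of the surface re-exported here. The diff does not show the subpackage's own path, so `tdfs4ds.<subpackage>` below is a placeholder; per the imports above, `analyze_teradata_ddl` lives in the subpackage's `indexing` module (listed next in this diff):

    # placeholder import path; substitute the real subpackage name from the wheel
    from tdfs4ds.<subpackage> import analyze_teradata_ddl

    info = analyze_teradata_ddl("CREATE TABLE db.t (a INTEGER, b DATE) PRIMARY INDEX (a);")
    info["primary_index_columns"]  # ['a']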
@@ -0,0 +1,501 @@
+import re
+from typing import List, Dict, Any
+
+
+def _strip_sql_comments(sql: str) -> str:
+    """
+    Remove Teradata-style comments:
+      - /* ... */ block comments
+      - -- ... end-of-line comments
+    BUT do not treat comment markers inside single-quoted literals or double-quoted identifiers as comments.
+    """
+    out = []
+    i = 0
+    n = len(sql)
+
+    in_squote = False  # '...'
+    in_dquote = False  # "..."
+
+    while i < n:
+        ch = sql[i]
+
+        # Toggle double-quoted identifiers
+        if not in_squote and ch == '"':
+            in_dquote = not in_dquote
+            out.append(ch)
+            i += 1
+            continue
+
+        # Handle single-quoted literals with doubled quotes '' escape
+        if not in_dquote and ch == "'":
+            if in_squote:
+                # if this is an escaped single quote inside a literal: ''
+                if i + 1 < n and sql[i + 1] == "'":
+                    out.append("''")
+                    i += 2
+                    continue
+                # end literal
+                in_squote = False
+                out.append(ch)
+                i += 1
+                continue
+            else:
+                in_squote = True
+                out.append(ch)
+                i += 1
+                continue
+
+        # If not inside quotes, detect comments
+        if not in_squote and not in_dquote:
+            # Line comment --
+            if ch == "-" and i + 1 < n and sql[i + 1] == "-":
+                # skip until newline (but keep newline if present)
+                i += 2
+                while i < n and sql[i] != "\n":
+                    i += 1
+                # keep the newline if any
+                if i < n and sql[i] == "\n":
+                    out.append("\n")
+                    i += 1
+                continue
+
+            # Block comment /* ... */
+            if ch == "/" and i + 1 < n and sql[i + 1] == "*":
+                i += 2
+                while i + 1 < n and not (sql[i] == "*" and sql[i + 1] == "/"):
+                    i += 1
+                i += 2 if i + 1 < n else 0
+                out.append(" ")
+                continue
+
+        # normal character
+        out.append(ch)
+        i += 1
+
+    return "".join(out)
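A quick sketch of the stripping behavior on an invented sample (per the code above: a line comment keeps its newline, a block comment collapses to a single space, and comment markers inside literals survive):

    sql = "SELECT '--not a comment' /* drop me */ AS c -- trailing\nFROM t"
    _strip_sql_comments(sql)
    # "SELECT '--not a comment'   AS c \nFROM t"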
+
+
+
+def _compress_whitespace(sql: str) -> str:
+    return re.sub(r"\s+", " ", sql).strip()
+
+
+def _extract_parenthesized_list(text: str, start_idx: int):
+    """
+    Given text and index pointing at an opening '(',
+    return (content_inside_parens, index_after_closing_paren).
+    """
+    if start_idx >= len(text) or text[start_idx] != "(":
+        raise ValueError("start_idx must point to '('")
+
+    depth = 1
+    i = start_idx + 1
+    content_chars = []
+
+    while i < len(text) and depth > 0:
+        ch = text[i]
+        if ch == "(":
+            depth += 1
+            content_chars.append(ch)
+        elif ch == ")":
+            depth -= 1
+            if depth > 0:
+                content_chars.append(ch)
+        else:
+            content_chars.append(ch)
+        i += 1
+
+    return "".join(content_chars).strip(), i
+
+
+def _split_top_level_commas(expr: str) -> List[str]:
+    """Split by commas that are not inside parentheses."""
+    parts, buf = [], []
+    depth = 0
+    for ch in expr:
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth = max(0, depth - 1)
+
+        if ch == "," and depth == 0:
+            part = "".join(buf).strip()
+            if part:
+                parts.append(part)
+            buf = []
+        else:
+            buf.append(ch)
+
+    tail = "".join(buf).strip()
+    if tail:
+        parts.append(tail)
+    return parts
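A small sketch of the top-level split on an invented expression; commas nested inside parentheses are left alone:

    _split_top_level_commas("a, RANGE_N(b BETWEEN 1 AND 10 EACH 1), c")
    # ['a', 'RANGE_N(b BETWEEN 1 AND 10 EACH 1)', 'c']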
+
+
+def _normalize_identifier(ident: str) -> str:
+    ident = ident.strip()
+    if "." in ident:
+        ident = ident.split(".")[-1].strip()
+    if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
+        ident = ident[1:-1]
+    return ident.strip()
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so that:
+      - keywords inside literals can't be detected
+      - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
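Because the replacement is space-for-space, offsets computed on the masked string remain valid on the clean string, which the tail-slicing in `analyze_teradata_ddl` later in this file depends on. A quick check on an invented literal:

    s = "DEFAULT 'It''s -- not a comment' , x INT"
    masked = _mask_single_quoted_literals_same_len(s)
    len(masked) == len(s)  # True
    masked                 # "DEFAULT " + " " * 24 + " , x INT"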
+
+
+# --- Partition parsing helpers (expects literals already masked in the input) ---
+
+_PARTITION_STOPWORDS = {
+    "range_n", "case_n", "columnar", "case",
+    "between", "and", "or", "not", "is", "null", "no", "range",
+    "in", "like", "exists", "distinct",
+    "each", "interval", "day", "month", "year", "from", "to", "every",
+    "cast", "extract", "coalesce", "nullif", "trim", "substr", "substring",
+    "current_date", "current_timestamp",
+    "date", "timestamp", "integer", "smallint", "bigint", "byteint", "decimal", "float",
+    "when", "then", "else", "end",
+    "format", "zone", "as",
+}
+
+
+def _find_identifiers(s: str) -> List[str]:
+    # Collapse quoted qualifiers: "db"."table".col -> col
+    s = re.sub(r'"[^"]+"\s*\.\s*', '', s)
+
+    toks = re.findall(
+        r'"[^"]+"|[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*',
+        s
+    )
+    out = [_normalize_identifier(t) for t in toks]
+    return [x for x in out if x]
+
+
+
+def _columns_from_chunk(masked_chunk: str) -> List[str]:
+    cols = []
+    for c in _find_identifiers(masked_chunk):
+        if c.lower() not in _PARTITION_STOPWORDS:
+            cols.append(c)
+
+    seen, out = set(), []
+    for c in cols:
+        k = c.lower()
+        if k not in seen:
+            seen.add(k)
+            out.append(c)
+    return out
+
+
+def _parse_partition_elements(partition_expr_masked: str) -> List[Dict[str, Any]]:
+    """
+    Parse a (masked) PARTITION BY expression into ordered levels.
+    Each level: {level, kind, columns, raw}
+    """
+    expr = partition_expr_masked.strip()
+    elements = _split_top_level_commas(expr) if expr else []
+
+    levels: List[Dict[str, Any]] = []
+    for level_idx, elem in enumerate(elements if elements else [expr], 1):
+        e = (elem or "").strip()
+        kind = "UNKNOWN"
+        cols: List[str] = []
+
+        m = re.search(r"\bRANGE_N\s*\(\s*(.*?)\s+BETWEEN\b", e, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            kind = "RANGE_N"
+            cols = _columns_from_chunk(m.group(1))
+
+        elif re.search(r"\bCASE_N\s*\(", e, flags=re.IGNORECASE):
+            kind = "CASE_N"
+            m2 = re.search(r"\bCASE_N\s*\(\s*(.*)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m2.group(1) if m2 else e
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                if re.search(r"\bNO\s+RANGE\b", p, flags=re.IGNORECASE):
+                    continue
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        elif re.search(r"\bCOLUMNAR\s*\(", e, flags=re.IGNORECASE):
+            kind = "COLUMNAR"
+            m3 = re.search(r"\bCOLUMNAR\s*\(\s*(.*?)\s*\)\s*$", e, flags=re.IGNORECASE | re.DOTALL)
+            inner = m3.group(1) if m3 else ""
+            parts = _split_top_level_commas(inner)
+            tmp: List[str] = []
+            for p in parts:
+                tmp.extend(_columns_from_chunk(p))
+            # dedup
+            seen, cols = set(), []
+            for c in tmp:
+                k = c.lower()
+                if k not in seen:
+                    seen.add(k)
+                    cols.append(c)
+
+        else:
+            cols = _columns_from_chunk(e)
+
+        levels.append(
+            {
+                "level": level_idx,
+                "kind": kind,
+                "columns": cols,
+                "raw": e,
+            }
+        )
+
+    # If expr was empty, return empty list
+    if expr == "":
+        return []
+    return levels
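A sketch of level parsing on an invented expression. The input must already be masked, otherwise the DATE literals would contribute spurious tokens:

    expr = _mask_single_quoted_literals_same_len(
        "RANGE_N(sale_date BETWEEN DATE '2020-01-01' AND DATE '2024-12-31' EACH INTERVAL '1' MONTH), region_id"
    )
    for lvl in _parse_partition_elements(expr):
        print(lvl["level"], lvl["kind"], lvl["columns"])
    # 1 RANGE_N ['sale_date']
    # 2 UNKNOWN ['region_id']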
+
+
+def _partitioning_by_column(levels: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+    out: Dict[str, List[Dict[str, Any]]] = {}
+    for lvl in levels:
+        for c in lvl.get("columns", []):
+            out.setdefault(c, []).append({"level": lvl["level"], "kind": lvl["kind"]})
+    return out
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    # Find CREATE ... TABLE
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    # Find first '(' after TABLE keyword (should be the column list opener)
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    # escaped quote?
+                    if i + 1 < n and ddl_clean[i + 1] == "'":
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1  # consume closing "
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1  # position after the matching ')'
+
+        i += 1
+
+    # If we get here, parentheses didn't balance; fall back to 0 (whole string)
+    return 0
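A quick check of the scanner on an invented, already-compressed DDL; everything after the returned index is the table-options tail:

    ddl = "CREATE TABLE db.t (a INTEGER, b VARCHAR(10)) PRIMARY INDEX (a)"
    ddl[_find_create_table_columns_block_end(ddl):]
    # " PRIMARY INDEX (a)"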
+
+
+
+import re
+from typing import List, Dict, Any
+
+
+def _mask_single_quoted_literals_same_len(sql: str) -> str:
+    """
+    Replace each single-quoted literal with spaces of the same length so:
+      - keywords inside literals can't be detected
+      - string length stays identical (indexes still align)
+    Handles escaped quotes like 'It''s'.
+    """
+    return re.sub(r"'([^']|'')*'", lambda m: " " * len(m.group(0)), sql)
+
+
+def _find_create_table_columns_block_end(ddl_clean: str) -> int:
+    """
+    Find index just after the closing ')' of the CREATE TABLE column-definition block.
+    Robust: ignores parentheses inside single-quoted literals and double-quoted identifiers.
+    Assumes ddl_clean is comment-stripped + whitespace-compressed.
+    """
+    m = re.search(r"\bCREATE\b.*?\bTABLE\b", ddl_clean, flags=re.IGNORECASE)
+    start_search = m.end() if m else 0
+
+    open_idx = ddl_clean.find("(", start_search)
+    if open_idx == -1:
+        return 0
+
+    depth = 0
+    i = open_idx
+    n = len(ddl_clean)
+
+    while i < n:
+        ch = ddl_clean[i]
+
+        # Skip single-quoted literals: '...''...'
+        if ch == "'":
+            i += 1
+            while i < n:
+                if ddl_clean[i] == "'":
+                    if i + 1 < n and ddl_clean[i + 1] == "'":  # escaped quote
+                        i += 2
+                        continue
+                    i += 1
+                    break
+                i += 1
+            continue
+
+        # Skip double-quoted identifiers: "My Col"
+        if ch == '"':
+            i += 1
+            while i < n and ddl_clean[i] != '"':
+                i += 1
+            i += 1
+            continue
+
+        if ch == "(":
+            depth += 1
+        elif ch == ")":
+            depth -= 1
+            if depth == 0:
+                return i + 1
+
+        i += 1
+
+    # Fallback if unbalanced
+    return 0
+
+
+def analyze_teradata_ddl(ddl: str) -> Dict[str, Any]:
+    """
+    Analyse Teradata CREATE TABLE DDL and return:
+        {
+            'primary_index_columns': [...],
+            'partition_columns': [...],
+            'partitioning_levels': [...],
+            'partitioning_by_column': {...}
+        }
+
+    Critical behavior:
+      - Ignores keyword-like text inside single-quoted literals
+      - Searches PRIMARY INDEX / PARTITION BY only in the table-options tail
+        (after the column-definition block).
+    """
+    ddl_clean = _compress_whitespace(_strip_sql_comments(ddl))
+    ddl_masked = _mask_single_quoted_literals_same_len(ddl_clean)
+
+    # Compute tail start from CLEAN (robust scanner ignores strings/quoted identifiers)
+    tail_start = _find_create_table_columns_block_end(ddl_clean)
+
+    # Tail slices (use masked for searching, clean for extracting)
+    ddl_tail_clean = ddl_clean[tail_start:]
+    ddl_tail_masked = ddl_masked[tail_start:]
+    ddl_tail_masked_upper = ddl_tail_masked.upper()
+
+    # -------- Primary Index --------
+    primary_index_columns: List[str] = []
+
+    if "NO PRIMARY INDEX" in ddl_tail_masked_upper:
+        primary_index_columns = []
+    else:
+        m = re.search(r"\b(?:UNIQUE\s+)?PRIMARY\s+INDEX\b", ddl_tail_masked_upper)
+        if m:
+            idx_rel = ddl_tail_masked.find("(", m.end())
+            if idx_rel != -1:
+                inside, _ = _extract_parenthesized_list(ddl_tail_clean, idx_rel)
+                items = _split_top_level_commas(inside)
+                primary_index_columns = [_normalize_identifier(x) for x in items if x.strip()]
+
+    # -------- Partition By --------
+    partition_columns: List[str] = []
+    partitioning_levels: List[Dict[str, Any]] = []
+    partitioning_by_column: Dict[str, List[Dict[str, Any]]] = {}
+
+    # IMPORTANT: search ONLY in tail (prevents matching PARTITION BY inside DEFAULT literals)
+    m2 = re.search(r"\bPARTITION\s+BY\b", ddl_tail_masked_upper)
+    if m2:
+        after_masked = ddl_tail_masked[m2.end():].lstrip()
+        start_after_rel = m2.end() + (len(ddl_tail_masked[m2.end():]) - len(after_masked))
+
+        if after_masked.startswith("("):
+            inside_masked, end_idx_rel = _extract_parenthesized_list(after_masked, 0)
+            partition_expr_masked = inside_masked
+
+            raw_after_clean = ddl_tail_clean[start_after_rel : start_after_rel + end_idx_rel]
+            partition_expr_raw = raw_after_clean[1:-1].strip() if raw_after_clean.startswith("(") else raw_after_clean.strip()
+        else:
+            stop = re.search(
+                r"\b(?:PRIMARY\s+INDEX|UNIQUE\s+PRIMARY\s+INDEX|INDEX|UNIQUE|WITH|NO\s+FALLBACK|FALLBACK|"
+                r"JOURNAL|CHECKSUM|MERGEBLOCKRATIO|MAP|DEFAULT\s+MERGEBLOCKRATIO|DATABLOCKSIZE)\b",
+                after_masked,
+                flags=re.IGNORECASE,
+            )
+
+            partition_expr_masked = after_masked[: stop.start()].strip() if stop else after_masked.strip()
+            partition_expr_masked = partition_expr_masked.rstrip(";").strip()
+
+            after_clean = ddl_tail_clean[start_after_rel:].lstrip()
+            partition_expr_raw = after_clean[: stop.start()].strip() if stop else after_clean.strip()
+            partition_expr_raw = partition_expr_raw.rstrip(";").strip()
+
+        # Parse levels from masked expression (literals already neutralized)
+        partitioning_levels = _parse_partition_elements(partition_expr_masked)
+
+        # Overwrite raw with original (unmasked) top-level pieces when possible
+        raw_elements = _split_top_level_commas(partition_expr_raw) if partition_expr_raw else []
+        if raw_elements and len(raw_elements) == len(partitioning_levels):
+            for i in range(len(partitioning_levels)):
+                partitioning_levels[i]["raw"] = raw_elements[i].strip()
+
+        partitioning_by_column = _partitioning_by_column(partitioning_levels)
+
+    # Flat list of partition columns (dedup in first-seen order)
+    seen = set()
+    flat: List[str] = []
+    for lvl in partitioning_levels:
+        for c in lvl.get("columns", []):
+            k = c.lower()
+            if k not in seen:
+                seen.add(k)
+                flat.append(c)
+    partition_columns = flat
+
+    return {
+        "primary_index_columns": primary_index_columns,
+        "partition_columns": partition_columns,
+        "partitioning_levels": partitioning_levels,
+        "partitioning_by_column": partitioning_by_column,
+    }
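An end-to-end sketch on an invented CREATE TABLE statement (not taken from the package's own tests):

    ddl = '''
    CREATE MULTISET TABLE db.sales (
        sale_id BIGINT,
        sale_date DATE FORMAT 'YYYY-MM-DD',
        region_id INTEGER
    )
    PRIMARY INDEX (sale_id)
    PARTITION BY RANGE_N(sale_date BETWEEN DATE '2020-01-01'
        AND DATE '2024-12-31' EACH INTERVAL '1' MONTH);
    '''
    info = analyze_teradata_ddl(ddl)
    info["primary_index_columns"]           # ['sale_id']
    info["partition_columns"]               # ['sale_date']
    info["partitioning_levels"][0]["kind"]  # 'RANGE_N'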