PyPI - clickzetta-semantic-model-generator - Versions diffs - 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl - Mend

clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

semantic_model_generator/relationships/discovery.py CHANGED Viewed

@@ -2,19 +2,18 @@ from __future__ import annotations
 import time
 from dataclasses import dataclass
-from typing import Any, Iterable, List, Optional, Sequence, Tuple
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
 import pandas as pd
 from loguru import logger
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     _TABLE_NAME_COL,
-    _TABLE_SCHEMA_COL,
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
 from semantic_model_generator.data_processing import data_types
-from semantic_model_generator.data_processing.data_types import FQNParts, Table
+from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
 from semantic_model_generator.generate_model import (
     _DEFAULT_N_SAMPLE_VALUES_PER_COL,
     _infer_relationships,
@@ -35,6 +34,10 @@ class RelationshipSummary:
     total_columns: int
     total_relationships_found: int
     processing_time_ms: int
+    limited_by_timeout: bool = False
+    limited_by_max_relationships: bool = False
+    limited_by_table_cap: bool = False
+    notes: Optional[str] = None
 @dataclass
@@ -68,11 +71,7 @@ def _build_tables_from_dataframe(
         )
     table_order = (
-        columns_df[_TABLE_NAME_COL]
-        .astype(str)
-        .str.upper()
-        .drop_duplicates()
-        .tolist()
+        columns_df[_TABLE_NAME_COL].astype(str).str.upper().drop_duplicates().tolist()
     )
     tables: List[Tuple[FQNParts, Table]] = []
@@ -102,20 +101,125 @@ def _build_tables_from_dataframe(
     return tables
+def _tables_payload_to_raw_tables(
+    tables: Sequence[Mapping[str, Any]],
+    *,
+    default_workspace: str = "OFFLINE",
+    default_schema: str = "PUBLIC",
+) -> List[Tuple[FQNParts, Table]]:
+    raw_tables: List[Tuple[FQNParts, Table]] = []
+    for table_index, table_entry in enumerate(tables):
+        if not isinstance(table_entry, Mapping):
+            raise TypeError("Each table definition must be a mapping of table metadata")
+        table_name = str(
+            table_entry.get("table_name")
+            or table_entry.get("name")
+            or table_entry.get("table")
+            or ""
+        ).strip()
+        if not table_name:
+            raise ValueError("Table definition missing 'table_name'")
+        workspace = str(table_entry.get("workspace") or default_workspace).strip() or default_workspace
+        schema = str(
+            table_entry.get("schema")
+            or table_entry.get("schema_name")
+            or default_schema
+        ).strip() or default_schema
+        columns_payload = table_entry.get("columns")
+        if not isinstance(columns_payload, Sequence) or not columns_payload:
+            raise ValueError(
+                f"Table '{table_name}' must include a non-empty 'columns' list"
+            )
+        columns: List[Column] = []
+        for column_index, column_entry in enumerate(columns_payload):
+            if not isinstance(column_entry, Mapping):
+                raise TypeError(
+                    f"Column definition for table '{table_name}' must be a mapping"
+                )
+            column_name = str(
+                column_entry.get("name")
+                or column_entry.get("column_name")
+                or column_entry.get("field")
+                or ""
+            ).strip()
+            if not column_name:
+                raise ValueError(
+                    f"Column definition in table '{table_name}' missing 'name'"
+                )
+            column_type = str(
+                column_entry.get("type")
+                or column_entry.get("data_type")
+                or "STRING"
+            ).strip()
+            values = column_entry.get("sample_values") or column_entry.get("values")
+            if isinstance(values, Sequence) and not isinstance(values, (str, bytes)):
+                sample_values = [str(value) for value in values]
+            else:
+                sample_values = None
+            is_primary = bool(
+                column_entry.get("is_primary_key")
+                or column_entry.get("primary_key")
+                or column_entry.get("is_primary")
+            )
+            columns.append(
+                Column(
+                    id_=column_index,
+                    column_name=column_name,
+                    column_type=column_type,
+                    values=sample_values,
+                    comment=column_entry.get("comment"),
+                    is_primary_key=is_primary,
+                )
+            )
+        table_proto = Table(
+            id_=table_index,
+            name=table_name.upper(),
+            columns=columns,
+            comment=table_entry.get("comment"),
+        )
+        fqn = FQNParts(
+            database=workspace.upper(),
+            schema_name=schema.upper(),
+            table=table_name,
+        )
+        raw_tables.append((fqn, table_proto))
+    return raw_tables
 def _discover_relationships(
     raw_tables: List[Tuple[FQNParts, Table]],
     strict_join_inference: bool,
     session: Optional[Session],
-) -> List[semantic_model_pb2.Relationship]:
+    *,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = None,
+) -> Tuple[List[semantic_model_pb2.Relationship], Dict[str, bool]]:
     if not raw_tables:
-        return []
+        return [], {"limited_by_timeout": False, "limited_by_max_relationships": False}
+    status: Dict[str, bool] = {}
     relationships = _infer_relationships(
         raw_tables,
         session=session if strict_join_inference else None,
         strict_join_inference=strict_join_inference,
+        status=status,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
     )
-    return relationships
+    return relationships, status
 def discover_relationships_from_tables(
@@ -123,33 +227,86 @@ def discover_relationships_from_tables(
     *,
     strict_join_inference: bool = False,
     session: Optional[Session] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 30.0,
+    max_tables: Optional[int] = None,
 ) -> RelationshipDiscoveryResult:
     """
     Run relationship inference using pre-constructed table metadata.
     """
     start = time.perf_counter()
-    relationships = _discover_relationships(
-        list(tables),
+    raw_tables = list(tables)
+    limited_by_table_cap = False
+    notes: List[str] = []
+    if max_tables is not None and len(raw_tables) > max_tables:
+        limited_by_table_cap = True
+        notes.append(
+            f"Input contained {len(raw_tables)} tables; analysis limited to first {max_tables}."
+        )
+        raw_tables = raw_tables[:max_tables]
+    relationships, status = _discover_relationships(
+        raw_tables,
         strict_join_inference=strict_join_inference,
         session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
     )
     end = time.perf_counter()
-    all_columns = sum(len(table.columns) for _, table in tables)
+    all_columns = sum(len(table.columns) for _, table in raw_tables)
     summary = RelationshipSummary(
-        total_tables=len(tables),
+        total_tables=len(raw_tables),
         total_columns=all_columns,
         total_relationships_found=len(relationships),
         processing_time_ms=int((end - start) * 1000),
+        limited_by_timeout=status.get("limited_by_timeout", False),
+        limited_by_max_relationships=status.get("limited_by_max_relationships", False),
+        limited_by_table_cap=limited_by_table_cap,
+        notes=" ".join(notes) if notes else None,
     )
     return RelationshipDiscoveryResult(
         relationships=relationships,
-        tables=[table for _, table in tables],
+        tables=[table for _, table in raw_tables],
         summary=summary,
     )
+def discover_relationships_from_table_definitions(
+    table_definitions: Sequence[Mapping[str, Any]],
+    *,
+    default_workspace: str = "OFFLINE",
+    default_schema: str = "PUBLIC",
+    strict_join_inference: bool = False,
+    session: Optional[Session] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 15.0,
+    max_tables: Optional[int] = None,
+) -> RelationshipDiscoveryResult:
+    """Run relationship inference using raw table metadata dictionaries."""
+    raw_tables = _tables_payload_to_raw_tables(
+        table_definitions,
+        default_workspace=default_workspace,
+        default_schema=default_schema,
+    )
+    return discover_relationships_from_tables(
+        raw_tables,
+        strict_join_inference=strict_join_inference,
+        session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
+        max_tables=max_tables,
+    )
 def discover_relationships_from_schema(
     session: Session,
     workspace: str,
@@ -159,6 +316,10 @@ def discover_relationships_from_schema(
     sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
     strict_join_inference: bool = False,
     max_workers: int = DEFAULT_MAX_WORKERS,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 30.0,
+    max_tables: Optional[int] = 60,
 ) -> RelationshipDiscoveryResult:
     """
     Discover table relationships for all tables in a ClickZetta schema.
@@ -204,4 +365,8 @@ def discover_relationships_from_schema(
         raw_tables,
         strict_join_inference=strict_join_inference,
         session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
+        max_tables=max_tables,
     )

semantic_model_generator/tests/clickzetta_connector_test.py CHANGED Viewed

@@ -3,15 +3,13 @@ from unittest import mock
 import pandas as pd
-from semantic_model_generator.clickzetta_utils import env_vars
 from semantic_model_generator.clickzetta_utils import clickzetta_connector as connector
+from semantic_model_generator.clickzetta_utils import env_vars
 def test_fetch_stages_includes_user_volume(monkeypatch):
     data = pd.DataFrame({"name": ["shared_stage"]})
-    with mock.patch.object(
-        connector, "_execute_query_to_pandas", return_value=data
-    ):
+    with mock.patch.object(connector, "_execute_query_to_pandas", return_value=data):
         stages = connector.fetch_stages_in_schema(
             connection=mock.MagicMock(), schema_name="WORKSPACE.SCHEMA"
         )
@@ -29,9 +27,7 @@ def test_fetch_yaml_names_in_user_volume(monkeypatch):
             ]
         }
     )
-    with mock.patch.object(
-        connector, "_execute_query_to_pandas", return_value=data
-    ):
+    with mock.patch.object(connector, "_execute_query_to_pandas", return_value=data):
         files = connector.fetch_yaml_names_in_stage(
             connection=mock.MagicMock(),
             stage="volume:user://~/semantic_models/",

semantic_model_generator/tests/cte_utils_test.py CHANGED Viewed

@@ -4,10 +4,11 @@ import pytest
 import sqlglot
 from semantic_model_generator.data_processing.cte_utils import (
+    ClickzettaDialect,
+    _prepare_sql_for_parsing,
     _enrich_column_in_expr_with_aggregation,
     _get_col_expr,
     _validate_col,
-    ClickzettaDialect,
     context_to_column_format,
     expand_all_logical_tables_as_ctes,
     generate_select,
@@ -304,7 +305,7 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100"
+            "WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
         ]
         assert got == want
@@ -312,8 +313,8 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format_w_agg()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100",
-            "WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100",
+            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
+            "WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
         ]
         assert sorted(got) == sorted(want)
@@ -321,7 +322,7 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format_w_agg_only()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100"
+            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
         ]
         assert sorted(got) == sorted(want)
@@ -437,21 +438,21 @@ class SemanticModelTest(TestCase):
         want = """WITH __t1 AS (SELECT
     d1_expr AS d1,
     d2_expr AS d2
-  FROM db.sc.t1
+  FROM `db`.`sc`.`t1`
 ), __t2 AS (
   SELECT
     td1_expr AS td1,
     m1_expr AS m1,
     m1_expr AS m2,
     m3_expr
-  FROM db.sc.t2
+  FROM `db`.`sc`.`t2`
 )
 SELECT
   *
 FROM __t2"""
-        assert sqlglot.parse_one(want, ClickzettaDialect) == sqlglot.parse_one(
-            got, ClickzettaDialect
-        )
+        assert sqlglot.parse_one(
+            _prepare_sql_for_parsing(want), ClickzettaDialect
+        ) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)
     def test_expand_all_logical_tables_as_ctes_with_column_renaming(self) -> None:
         ctx = semantic_model_pb2.SemanticModel(
@@ -465,12 +466,12 @@ FROM __t2"""
     clcks AS clicks,
     clcks,
     cst
-  FROM db.sc.t1
+  FROM `db`.`sc`.`t1`
 )
 SELECT
   *
 FROM __t1
         """
-        assert sqlglot.parse_one(want, ClickzettaDialect) == sqlglot.parse_one(
-            got, ClickzettaDialect
-        )
+        assert sqlglot.parse_one(
+            _prepare_sql_for_parsing(want), ClickzettaDialect
+        ) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)

semantic_model_generator/tests/generate_model_classification_test.py CHANGED Viewed

@@ -31,8 +31,18 @@ def test_string_date_promoted_to_time_dimension() -> None:
         id_=0,
         name="ORDERS",
         columns=[
-            Column(id_=0, column_name="order_date", column_type="STRING", values=["2024-01-01", "2024-02-01"]),
-            Column(id_=1, column_name="order_status", column_type="STRING", values=["OPEN", "CLOSED"]),
+            Column(
+                id_=0,
+                column_name="order_date",
+                column_type="STRING",
+                values=["2024-01-01", "2024-02-01"],
+            ),
+            Column(
+                id_=1,
+                column_name="order_status",
+                column_type="STRING",
+                values=["OPEN", "CLOSED"],
+            ),
         ],
     )

clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl