clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
  2. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +22 -19
  3. semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
  4. semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. semantic_model_generator/data_processing/cte_utils.py +1 -1
  6. semantic_model_generator/generate_model.py +588 -224
  7. semantic_model_generator/llm/dashscope_client.py +4 -2
  8. semantic_model_generator/llm/enrichment.py +144 -57
  9. semantic_model_generator/llm/progress_tracker.py +16 -15
  10. semantic_model_generator/relationships/__init__.py +15 -0
  11. semantic_model_generator/relationships/discovery.py +202 -0
  12. semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  13. semantic_model_generator/tests/cte_utils_test.py +1 -1
  14. semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  15. semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  16. semantic_model_generator/tests/relationship_discovery_test.py +114 -0
  17. semantic_model_generator/tests/relationships_filters_test.py +166 -30
  18. semantic_model_generator/tests/utils_test.py +1 -1
  19. semantic_model_generator/validate/keywords.py +453 -53
  20. semantic_model_generator/validate/schema.py +4 -2
  21. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
  22. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,202 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
6
+
7
+ import pandas as pd
8
+ from loguru import logger
9
+
10
+ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
11
+ _TABLE_NAME_COL,
12
+ get_table_representation,
13
+ get_valid_schemas_tables_columns_df,
14
+ )
15
+ from semantic_model_generator.data_processing import data_types
16
+ from semantic_model_generator.data_processing.data_types import FQNParts, Table
17
+ from semantic_model_generator.generate_model import (
18
+ _DEFAULT_N_SAMPLE_VALUES_PER_COL,
19
+ _infer_relationships,
20
+ )
21
+ from semantic_model_generator.protos import semantic_model_pb2
22
+
23
+ try: # pragma: no cover - optional dependency for type checking
24
+ from clickzetta.zettapark.session import Session
25
+ except Exception: # pragma: no cover
26
+ Session = Any # type: ignore
27
+
28
+ DEFAULT_MAX_WORKERS = 4
29
+
30
+
31
@dataclass
class RelationshipSummary:
    """Aggregate statistics produced by one relationship-discovery run."""

    # Number of tables examined during discovery.
    total_tables: int
    # Total column count across all examined tables.
    total_columns: int
    # How many relationships the inference step produced.
    total_relationships_found: int
    # Wall-clock duration of the inference step, in milliseconds.
    processing_time_ms: int
37
+
38
+
39
@dataclass
class RelationshipDiscoveryResult:
    """Full result of a discovery run: relationships, inputs, and summary."""

    # Relationships inferred between the supplied tables.
    relationships: List[semantic_model_pb2.Relationship]
    # The table metadata the inference ran over (order preserved).
    tables: List[Table]
    # Counts and timing for this run.
    summary: RelationshipSummary
44
+
45
+
46
+ def _normalize_table_names(table_names: Optional[Iterable[str]]) -> Optional[List[str]]:
47
+ if table_names is None:
48
+ return None
49
+ return [name.upper() for name in table_names]
50
+
51
+
52
def _build_tables_from_dataframe(
    session: Session,
    workspace: str,
    schema: str,
    columns_df: pd.DataFrame,
    sample_values_per_column: int,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> List[Tuple[FQNParts, Table]]:
    """Construct (FQN, Table) pairs for every table present in *columns_df*.

    The dataframe must carry a table-name column (``_TABLE_NAME_COL``);
    first-seen order of the upper-cased table names is preserved.
    """
    if columns_df.empty:
        return []

    if _TABLE_NAME_COL not in columns_df.columns:
        raise KeyError(
            f"Expected '{_TABLE_NAME_COL}' column in metadata dataframe. "
            "Ensure information_schema query returned table names."
        )

    # De-duplicate while keeping the order tables first appear in.
    ordered_names = (
        columns_df[_TABLE_NAME_COL].astype(str).str.upper().drop_duplicates().tolist()
    )

    results: List[Tuple[FQNParts, Table]] = []
    for position, name in enumerate(ordered_names):
        subset = columns_df[columns_df[_TABLE_NAME_COL] == name]
        if subset.empty:
            continue

        # Never spin up more workers than there are columns to sample.
        worker_count = min(max_workers, len(subset.index) or 1)
        proto = get_table_representation(
            session=session,
            workspace=workspace,
            schema_name=schema,
            table_name=name,
            table_index=position,
            ndv_per_column=sample_values_per_column,
            columns_df=subset,
            max_workers=worker_count,
        )
        fqn = FQNParts(database=workspace, schema_name=schema, table=name)
        results.append((fqn, proto))

    return results
98
+
99
+
100
def _discover_relationships(
    raw_tables: List[Tuple[FQNParts, Table]],
    strict_join_inference: bool,
    session: Optional[Session],
) -> List[semantic_model_pb2.Relationship]:
    """Run `_infer_relationships` over the tables; short-circuit on no input.

    The session is only forwarded when strict join inference is requested,
    since that is the only mode that issues validation queries.
    """
    if not raw_tables:
        return []

    inference_session = session if strict_join_inference else None
    return _infer_relationships(
        raw_tables,
        session=inference_session,
        strict_join_inference=strict_join_inference,
    )
114
+
115
+
116
def discover_relationships_from_tables(
    tables: Sequence[Tuple[FQNParts, Table]],
    *,
    strict_join_inference: bool = False,
    session: Optional[Session] = None,
) -> RelationshipDiscoveryResult:
    """
    Run relationship inference using pre-constructed table metadata.
    """
    started_at = time.perf_counter()
    found = _discover_relationships(
        list(tables),
        strict_join_inference=strict_join_inference,
        session=session,
    )
    # Timing covers only the inference call, reported in whole milliseconds.
    elapsed_ms = int((time.perf_counter() - started_at) * 1000)

    column_count = 0
    for _, table in tables:
        column_count += len(table.columns)

    return RelationshipDiscoveryResult(
        relationships=found,
        tables=[table for _, table in tables],
        summary=RelationshipSummary(
            total_tables=len(tables),
            total_columns=column_count,
            total_relationships_found=len(found),
            processing_time_ms=elapsed_ms,
        ),
    )
146
+
147
+
148
def discover_relationships_from_schema(
    session: Session,
    workspace: str,
    schema: str,
    *,
    table_names: Optional[Sequence[str]] = None,
    sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
    strict_join_inference: bool = False,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> RelationshipDiscoveryResult:
    """
    Discover table relationships for all tables in a ClickZetta schema.

    Args:
        session: Active ClickZetta session used for metadata queries.
        workspace: Workspace (database) to inspect.
        schema: Schema whose tables are analyzed.
        table_names: Optional subset of tables; names are matched case-insensitively.
        sample_values_per_column: Sample values fetched per column for inference.
        strict_join_inference: When True, validate candidate joins via *session*.
        max_workers: Upper bound on worker threads used per table.

    Returns:
        A RelationshipDiscoveryResult with the inferred relationships, the
        table metadata used, and count/timing summary statistics. An empty
        result is returned when no column metadata is found.
    """
    normalized_tables = _normalize_table_names(table_names)

    metadata_df = get_valid_schemas_tables_columns_df(
        session=session,
        workspace=workspace,
        table_schema=schema,
        table_names=normalized_tables,
    )
    # information_schema column-name casing varies; normalize before lookups.
    metadata_df.columns = [str(col).upper() for col in metadata_df.columns]

    if metadata_df.empty:
        # loguru interpolates with str.format-style "{}" placeholders, not
        # stdlib-logging "%s"; the previous %-style call dropped the args.
        logger.warning(
            "No column metadata found for workspace={} schema={} tables={}",
            workspace,
            schema,
            table_names,
        )
        return RelationshipDiscoveryResult(
            relationships=[],
            tables=[],
            summary=RelationshipSummary(
                total_tables=0,
                total_columns=0,
                total_relationships_found=0,
                processing_time_ms=0,
            ),
        )

    raw_tables = _build_tables_from_dataframe(
        session=session,
        workspace=workspace,
        schema=schema,
        columns_df=metadata_df,
        sample_values_per_column=sample_values_per_column,
        max_workers=max_workers,
    )

    return discover_relationships_from_tables(
        raw_tables,
        strict_join_inference=strict_join_inference,
        session=session,
    )
@@ -3,15 +3,13 @@ from unittest import mock
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from semantic_model_generator.clickzetta_utils import env_vars
7
6
  from semantic_model_generator.clickzetta_utils import clickzetta_connector as connector
7
+ from semantic_model_generator.clickzetta_utils import env_vars
8
8
 
9
9
 
10
10
  def test_fetch_stages_includes_user_volume(monkeypatch):
11
11
  data = pd.DataFrame({"name": ["shared_stage"]})
12
- with mock.patch.object(
13
- connector, "_execute_query_to_pandas", return_value=data
14
- ):
12
+ with mock.patch.object(connector, "_execute_query_to_pandas", return_value=data):
15
13
  stages = connector.fetch_stages_in_schema(
16
14
  connection=mock.MagicMock(), schema_name="WORKSPACE.SCHEMA"
17
15
  )
@@ -29,9 +27,7 @@ def test_fetch_yaml_names_in_user_volume(monkeypatch):
29
27
  ]
30
28
  }
31
29
  )
32
- with mock.patch.object(
33
- connector, "_execute_query_to_pandas", return_value=data
34
- ):
30
+ with mock.patch.object(connector, "_execute_query_to_pandas", return_value=data):
35
31
  files = connector.fetch_yaml_names_in_stage(
36
32
  connection=mock.MagicMock(),
37
33
  stage="volume:user://~/semantic_models/",
@@ -4,10 +4,10 @@ import pytest
4
4
  import sqlglot
5
5
 
6
6
  from semantic_model_generator.data_processing.cte_utils import (
7
+ ClickzettaDialect,
7
8
  _enrich_column_in_expr_with_aggregation,
8
9
  _get_col_expr,
9
10
  _validate_col,
10
- ClickzettaDialect,
11
11
  context_to_column_format,
12
12
  expand_all_logical_tables_as_ctes,
13
13
  generate_select,
@@ -31,8 +31,18 @@ def test_string_date_promoted_to_time_dimension() -> None:
31
31
  id_=0,
32
32
  name="ORDERS",
33
33
  columns=[
34
- Column(id_=0, column_name="order_date", column_type="STRING", values=["2024-01-01", "2024-02-01"]),
35
- Column(id_=1, column_name="order_status", column_type="STRING", values=["OPEN", "CLOSED"]),
34
+ Column(
35
+ id_=0,
36
+ column_name="order_date",
37
+ column_type="STRING",
38
+ values=["2024-01-01", "2024-02-01"],
39
+ ),
40
+ Column(
41
+ id_=1,
42
+ column_name="order_status",
43
+ column_type="STRING",
44
+ values=["OPEN", "CLOSED"],
45
+ ),
36
46
  ],
37
47
  )
38
48
 
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from semantic_model_generator import generate_model
4
3
  from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
5
4
  from semantic_model_generator.llm.dashscope_client import DashscopeResponse
6
5
  from semantic_model_generator.llm.enrichment import enrich_semantic_model
@@ -16,9 +15,16 @@ class _FakeDashscopeClient:
16
15
  self._index = 0
17
16
 
18
17
  def chat_completion(self, messages): # type: ignore[no-untyped-def]
19
- payload = self._payloads[self._index] if self._index < len(self._payloads) else self._payloads[-1]
18
+ payload = (
19
+ self._payloads[self._index]
20
+ if self._index < len(self._payloads)
21
+ else self._payloads[-1]
22
+ )
20
23
  self._index += 1
21
- return DashscopeResponse(content=json.dumps(payload, ensure_ascii=False), request_id=f"test_{self._index}")
24
+ return DashscopeResponse(
25
+ content=json.dumps(payload, ensure_ascii=False),
26
+ request_id=f"test_{self._index}",
27
+ )
22
28
 
23
29
 
24
30
  def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
@@ -26,15 +32,27 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
26
32
  id_=0,
27
33
  name="orders",
28
34
  columns=[
29
- Column(id_=0, column_name="order_status", column_type="STRING", values=["OPEN", "CLOSED"]),
30
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["12.5", "18.3"]),
35
+ Column(
36
+ id_=0,
37
+ column_name="order_status",
38
+ column_type="STRING",
39
+ values=["OPEN", "CLOSED"],
40
+ ),
41
+ Column(
42
+ id_=1,
43
+ column_name="total_amount",
44
+ column_type="NUMBER",
45
+ values=["12.5", "18.3"],
46
+ ),
31
47
  ],
32
48
  )
33
49
 
34
50
  table_proto = semantic_model_pb2.Table(
35
51
  name="ORDERS",
36
52
  description=" ",
37
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
53
+ base_table=semantic_model_pb2.FullyQualifiedTable(
54
+ database="SALES", schema="PUBLIC", table="ORDERS"
55
+ ),
38
56
  dimensions=[
39
57
  semantic_model_pb2.Dimension(
40
58
  name="order_status",
@@ -98,16 +116,18 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
98
116
  }
99
117
  ],
100
118
  "filters": [
101
- {
102
- "name": "order_status_include_values",
103
- "description": "Limit the result set to a sample of order statuses.",
104
- "synonyms": ["Order status filter"],
105
- }
119
+ {
120
+ "name": "order_status_include_values",
121
+ "description": "Limit the result set to a sample of order statuses.",
122
+ "synonyms": ["Order status filter"],
123
+ }
106
124
  ],
107
125
  "model_description": "Semantic model for customer orders and related metrics.",
108
126
  }
109
127
 
110
- client = _FakeDashscopeClient([fake_response, {"model_metrics": []}, {"verified_queries": []}])
128
+ client = _FakeDashscopeClient(
129
+ [fake_response, {"model_metrics": []}, {"verified_queries": []}]
130
+ )
111
131
  enrich_semantic_model(
112
132
  model,
113
133
  [(FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_table)],
@@ -116,7 +136,10 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
116
136
  )
117
137
 
118
138
  table = model.tables[0]
119
- assert table.description == "Orders fact table that records order status and total amount."
139
+ assert (
140
+ table.description
141
+ == "Orders fact table that records order status and total amount."
142
+ )
120
143
 
121
144
  dimension = next(dim for dim in table.dimensions if dim.expr == "order_status")
122
145
  assert dimension.description == "Current execution status for each order."
@@ -126,8 +149,12 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
126
149
  assert fact.description == "Order total including taxes."
127
150
  assert "Order total" in list(fact.synonyms)
128
151
 
129
- filter_obj = next(flt for flt in table.filters if flt.name == "order_status_include_values")
130
- assert filter_obj.description == "Limit the result set to a sample of order statuses."
152
+ filter_obj = next(
153
+ flt for flt in table.filters if flt.name == "order_status_include_values"
154
+ )
155
+ assert (
156
+ filter_obj.description == "Limit the result set to a sample of order statuses."
157
+ )
131
158
  assert "Order status filter" in list(filter_obj.synonyms)
132
159
 
133
160
  assert len(table.metrics) == 1
@@ -135,10 +162,15 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
135
162
  assert metric.name.startswith("gmv")
136
163
  assert metric.expr == "SUM(total_amount)"
137
164
  assert "GMV" in list(metric.synonyms)
138
- assert metric.description == "Based on total_amount and used as gross merchandise value."
165
+ assert (
166
+ metric.description
167
+ == "Based on total_amount and used as gross merchandise value."
168
+ )
139
169
 
140
170
  assert model.custom_instructions == ""
141
- assert model.description == "Semantic model for customer orders and related metrics."
171
+ assert (
172
+ model.description == "Semantic model for customer orders and related metrics."
173
+ )
142
174
 
143
175
 
144
176
  class _FakeSession:
@@ -160,8 +192,15 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
160
192
  id_=0,
161
193
  name="orders",
162
194
  columns=[
163
- Column(id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]),
164
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["10", "20"]),
195
+ Column(
196
+ id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
197
+ ),
198
+ Column(
199
+ id_=1,
200
+ column_name="total_amount",
201
+ column_type="NUMBER",
202
+ values=["10", "20"],
203
+ ),
165
204
  ],
166
205
  )
167
206
 
@@ -169,15 +208,21 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
169
208
  id_=1,
170
209
  name="payments",
171
210
  columns=[
172
- Column(id_=0, column_name="payment_id", column_type="NUMBER", values=["1", "2"]),
173
- Column(id_=1, column_name="amount", column_type="NUMBER", values=["5", "15"]),
211
+ Column(
212
+ id_=0, column_name="payment_id", column_type="NUMBER", values=["1", "2"]
213
+ ),
214
+ Column(
215
+ id_=1, column_name="amount", column_type="NUMBER", values=["5", "15"]
216
+ ),
174
217
  ],
175
218
  )
176
219
 
177
220
  orders_proto = semantic_model_pb2.Table(
178
221
  name="ORDERS",
179
222
  description=" ",
180
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
223
+ base_table=semantic_model_pb2.FullyQualifiedTable(
224
+ database="SALES", schema="PUBLIC", table="ORDERS"
225
+ ),
181
226
  facts=[
182
227
  semantic_model_pb2.Fact(
183
228
  name="total_amount",
@@ -191,7 +236,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
191
236
  payments_proto = semantic_model_pb2.Table(
192
237
  name="PAYMENTS",
193
238
  description=" ",
194
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="PAYMENTS"),
239
+ base_table=semantic_model_pb2.FullyQualifiedTable(
240
+ database="SALES", schema="PUBLIC", table="PAYMENTS"
241
+ ),
195
242
  facts=[
196
243
  semantic_model_pb2.Fact(
197
244
  name="amount",
@@ -202,7 +249,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
202
249
  ],
203
250
  )
204
251
 
205
- model = semantic_model_pb2.SemanticModel(name="Orders Model", tables=[orders_proto, payments_proto])
252
+ model = semantic_model_pb2.SemanticModel(
253
+ name="Orders Model", tables=[orders_proto, payments_proto]
254
+ )
206
255
 
207
256
  table_payload = {
208
257
  "table_description": "Orders fact table with totals.",
@@ -253,22 +302,32 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
253
302
  }
254
303
 
255
304
  # Model description response for when _summarize_model_description is called
256
- model_description_payload = "This is an orders model for tracking sales and payments."
257
-
258
- client = _FakeDashscopeClient([
259
- table_payload,
260
- table_payload_payments,
261
- model_description_payload,
262
- model_metrics_payload,
263
- verified_queries_payload,
264
- ])
305
+ model_description_payload = (
306
+ "This is an orders model for tracking sales and payments."
307
+ )
308
+
309
+ client = _FakeDashscopeClient(
310
+ [
311
+ table_payload,
312
+ table_payload_payments,
313
+ model_description_payload,
314
+ model_metrics_payload,
315
+ verified_queries_payload,
316
+ ]
317
+ )
265
318
  session = _FakeSession()
266
319
 
267
320
  enrich_semantic_model(
268
321
  model,
269
322
  [
270
- (FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_orders),
271
- (FQNParts(database="SALES", schema_name="PUBLIC", table="PAYMENTS"), raw_payments),
323
+ (
324
+ FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
325
+ raw_orders,
326
+ ),
327
+ (
328
+ FQNParts(database="SALES", schema_name="PUBLIC", table="PAYMENTS"),
329
+ raw_payments,
330
+ ),
272
331
  ],
273
332
  client,
274
333
  placeholder=" ",
@@ -297,15 +356,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
297
356
  id_=0,
298
357
  name="orders",
299
358
  columns=[
300
- Column(id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]),
301
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["10", "20"]),
359
+ Column(
360
+ id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
361
+ ),
362
+ Column(
363
+ id_=1,
364
+ column_name="total_amount",
365
+ column_type="NUMBER",
366
+ values=["10", "20"],
367
+ ),
302
368
  ],
303
369
  )
304
370
 
305
371
  orders_proto = semantic_model_pb2.Table(
306
372
  name="ORDERS",
307
373
  description=" ",
308
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
374
+ base_table=semantic_model_pb2.FullyQualifiedTable(
375
+ database="SALES", schema="PUBLIC", table="ORDERS"
376
+ ),
309
377
  facts=[
310
378
  semantic_model_pb2.Fact(
311
379
  name="total_amount",
@@ -341,12 +409,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
341
409
  # Model description response for when _summarize_model_description is called
342
410
  model_description_payload = "This is an orders model for tracking order metrics."
343
411
 
344
- client = _FakeDashscopeClient([table_payload, model_description_payload, model_metrics_payload, verified_queries_payload])
412
+ client = _FakeDashscopeClient(
413
+ [
414
+ table_payload,
415
+ model_description_payload,
416
+ model_metrics_payload,
417
+ verified_queries_payload,
418
+ ]
419
+ )
345
420
  session = _FakeSession()
346
421
 
347
422
  enrich_semantic_model(
348
423
  model,
349
- [(FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_orders)],
424
+ [
425
+ (
426
+ FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
427
+ raw_orders,
428
+ )
429
+ ],
350
430
  client,
351
431
  placeholder=" ",
352
432
  session=session,
@@ -365,15 +445,27 @@ def test_model_metrics_skipped_with_no_facts() -> None:
365
445
  id_=0,
366
446
  name="customers",
367
447
  columns=[
368
- Column(id_=0, column_name="customer_id", column_type="NUMBER", values=["1", "2"]),
369
- Column(id_=1, column_name="customer_name", column_type="STRING", values=["Alice", "Bob"]),
448
+ Column(
449
+ id_=0,
450
+ column_name="customer_id",
451
+ column_type="NUMBER",
452
+ values=["1", "2"],
453
+ ),
454
+ Column(
455
+ id_=1,
456
+ column_name="customer_name",
457
+ column_type="STRING",
458
+ values=["Alice", "Bob"],
459
+ ),
370
460
  ],
371
461
  )
372
462
 
373
463
  customers_proto = semantic_model_pb2.Table(
374
464
  name="CUSTOMERS",
375
465
  description=" ",
376
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="CUSTOMERS"),
466
+ base_table=semantic_model_pb2.FullyQualifiedTable(
467
+ database="SALES", schema="PUBLIC", table="CUSTOMERS"
468
+ ),
377
469
  dimensions=[
378
470
  semantic_model_pb2.Dimension(
379
471
  name="customer_name",
@@ -384,7 +476,9 @@ def test_model_metrics_skipped_with_no_facts() -> None:
384
476
  ],
385
477
  )
386
478
 
387
- model = semantic_model_pb2.SemanticModel(name="Customer Model", tables=[customers_proto])
479
+ model = semantic_model_pb2.SemanticModel(
480
+ name="Customer Model", tables=[customers_proto]
481
+ )
388
482
 
389
483
  table_payload = {
390
484
  "table_description": "Customer dimension table.",
@@ -408,12 +502,24 @@ def test_model_metrics_skipped_with_no_facts() -> None:
408
502
  # Model description response for when _summarize_model_description is called
409
503
  model_description_payload = "This is a customer dimension model."
410
504
 
411
- client = _FakeDashscopeClient([table_payload, model_description_payload, model_metrics_payload, verified_queries_payload])
505
+ client = _FakeDashscopeClient(
506
+ [
507
+ table_payload,
508
+ model_description_payload,
509
+ model_metrics_payload,
510
+ verified_queries_payload,
511
+ ]
512
+ )
412
513
  session = _FakeSession()
413
514
 
414
515
  enrich_semantic_model(
415
516
  model,
416
- [(FQNParts(database="SALES", schema_name="PUBLIC", table="CUSTOMERS"), raw_customers)],
517
+ [
518
+ (
519
+ FQNParts(database="SALES", schema_name="PUBLIC", table="CUSTOMERS"),
520
+ raw_customers,
521
+ )
522
+ ],
417
523
  client,
418
524
  placeholder=" ",
419
525
  session=session,