PyPI - clickzetta-semantic-model-generator - Versions diffs - 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl - Mend

clickzetta-semantic-model-generator: 1.0.3 (py3-none-any.whl) → 1.0.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

{clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: clickzetta-semantic-model-generator
-Version: 1.0.3
+Version: 1.0.4
 Summary: Curate a Semantic Model for ClickZetta Lakehouse
 License: Apache Software License; BSD License
 Author: qililiang

{clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 semantic_model_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=rFBWdNQerLYinn6RoDV_J4k2G4LLofiFkLDa7j8hmng,32888
+semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=LnGQTBj94aC8Zk9aVe2efA6-3UX_E8Q7ITvnfEoByjw,32819
 semantic_model_generator/clickzetta_utils/env_vars.py,sha256=8cbL6R75c1-aVQ2i1TDr9SiHCUjTrgvXbIRz4MbcmbE,7664
-semantic_model_generator/clickzetta_utils/utils.py,sha256=D0SX2faBjwvhFJLt1Yk4mlZmyHmQt7LN93Jrc5YIU-A,3800
+semantic_model_generator/clickzetta_utils/utils.py,sha256=UBfWy9qOTyut8tL02gOHHbh6Uz8RqRz5Mm2YdKWFN54,4950
 semantic_model_generator/data_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-semantic_model_generator/data_processing/cte_utils.py,sha256=jfTJIwc89-0nnelVw_5vpIVRout7V0YooUDfZzTzDr4,16086
+semantic_model_generator/data_processing/cte_utils.py,sha256=-kQw_PfPPe3mf7shQf1XV5rqfqYdB9WK4A-EwAcKc_o,16928
 semantic_model_generator/data_processing/cte_utils_test.py,sha256=l6QkyyH22FexLKjvvbS9Je3YtdTrJE3a-BiknCy1g9s,2822
 semantic_model_generator/data_processing/data_types.py,sha256=1HsSCkdCWvcXiwN3o1-HVQi_ZVIR0lYevXG9CE1TvRc,1172
 semantic_model_generator/data_processing/proto_utils.py,sha256=UwqCfQYilTx68KcA4IYZN7PeM4Pz_pK1h0FrVJomzV8,2938
-semantic_model_generator/generate_model.py,sha256=ogNvx1HNOnC5KIZlGDwcWL7PLMHRs8zcZZbwricffDo,121843
+semantic_model_generator/generate_model.py,sha256=vwISWJzYf4XS1TuLclpxKbberlsRKM99olrFlWaTCUw,125549
 semantic_model_generator/llm/__init__.py,sha256=rLQt2pzRmxtnBLKjxN_qZ2a_nvkFHtmguU5lyajCldw,1030
 semantic_model_generator/llm/dashscope_client.py,sha256=lHS36iqNZbFhwgidPpW1Bwwy4S2O7GeLyMSMdlSoBsY,6050
 semantic_model_generator/llm/enrichment.py,sha256=49e9Jg_jHfhUIEQ3JserEc5DV5sFWA12K76TY4UwnCg,41448
@@ -16,13 +16,13 @@ semantic_model_generator/output_models/.keep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 semantic_model_generator/protos/semantic_model.proto,sha256=WZiN4b8vR-ZX-Lj9Vsm6HjZNAyNvM1znIyut_YkPVSI,16473
 semantic_model_generator/protos/semantic_model_pb2.py,sha256=scbWkW-I-r3_hp_5SHoOWn02p52RJ9DJ0_-nRgr0LHc,25606
 semantic_model_generator/protos/semantic_model_pb2.pyi,sha256=iiBIZxtX9d6IuUO3aLcsJsHUeZqdi14vYNuUsSM8C0g,18267
-semantic_model_generator/relationships/__init__.py,sha256=HN6Opie25Oawt2fCDM_bZwRBVBEzqRsEXgDzYC7ytns,373
-semantic_model_generator/relationships/discovery.py,sha256=l_CixbfRvHBqxmLCmCq7bvQHRt3iUl0o5mui4R5LHXQ,5961
+semantic_model_generator/relationships/__init__.py,sha256=I9-_QJdp36nEllzKTGXi2aWbRjiXrrexQXUfB6mi3Ww,477
+semantic_model_generator/relationships/discovery.py,sha256=BdPHIvlE6yuaQv0ELWwQlq0qx0uX7fkoEMfuvK8wO60,12147
 semantic_model_generator/tests/clickzetta_connector_test.py,sha256=Fdx7jooNt1lslKB2Ub51wqOZ8OM0osgZiDDl3bV6riw,3086
-semantic_model_generator/tests/cte_utils_test.py,sha256=LdhWw_bHZDE1LyS2hBVy_VTNjLgodonesWaxw8jXpV4,17385
+semantic_model_generator/tests/cte_utils_test.py,sha256=_9GAJiOPGSagdWmQsoAEOOhEgsBY0LFlr_xtwrlgf4A,17561
 semantic_model_generator/tests/generate_model_classification_test.py,sha256=Amq29cmeKd0S7iVikJ60RFm9gpWaQv1TijXofp3J-lI,2275
 semantic_model_generator/tests/llm_enrichment_test.py,sha256=1avLrPWp7J7o_K3PKbI_PIvduM5Id21MmoL0JTeDTfs,15738
-semantic_model_generator/tests/relationship_discovery_test.py,sha256=SOuXCwbmSUgvZoOS2s5oGK1w0LW283M1hg--QlLaDVA,3490
+semantic_model_generator/tests/relationship_discovery_test.py,sha256=OvnK2jhWNFfHI31eeIEmclgaUoFjj_mZuDFAnjLMBpw,5411
 semantic_model_generator/tests/relationships_filters_test.py,sha256=bUm3r1UGaXca-hJOot7jMPz4It_TVsoddd-Xpk-76zM,10166
 semantic_model_generator/tests/samples/validate_yamls.py,sha256=262j-2i2oFZtTyK2susOrbxxE5eS-6IN-V0jFEOpt_w,156249
 semantic_model_generator/tests/utils_test.py,sha256=HWRXR45QYL1f6L8xsMppqLXzF9HAsrMwTMQIKpZrc_M,539
@@ -32,7 +32,7 @@ semantic_model_generator/validate/context_length.py,sha256=HL-GfaRXNcVji1-pAFGXG
 semantic_model_generator/validate/keywords.py,sha256=frZ5HjRXP69K6dYAU5_d86oSp40_3yoLUg1eQwU3oLM,7080
 semantic_model_generator/validate/schema.py,sha256=eL_wl5yscIeczwNBRUKhF_7QqWW2wSGimkgaOhMFsrA,5893
 semantic_model_generator/validate_model.py,sha256=Uq-V-GfPeF2Dy4l9uF5Guv104gDCDGh0Cxz1AJOu5dk,836
-clickzetta_semantic_model_generator-1.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-clickzetta_semantic_model_generator-1.0.3.dist-info/METADATA,sha256=A1kBc4PO_LEbIjWM-24jHnnV6NynmowuX5Jy91tlWBk,7816
-clickzetta_semantic_model_generator-1.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-clickzetta_semantic_model_generator-1.0.3.dist-info/RECORD,,
+clickzetta_semantic_model_generator-1.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+clickzetta_semantic_model_generator-1.0.4.dist-info/METADATA,sha256=zf4rBVSbisDDtZOnw5SoxGCRrO-PjfMQ66PinfYK3xg,7816
+clickzetta_semantic_model_generator-1.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+clickzetta_semantic_model_generator-1.0.4.dist-info/RECORD,,

semantic_model_generator/clickzetta_utils/clickzetta_connector.py CHANGED Viewed

@@ -11,7 +11,12 @@ from clickzetta.zettapark.session import Session
 from loguru import logger
 from semantic_model_generator.clickzetta_utils import env_vars
-from semantic_model_generator.clickzetta_utils.utils import create_session
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_session,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing.data_types import Column, Table
 ConnectionType = TypeVar("ConnectionType", bound=Session)
@@ -151,18 +156,8 @@ class ClickzettaConnectionProxy:
         self.session.close()
-def _quote_identifier(name: str) -> str:
-    return f'"{name}"'
 def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
-    return ".".join(
-        [
-            _quote_identifier(workspace),
-            _quote_identifier(schema_name),
-            _quote_identifier(table_name),
-        ]
-    )
+    return join_quoted_identifiers(workspace, schema_name, table_name)
 def _value_is_true(value: Any) -> bool:
@@ -175,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
 def _sanitize_identifier(value: Any, fallback: str = "") -> str:
-    if value is None or value == "":
+    normalized = normalize_identifier(value)
+    if not normalized:
         return fallback
-    normalized = str(value).strip()
-    if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
-        normalized = normalized[1:-1]
     return normalized
@@ -216,21 +209,19 @@ def _fetch_distinct_values(
     column_name: str,
     ndv: int,
 ) -> Optional[List[str]]:
-    workspace_part = (
-        _sanitize_identifier(workspace, workspace).upper() if workspace else ""
-    )
+    workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
     schema_part = (
-        _sanitize_identifier(schema_name, schema_name).upper() if schema_name else ""
+        _sanitize_identifier(schema_name, schema_name) if schema_name else ""
     )
-    table_part = _sanitize_identifier(table_name, table_name).upper()
-    column_part = _sanitize_identifier(column_name, column_name).upper()
+    table_part = _sanitize_identifier(table_name, table_name)
+    column_part = _sanitize_identifier(column_name, column_name)
-    qualified_parts = [
-        part for part in (workspace_part, schema_part, table_part) if part
-    ]
-    qualified_table = ".".join(qualified_parts)
+    qualified_table = join_quoted_identifiers(
+        workspace_part, schema_part, table_part
+    )
+    column_expr = quote_identifier(column_part)
-    query = f"SELECT DISTINCT {column_part} FROM {qualified_table} LIMIT {ndv}"
+    query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
     try:
         df = session.sql(query).to_pandas()
         if df.empty:
@@ -660,9 +651,8 @@ def fetch_tables_views_in_schema(
     try:
         if workspace_upper and schema_upper:
-            df = session.sql(
-                f"SHOW TABLES IN {workspace_upper}.{schema_upper}"
-            ).to_pandas()
+            scope = join_quoted_identifiers(workspace_upper, schema_upper)
+            df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
         else:
             df = session.sql("SHOW TABLES").to_pandas()
     except Exception as exc:  # pragma: no cover
@@ -738,11 +728,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
     queries: List[str] = []
     if schema:
-        queries.append(f"SHOW VOLUMES IN {workspace}.{schema}")
-        queries.append(f"SHOW STAGES IN SCHEMA {workspace}.{schema}")
+        scope = join_quoted_identifiers(workspace, schema)
+        if scope:
+            queries.append(f"SHOW VOLUMES IN {scope}")
+            queries.append(f"SHOW STAGES IN SCHEMA {scope}")
     else:
-        queries.append(f"SHOW VOLUMES IN {workspace}")
-        queries.append(f"SHOW STAGES IN DATABASE {workspace}")
+        workspace_identifier = quote_identifier(workspace)
+        if workspace_identifier:
+            queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
+            queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
     stage_names: List[str] = ["volume:user://~/semantic_models/"]
     seen: set[str] = set(stage_names)
@@ -899,7 +893,7 @@ def create_table_in_schema(
     columns_schema: Dict[str, str],
 ) -> bool:
     fields = ", ".join(
-        f"{_quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
+        f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
     )
     query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
     try:

semantic_model_generator/clickzetta_utils/utils.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from contextlib import contextmanager
-from typing import Dict, Iterable
+from typing import Any, Dict, Iterable
 from clickzetta.zettapark.session import Session
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
 }
+def normalize_identifier(value: Any) -> str:
+    """
+    Strips outer quotes/backticks and surrounding whitespace from an identifier.
+    Returns an empty string when the identifier is missing.
+    """
+    if value is None:
+        return ""
+    text = str(value).strip()
+    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
+        return text[1:-1]
+    return text
+def quote_identifier(value: Any) -> str:
+    """
+    Wraps an identifier in backticks, escaping embedded backticks as needed.
+    Returns an empty string if the identifier is missing.
+    """
+    normalized = normalize_identifier(value)
+    if not normalized:
+        return ""
+    escaped = normalized.replace("`", "``")
+    return f"`{escaped}`"
+def join_quoted_identifiers(*parts: Any) -> str:
+    """
+    Joins identifier parts with '.' and ensures each segment is backtick-quoted.
+    Empty segments are skipped.
+    """
+    quoted_parts = [
+        quote_identifier(part)
+        for part in parts
+        if normalize_identifier(part)
+    ]
+    return ".".join(part for part in quoted_parts if part)
 def create_fqn_table(fqn_str: str) -> FQNParts:
     """
     Splits a fully qualified table name into its ClickZetta components.
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> N
         ("schema", schema),
         ("vcluster", vcluster),
     ):
-        session.sql(f"USE {component.upper()} {value.upper()}")
+        identifier = quote_identifier(value)
+        session.sql(f"USE {component.upper()} {identifier}")
 def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:

semantic_model_generator/data_processing/cte_utils.py CHANGED Viewed

@@ -11,12 +11,34 @@ from sqlglot import Dialect
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     OBJECT_DATATYPES,
 )
+from semantic_model_generator.clickzetta_utils.utils import (
+    join_quoted_identifiers,
+    normalize_identifier,
+)
 from semantic_model_generator.protos import semantic_model_pb2
 _SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
 ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
 _LOGICAL_TABLE_PREFIX = "__"
+_SQLGLOT_QUOTE_CHAR = '"'
+def _prepare_sql_for_parsing(sql: str) -> str:
+    """
+    Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
+    """
+    return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
+def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
+    """
+    Renders a SQLGlot expression using ClickZetta dialect and rewrites identifiers with backticks.
+    """
+    rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
+    return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
 def is_logical_table(table_name: str) -> bool:
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
 def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
     """Returns fully qualified table name such as my_db.my_schema.my_table"""
-    fqn = table.table
-    if len(table.schema) > 0:
-        fqn = f"{table.schema}.{fqn}"
-    if len(table.database) > 0:
-        fqn = f"{table.database}.{fqn}"
-    return fqn  # type: ignore[no-any-return]
+    parts = [
+        normalize_identifier(component)
+        for component in (table.database, table.schema, table.table)
+        if component
+    ]
+    return join_quoted_identifiers(*parts)  # type: ignore[no-any-return]
 def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
@@ -156,8 +178,8 @@ def _generate_cte_for(
         cte = f"WITH {logical_table_name(table)} AS (\n"
         cte += "SELECT \n"
         cte += ",\n".join(expr_columns) + "\n"
-        cte += f"FROM {fully_qualified_table_name(table.base_table)}"
-        cte += ")"
+        cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
+        cte += ")\n"
         return cte
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
     str: The SQL statement in ClickZetta syntax.
     """
     try:
-        expression = sqlglot.parse_one(sql, dialect=ClickzettaDialect)
+        expression = sqlglot.parse_one(
+            _prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
+        )
     except Exception as e:
         raise ValueError(
             f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
         )
-    return expression.sql(dialect=ClickzettaDialect)
+    return _render_clickzetta_sql(expression)
 def generate_select(
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
     for cte in ctes:
         new_withs.append(
             sqlglot.parse_one(
-                cte, read=ClickzettaDialect, into=sqlglot.expressions.With
+                _prepare_sql_for_parsing(cte),
+                read=ClickzettaDialect,
+                into=sqlglot.expressions.With,
             )
         )
     # Step 3: Prefix the CTEs to the original query.
-    ast = sqlglot.parse_one(sql_query, read=ClickzettaDialect)
+    ast = sqlglot.parse_one(
+        _prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
+    )
     with_ = ast.args.get("with")
     # If the query doesn't have a WITH clause, then generate one.
     if with_ is None:
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
     else:
         new_ctes = [w.expressions[0] for w in new_withs]
         with_.set("expressions", new_ctes + with_.expressions)
-    return ast.sql(dialect=ClickzettaDialect, pretty=True)  # type: ignore [no-any-return]
+    return _render_clickzetta_sql(
+        ast, pretty=True
+    )  # type: ignore [no-any-return]
 def context_to_column_format(

semantic_model_generator/generate_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import math
 import os
 import re
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -17,7 +18,12 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
@@ -41,6 +47,14 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
 def _singularize(token: str) -> str:
@@ -90,6 +104,14 @@ def _identifier_tokens(
     return tokens
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
 def _sanitize_identifier_name(
     name: str, prefixes_to_drop: Optional[set[str]] = None
 ) -> str:
@@ -354,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL (without quoting) by stripping quotes and uppercasing.
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    if not name:
-        return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name without quoting.
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table) if part]
-    return ".".join(_format_sql_identifier(part) for part in parts if part)
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 def _levenshtein_distance(s1: str, s2: str) -> int:
@@ -977,6 +997,19 @@ def _calculate_relationship_confidence(
         confidence_score += name_confidence
+        generic_pair_count = sum(
+            1
+            for left_col, right_col in column_pairs
+            if _is_generic_identifier(left_col)
+            and _is_generic_identifier(right_col)
+        )
+        if generic_pair_count:
+            penalty = min(0.15 * generic_pair_count, 0.3)
+            confidence_score = max(confidence_score - penalty, 0.0)
+            reasoning_factors.append(
+                f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
+            )
         # Check for foreign key naming patterns
         fk_pattern_confidence = 0.0
         for left_col, right_col in column_pairs:
@@ -2326,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
@@ -2392,14 +2445,39 @@ def _infer_relationships(
     def _record_pair(
         left_table: str, right_table: str, left_col: str, right_col: str
     ) -> None:
+        nonlocal limit_reached
+        if limit_reached:
+            return
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            limit_reached = True
+            return
         key = (left_table, right_table)
         value = (left_col, right_col)
-        if value not in pairs.setdefault(key, []):
-            pairs[key].append(value)
+        bucket = pairs.setdefault(key, [])
+        if value not in bucket:
+            bucket.append(value)
+            if (
+                max_relationships is not None
+                and len(pairs) >= max_relationships
+            ):
+                status_dict["limited_by_max_relationships"] = True
+                limit_reached = True
     table_names = list(metadata.keys())
     for i in range(len(table_names)):
+        if limit_reached or status_dict["limited_by_timeout"]:
+            break
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
         for j in range(i + 1, len(table_names)):
+            if limit_reached or status_dict["limited_by_timeout"]:
+                break
+            if _timed_out():
+                status_dict["limited_by_timeout"] = True
+                break
             table_a_name = table_names[i]
             table_b_name = table_names[j]
             table_a = metadata[table_a_name]
@@ -2575,6 +2653,15 @@ def _infer_relationships(
     # Build relationships with inferred cardinality
     for (left_table, right_table), column_pairs in pairs.items():
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
+        if (
+            max_relationships is not None
+            and len(relationships) >= max_relationships
+        ):
+            status_dict["limited_by_max_relationships"] = True
+            break
         # Infer cardinality based on available metadata
         left_meta = metadata[left_table]
         right_meta = metadata[right_table]
@@ -2777,6 +2864,16 @@ def _infer_relationships(
             for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
                 logger.debug(f"  + {factor}")
+        if confidence_analysis["confidence_score"] < min_confidence:
+            logger.debug(
+                "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+                left_table,
+                right_table,
+                confidence_analysis["confidence_score"],
+                min_confidence,
+            )
+            continue
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2804,16 +2901,27 @@ def _infer_relationships(
         relationships.append(relationship)
     # Phase 2: Detect many-to-many relationships through bridge table analysis
-    many_to_many_relationships = _detect_many_to_many_relationships(
-        raw_tables, metadata, relationships
-    )
-    if many_to_many_relationships:
-        relationships.extend(many_to_many_relationships)
-        logger.info(
-            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+    many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+    if not status_dict["limited_by_timeout"] and (
+        max_relationships is None or len(relationships) < max_relationships
+    ):
+        many_to_many_relationships = _detect_many_to_many_relationships(
+            raw_tables, metadata, relationships
         )
+        if many_to_many_relationships and max_relationships is not None:
+            remaining = max_relationships - len(relationships)
+            if remaining <= 0:
+                many_to_many_relationships = []
+            else:
+                many_to_many_relationships = many_to_many_relationships[:remaining]
+        if many_to_many_relationships:
+            relationships.extend(many_to_many_relationships)
+            logger.info(
+                f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+            )
     logger.info(
         f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
     )

semantic_model_generator/relationships/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from .discovery import (
     RelationshipDiscoveryResult,
     RelationshipSummary,
     discover_relationships_from_schema,
+    discover_relationships_from_table_definitions,
     discover_relationships_from_tables,
 )
@@ -11,5 +12,6 @@ __all__ = [
     "RelationshipDiscoveryResult",
     "RelationshipSummary",
     "discover_relationships_from_schema",
+    "discover_relationships_from_table_definitions",
     "discover_relationships_from_tables",
 ]

semantic_model_generator/relationships/discovery.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import time
 from dataclasses import dataclass
-from typing import Any, Iterable, List, Optional, Sequence, Tuple
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
 import pandas as pd
 from loguru import logger
@@ -13,7 +13,7 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_valid_schemas_tables_columns_df,
 )
 from semantic_model_generator.data_processing import data_types
-from semantic_model_generator.data_processing.data_types import FQNParts, Table
+from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
 from semantic_model_generator.generate_model import (
     _DEFAULT_N_SAMPLE_VALUES_PER_COL,
     _infer_relationships,
@@ -34,6 +34,10 @@ class RelationshipSummary:
     total_columns: int
     total_relationships_found: int
     processing_time_ms: int
+    limited_by_timeout: bool = False
+    limited_by_max_relationships: bool = False
+    limited_by_table_cap: bool = False
+    notes: Optional[str] = None
 @dataclass
@@ -97,20 +101,125 @@ def _build_tables_from_dataframe(
     return tables
+def _tables_payload_to_raw_tables(
+    tables: Sequence[Mapping[str, Any]],
+    *,
+    default_workspace: str = "OFFLINE",
+    default_schema: str = "PUBLIC",
+) -> List[Tuple[FQNParts, Table]]:
+    raw_tables: List[Tuple[FQNParts, Table]] = []
+    for table_index, table_entry in enumerate(tables):
+        if not isinstance(table_entry, Mapping):
+            raise TypeError("Each table definition must be a mapping of table metadata")
+        table_name = str(
+            table_entry.get("table_name")
+            or table_entry.get("name")
+            or table_entry.get("table")
+            or ""
+        ).strip()
+        if not table_name:
+            raise ValueError("Table definition missing 'table_name'")
+        workspace = str(table_entry.get("workspace") or default_workspace).strip() or default_workspace
+        schema = str(
+            table_entry.get("schema")
+            or table_entry.get("schema_name")
+            or default_schema
+        ).strip() or default_schema
+        columns_payload = table_entry.get("columns")
+        if not isinstance(columns_payload, Sequence) or not columns_payload:
+            raise ValueError(
+                f"Table '{table_name}' must include a non-empty 'columns' list"
+            )
+        columns: List[Column] = []
+        for column_index, column_entry in enumerate(columns_payload):
+            if not isinstance(column_entry, Mapping):
+                raise TypeError(
+                    f"Column definition for table '{table_name}' must be a mapping"
+                )
+            column_name = str(
+                column_entry.get("name")
+                or column_entry.get("column_name")
+                or column_entry.get("field")
+                or ""
+            ).strip()
+            if not column_name:
+                raise ValueError(
+                    f"Column definition in table '{table_name}' missing 'name'"
+                )
+            column_type = str(
+                column_entry.get("type")
+                or column_entry.get("data_type")
+                or "STRING"
+            ).strip()
+            values = column_entry.get("sample_values") or column_entry.get("values")
+            if isinstance(values, Sequence) and not isinstance(values, (str, bytes)):
+                sample_values = [str(value) for value in values]
+            else:
+                sample_values = None
+            is_primary = bool(
+                column_entry.get("is_primary_key")
+                or column_entry.get("primary_key")
+                or column_entry.get("is_primary")
+            )
+            columns.append(
+                Column(
+                    id_=column_index,
+                    column_name=column_name,
+                    column_type=column_type,
+                    values=sample_values,
+                    comment=column_entry.get("comment"),
+                    is_primary_key=is_primary,
+                )
+            )
+        table_proto = Table(
+            id_=table_index,
+            name=table_name.upper(),
+            columns=columns,
+            comment=table_entry.get("comment"),
+        )
+        fqn = FQNParts(
+            database=workspace.upper(),
+            schema_name=schema.upper(),
+            table=table_name,
+        )
+        raw_tables.append((fqn, table_proto))
+    return raw_tables
 def _discover_relationships(
     raw_tables: List[Tuple[FQNParts, Table]],
     strict_join_inference: bool,
     session: Optional[Session],
-) -> List[semantic_model_pb2.Relationship]:
+    *,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = None,
+) -> Tuple[List[semantic_model_pb2.Relationship], Dict[str, bool]]:
     if not raw_tables:
-        return []
+        return [], {"limited_by_timeout": False, "limited_by_max_relationships": False}
+    status: Dict[str, bool] = {}
     relationships = _infer_relationships(
         raw_tables,
         session=session if strict_join_inference else None,
         strict_join_inference=strict_join_inference,
+        status=status,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
     )
-    return relationships
+    return relationships, status
 def discover_relationships_from_tables(
@@ -118,33 +227,86 @@ def discover_relationships_from_tables(
     *,
     strict_join_inference: bool = False,
     session: Optional[Session] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 30.0,
+    max_tables: Optional[int] = None,
 ) -> RelationshipDiscoveryResult:
     """
     Run relationship inference using pre-constructed table metadata.
     """
     start = time.perf_counter()
-    relationships = _discover_relationships(
-        list(tables),
+    raw_tables = list(tables)
+    limited_by_table_cap = False
+    notes: List[str] = []
+    if max_tables is not None and len(raw_tables) > max_tables:
+        limited_by_table_cap = True
+        notes.append(
+            f"Input contained {len(raw_tables)} tables; analysis limited to first {max_tables}."
+        )
+        raw_tables = raw_tables[:max_tables]
+    relationships, status = _discover_relationships(
+        raw_tables,
         strict_join_inference=strict_join_inference,
         session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
     )
     end = time.perf_counter()
-    all_columns = sum(len(table.columns) for _, table in tables)
+    all_columns = sum(len(table.columns) for _, table in raw_tables)
     summary = RelationshipSummary(
-        total_tables=len(tables),
+        total_tables=len(raw_tables),
         total_columns=all_columns,
         total_relationships_found=len(relationships),
         processing_time_ms=int((end - start) * 1000),
+        limited_by_timeout=status.get("limited_by_timeout", False),
+        limited_by_max_relationships=status.get("limited_by_max_relationships", False),
+        limited_by_table_cap=limited_by_table_cap,
+        notes=" ".join(notes) if notes else None,
     )
     return RelationshipDiscoveryResult(
         relationships=relationships,
-        tables=[table for _, table in tables],
+        tables=[table for _, table in raw_tables],
         summary=summary,
     )
+def discover_relationships_from_table_definitions(
+    table_definitions: Sequence[Mapping[str, Any]],
+    *,
+    default_workspace: str = "OFFLINE",
+    default_schema: str = "PUBLIC",
+    strict_join_inference: bool = False,
+    session: Optional[Session] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 15.0,
+    max_tables: Optional[int] = None,
+) -> RelationshipDiscoveryResult:
+    """Run relationship inference using raw table metadata dictionaries."""
+    raw_tables = _tables_payload_to_raw_tables(
+        table_definitions,
+        default_workspace=default_workspace,
+        default_schema=default_schema,
+    )
+    return discover_relationships_from_tables(
+        raw_tables,
+        strict_join_inference=strict_join_inference,
+        session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
+        max_tables=max_tables,
+    )
 def discover_relationships_from_schema(
     session: Session,
     workspace: str,
@@ -154,6 +316,10 @@ def discover_relationships_from_schema(
     sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
     strict_join_inference: bool = False,
     max_workers: int = DEFAULT_MAX_WORKERS,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.5,
+    timeout_seconds: Optional[float] = 30.0,
+    max_tables: Optional[int] = 60,
 ) -> RelationshipDiscoveryResult:
     """
     Discover table relationships for all tables in a ClickZetta schema.
@@ -199,4 +365,8 @@ def discover_relationships_from_schema(
         raw_tables,
         strict_join_inference=strict_join_inference,
         session=session,
+        max_relationships=max_relationships,
+        min_confidence=min_confidence,
+        timeout_seconds=timeout_seconds,
+        max_tables=max_tables,
     )

semantic_model_generator/tests/cte_utils_test.py CHANGED Viewed

@@ -5,6 +5,7 @@ import sqlglot
 from semantic_model_generator.data_processing.cte_utils import (
     ClickzettaDialect,
+    _prepare_sql_for_parsing,
     _enrich_column_in_expr_with_aggregation,
     _get_col_expr,
     _validate_col,
@@ -304,7 +305,7 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100"
+            "WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
         ]
         assert got == want
@@ -312,8 +313,8 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format_w_agg()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100",
-            "WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100",
+            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
+            "WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
         ]
         assert sorted(got) == sorted(want)
@@ -321,7 +322,7 @@ class SemanticModelTest(TestCase):
         col_format_tbl = get_test_table_col_format_w_agg_only()
         got = generate_select(col_format_tbl, 100)
         want = [
-            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db.sc.t1) SELECT * FROM __t1 LIMIT 100"
+            "WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
         ]
         assert sorted(got) == sorted(want)
@@ -437,21 +438,21 @@ class SemanticModelTest(TestCase):
         want = """WITH __t1 AS (SELECT
     d1_expr AS d1,
     d2_expr AS d2
-  FROM db.sc.t1
+  FROM `db`.`sc`.`t1`
 ), __t2 AS (
   SELECT
     td1_expr AS td1,
     m1_expr AS m1,
     m1_expr AS m2,
     m3_expr
-  FROM db.sc.t2
+  FROM `db`.`sc`.`t2`
 )
 SELECT
   *
 FROM __t2"""
-        assert sqlglot.parse_one(want, ClickzettaDialect) == sqlglot.parse_one(
-            got, ClickzettaDialect
-        )
+        assert sqlglot.parse_one(
+            _prepare_sql_for_parsing(want), ClickzettaDialect
+        ) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)
     def test_expand_all_logical_tables_as_ctes_with_column_renaming(self) -> None:
         ctx = semantic_model_pb2.SemanticModel(
@@ -465,12 +466,12 @@ FROM __t2"""
     clcks AS clicks,
     clcks,
     cst
-  FROM db.sc.t1
+  FROM `db`.`sc`.`t1`
 )
 SELECT
   *
 FROM __t1
         """
-        assert sqlglot.parse_one(want, ClickzettaDialect) == sqlglot.parse_one(
-            got, ClickzettaDialect
-        )
+        assert sqlglot.parse_one(
+            _prepare_sql_for_parsing(want), ClickzettaDialect
+        ) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)

semantic_model_generator/tests/relationship_discovery_test.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pandas as pd
 from semantic_model_generator.relationships.discovery import (
     discover_relationships_from_schema,
+    discover_relationships_from_table_definitions,
 )
@@ -112,3 +113,66 @@ def test_discover_relationships_from_schema_builds_relationships():
     right_tables = {rel.right_table for rel in result.relationships}
     assert "ORDERS" in left_tables
     assert "CUSTOMER" in right_tables
+def test_discover_relationships_from_table_definitions_allows_manual_metadata() -> None:
+    payload = [
+        {
+            "table_name": "orders",
+            "columns": [
+                {"name": "order_id", "type": "NUMBER", "is_primary_key": True},
+                {"name": "customer_id", "type": "NUMBER"},
+            ],
+        },
+        {
+            "table_name": "customers",
+            "columns": [
+                {"name": "customer_id", "type": "NUMBER", "is_primary_key": True},
+                {"name": "name", "type": "STRING"},
+            ],
+        },
+    ]
+    result = discover_relationships_from_table_definitions(
+        payload,
+        default_workspace="demo",
+        default_schema="sales",
+        max_relationships=5,
+        timeout_seconds=5.0,
+    )
+    assert result.summary.total_tables == 2
+    assert result.summary.total_relationships_found >= 1
+    assert not result.summary.limited_by_timeout
+    assert any(
+        rel.left_table == "ORDERS" and rel.right_table == "CUSTOMERS"
+        for rel in result.relationships
+    )
+def test_discover_relationships_from_table_definitions_filters_generic_ids() -> None:
+    payload = [
+        {
+            "table_name": "table_a",
+            "columns": [
+                {"name": "id", "type": "NUMBER", "is_primary_key": True},
+                {"name": "value", "type": "NUMBER"},
+            ],
+        },
+        {
+            "table_name": "table_b",
+            "columns": [
+                {"name": "id", "type": "NUMBER", "is_primary_key": True},
+                {"name": "value", "type": "NUMBER"},
+            ],
+        },
+    ]
+    result = discover_relationships_from_table_definitions(
+        payload,
+        min_confidence=0.6,
+        max_relationships=5,
+    )
+    assert result.summary.total_relationships_found == 0
+    assert not result.relationships

{clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/WHEEL RENAMED Viewed

File without changes

clickzetta-semantic-model-generator 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

clickzetta-semantic-model-generator 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl