clickzetta-semantic-model-generator 1.0.3__tar.gz → 1.0.5__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/PKG-INFO +1 -1
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/pyproject.toml +1 -1
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +63 -42
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/utils.py +44 -2
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils.py +43 -13
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/generate_model.py +126 -18
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/relationships/__init__.py +2 -0
- clickzetta_semantic_model_generator-1.0.5/semantic_model_generator/relationships/discovery.py +402 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/cte_utils_test.py +14 -13
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationship_discovery_test.py +108 -0
- clickzetta_semantic_model_generator-1.0.3/semantic_model_generator/relationships/discovery.py +0 -202
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/README.md +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/env_vars.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/data_types.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/proto_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/dashscope_client.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/enrichment.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/progress_tracker.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/output_models/.keep +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model.proto +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/clickzetta_connector_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/generate_model_classification_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/llm_enrichment_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationships_filters_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/validate_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/context_length.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/keywords.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/schema.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate_model.py +0 -0
pyproject.toml:

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clickzetta-semantic-model-generator"
-version = "1.0.3"
+version = "1.0.5"
 description = "Curate a Semantic Model for ClickZetta Lakehouse"
 authors = ["qililiang <qililiang@clickzetta.com>"]
 license = "Apache Software License; BSD License"
semantic_model_generator/clickzetta_utils/clickzetta_connector.py:

@@ -11,7 +11,12 @@ from clickzetta.zettapark.session import Session
 from loguru import logger
 
 from semantic_model_generator.clickzetta_utils import env_vars
-from semantic_model_generator.clickzetta_utils.utils import create_session
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_session,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing.data_types import Column, Table
 
 ConnectionType = TypeVar("ConnectionType", bound=Session)
@@ -151,18 +156,8 @@ class ClickzettaConnectionProxy:
         self.session.close()
 
 
-def _quote_identifier(name: str) -> str:
-    return f'"{name}"'
-
-
 def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
-    return ".".join(
-        [
-            _quote_identifier(workspace),
-            _quote_identifier(schema_name),
-            _quote_identifier(table_name),
-        ]
-    )
+    return join_quoted_identifiers(workspace, schema_name, table_name)
 
 
 def _value_is_true(value: Any) -> bool:
@@ -175,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
 
 
 def _sanitize_identifier(value: Any, fallback: str = "") -> str:
-    if value is None:
+    normalized = normalize_identifier(value)
+    if not normalized:
         return fallback
-    normalized = str(value).strip()
-    if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
-        normalized = normalized[1:-1]
     return normalized
 
 
@@ -216,21 +209,19 @@ def _fetch_distinct_values(
     column_name: str,
     ndv: int,
 ) -> Optional[List[str]]:
-    workspace_part = (
-        _sanitize_identifier(workspace, workspace).upper() if workspace else ""
-    )
+    workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
     schema_part = (
-        _sanitize_identifier(schema_name, schema_name)
+        _sanitize_identifier(schema_name, schema_name) if schema_name else ""
     )
-    table_part = _sanitize_identifier(table_name, table_name)
-    column_part = _sanitize_identifier(column_name, column_name)
+    table_part = _sanitize_identifier(table_name, table_name)
+    column_part = _sanitize_identifier(column_name, column_name)
 
-    …
-    …
-    …
-    …
+    qualified_table = join_quoted_identifiers(
+        workspace_part, schema_part, table_part
+    )
+    column_expr = quote_identifier(column_part)
 
-    query = f"SELECT DISTINCT {…
+    query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
     try:
         df = session.sql(query).to_pandas()
         if df.empty:
@@ -489,15 +480,30 @@ def _fetch_columns_via_show(
         return pd.DataFrame()
 
     rows: List[pd.DataFrame] = []
-    …
-    …
+    category = _catalog_category(session, workspace)
+    is_shared_catalog = category in {"SHARED", "EXTERNAL"}
+    catalog = workspace if is_shared_catalog else workspace.upper()
+    schema = (
+        table_schema or ""
+    )
+    if schema and not is_shared_catalog:
+        schema = schema.upper()
 
     for table_name in table_names:
         qualified_parts = [
-            part
+            part
+            for part in (
+                catalog,
+                schema,
+                table_name.upper() if not is_shared_catalog else table_name,
+            )
+            if part
         ]
         qualified_table = ".".join(qualified_parts)
-        …
+        if is_shared_catalog:
+            query = f"SHOW COLUMNS IN SHARE {qualified_table}"
+        else:
+            query = f"SHOW COLUMNS IN {qualified_table}"
         try:
             df = session.sql(query).to_pandas()
         except Exception as exc:
@@ -655,14 +661,25 @@ def fetch_tables_views_in_schema(
     parts = schema_name.split(".", maxsplit=1)
     workspace = parts[0]
     schema = parts[1] if len(parts) > 1 else ""
-    …
-    …
+    category = _catalog_category(session, workspace)
+    is_shared_catalog = category in {"SHARED", "EXTERNAL"}
+
+    workspace_token = workspace if is_shared_catalog else workspace.upper()
+    schema_token = schema if is_shared_catalog else schema.upper()
 
     try:
-        if …
-        …
-        …
-        …
+        if workspace_token and schema_token:
+            if is_shared_catalog:
+                scope = ".".join(
+                    part for part in (workspace_token, schema_token) if part
+                )
+                df = session.sql(f"SHOW TABLES IN SHARE {scope}").to_pandas()
+            else:
+                scope = join_quoted_identifiers(
+                    workspace_token,
+                    schema_token,
+                )
+                df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
         else:
             df = session.sql("SHOW TABLES").to_pandas()
     except Exception as exc:  # pragma: no cover
@@ -738,11 +755,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
 
     queries: List[str] = []
     if schema:
-        …
-        …
+        scope = join_quoted_identifiers(workspace, schema)
+        if scope:
+            queries.append(f"SHOW VOLUMES IN {scope}")
+            queries.append(f"SHOW STAGES IN SCHEMA {scope}")
     else:
-        …
-        …
+        workspace_identifier = quote_identifier(workspace)
+        if workspace_identifier:
+            queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
+            queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
 
     stage_names: List[str] = ["volume:user://~/semantic_models/"]
     seen: set[str] = set(stage_names)
@@ -899,7 +920,7 @@ def create_table_in_schema(
     columns_schema: Dict[str, str],
 ) -> bool:
     fields = ", ".join(
-        f"{…
+        f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
     )
     query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
     try:
semantic_model_generator/clickzetta_utils/utils.py:

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from contextlib import contextmanager
-from typing import Dict, Iterable
+from typing import Any, Dict, Iterable
 
 from clickzetta.zettapark.session import Session
 
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
 }
 
 
+def normalize_identifier(value: Any) -> str:
+    """
+    Strips outer quotes/backticks and surrounding whitespace from an identifier.
+    Returns an empty string when the identifier is missing.
+    """
+
+    if value is None:
+        return ""
+    text = str(value).strip()
+    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
+        return text[1:-1]
+    return text
+
+
+def quote_identifier(value: Any) -> str:
+    """
+    Wraps an identifier in backticks, escaping embedded backticks as needed.
+    Returns an empty string if the identifier is missing.
+    """
+
+    normalized = normalize_identifier(value)
+    if not normalized:
+        return ""
+    escaped = normalized.replace("`", "``")
+    return f"`{escaped}`"
+
+
+def join_quoted_identifiers(*parts: Any) -> str:
+    """
+    Joins identifier parts with '.' and ensures each segment is backtick-quoted.
+    Empty segments are skipped.
+    """
+
+    quoted_parts = [
+        quote_identifier(part)
+        for part in parts
+        if normalize_identifier(part)
+    ]
+    return ".".join(part for part in quoted_parts if part)
+
+
 def create_fqn_table(fqn_str: str) -> FQNParts:
     """
     Splits a fully qualified table name into its ClickZetta components.
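The three helpers above centralize identifier handling that was previously duplicated across modules. A minimal sketch of their expected behavior, copied from the definitions in this hunk so it can be run standalone:

```python
from typing import Any


def normalize_identifier(value: Any) -> str:
    # Strip whitespace, then one matching pair of outer quotes/backticks.
    if value is None:
        return ""
    text = str(value).strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', "`"}:
        return text[1:-1]
    return text


def quote_identifier(value: Any) -> str:
    # Backtick-quote, doubling any embedded backticks.
    normalized = normalize_identifier(value)
    if not normalized:
        return ""
    return f"`{normalized.replace('`', '``')}`"


def join_quoted_identifiers(*parts: Any) -> str:
    # Dot-join the non-empty segments, each backtick-quoted.
    return ".".join(
        quote_identifier(part) for part in parts if normalize_identifier(part)
    )


assert normalize_identifier('  "MY_SCHEMA"  ') == "MY_SCHEMA"
assert quote_identifier("my`table") == "`my``table`"
# Empty segments drop out, so a missing schema collapses cleanly:
assert join_quoted_identifiers("ws", "", "t") == "`ws`.`t`"
```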
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> None:
         ("schema", schema),
         ("vcluster", vcluster),
     ):
-        …
+        identifier = quote_identifier(value)
+        session.sql(f"USE {component.upper()} {identifier}")
 
 
 def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
semantic_model_generator/data_processing/cte_utils.py:

@@ -11,12 +11,34 @@ from sqlglot import Dialect
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     OBJECT_DATATYPES,
 )
+from semantic_model_generator.clickzetta_utils.utils import (
+    join_quoted_identifiers,
+    normalize_identifier,
+)
 from semantic_model_generator.protos import semantic_model_pb2
 
 _SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
 ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
 
 _LOGICAL_TABLE_PREFIX = "__"
+_SQLGLOT_QUOTE_CHAR = '"'
+
+
+def _prepare_sql_for_parsing(sql: str) -> str:
+    """
+    Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
+    """
+
+    return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
+
+
+def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
+    """
+    Renders a SQLGlot expression in the ClickZetta dialect and rewrites identifiers with backticks.
+    """
+
+    rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
+    return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
 
 
 def is_logical_table(table_name: str) -> bool:
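Because the resolved sqlglot dialect expects double-quoted identifiers, the two shims above translate between ClickZetta's backticks and the parser's quoting at the boundary. A small round-trip sketch of the same pattern (assuming sqlglot is installed; string literals containing quote characters would need more care than this plain text replace):

```python
import sqlglot
from sqlglot import Dialect

# The module builds its dialect key as "snow" + "flake" and resolves it here.
ClickzettaDialect = Dialect.get_or_raise("snowflake")


def prepare_sql_for_parsing(sql: str) -> str:
    # Backticks -> double quotes so the parser sees standard quoting.
    return sql.replace("`", '"')


def render_clickzetta_sql(expression: sqlglot.Expression, pretty: bool = False) -> str:
    # Render in the resolved dialect, then map double quotes back to backticks.
    return expression.sql(dialect=ClickzettaDialect, pretty=pretty).replace('"', "`")


sql = "SELECT `db`.`sch`.`tbl`.`col` FROM `db`.`sch`.`tbl`"
ast = sqlglot.parse_one(prepare_sql_for_parsing(sql), dialect=ClickzettaDialect)
print(render_clickzetta_sql(ast))
# Expected: SELECT `db`.`sch`.`tbl`.`col` FROM `db`.`sch`.`tbl`
```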
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
 
 def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
     """Returns fully qualified table name such as my_db.my_schema.my_table"""
-    …
-    …
-    …
-    …
-    …
-    return …
+    parts = [
+        normalize_identifier(component)
+        for component in (table.database, table.schema, table.table)
+        if component
+    ]
+    return join_quoted_identifiers(*parts)  # type: ignore[no-any-return]
 
 
 def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
@@ -156,8 +178,8 @@ def _generate_cte_for(
     cte = f"WITH {logical_table_name(table)} AS (\n"
     cte += "SELECT \n"
     cte += ",\n".join(expr_columns) + "\n"
-    cte += f"FROM {fully_qualified_table_name(table.base_table)}"
-    cte += ")"
+    cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
+    cte += ")\n"
     return cte
 
 
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
         str: The SQL statement in ClickZetta syntax.
     """
     try:
-        expression = sqlglot.parse_one(…
+        expression = sqlglot.parse_one(
+            _prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
+        )
     except Exception as e:
         raise ValueError(
             f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
         )
 
-    return expression
+    return _render_clickzetta_sql(expression)
 
 
 def generate_select(
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
     for cte in ctes:
         new_withs.append(
             sqlglot.parse_one(
-                cte,
+                _prepare_sql_for_parsing(cte),
+                read=ClickzettaDialect,
+                into=sqlglot.expressions.With,
             )
         )
 
     # Step 3: Prefix the CTEs to the original query.
-    ast = sqlglot.parse_one(…
+    ast = sqlglot.parse_one(
+        _prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
+    )
     with_ = ast.args.get("with")
     # If the query doesn't have a WITH clause, then generate one.
     if with_ is None:
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
     else:
         new_ctes = [w.expressions[0] for w in new_withs]
         with_.set("expressions", new_ctes + with_.expressions)
-    return …
+    return _render_clickzetta_sql(
+        ast, pretty=True
+    )  # type: ignore [no-any-return]
 
 
 def context_to_column_format(
semantic_model_generator/generate_model.py:

@@ -1,6 +1,7 @@
 import math
 import os
 import re
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -17,7 +18,12 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
@@ -41,6 +47,14 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
 
 
 def _singularize(token: str) -> str:
@@ -90,6 +104,14 @@ def _identifier_tokens(
     return tokens
 
 
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
+
+
 def _sanitize_identifier_name(
     name: str, prefixes_to_drop: Optional[set[str]] = None
 ) -> str:
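`_is_generic_identifier` flags column names whose every token is one of the placeholder words in `_GENERIC_IDENTIFIER_TOKENS`. The diff does not show `_identifier_tokens`, so the sketch below substitutes a plausible splitter to make the check runnable:

```python
import re

_GENERIC_IDENTIFIER_TOKENS = {"ID", "NAME", "CODE", "KEY", "VALUE", "NUMBER"}


def _identifier_tokens(name: str) -> list[str]:
    # Hypothetical stand-in for the real tokenizer: split on
    # non-alphanumeric boundaries such as underscores and dots.
    return [t for t in re.split(r"[^A-Za-z0-9]+", name) if t]


def _is_generic_identifier(name: str) -> bool:
    tokens = [token for token in _identifier_tokens(name) if token]
    if not tokens:
        return True
    normalized_tokens = {token.upper() for token in tokens}
    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)


assert _is_generic_identifier("ID")                # nothing but placeholder words
assert _is_generic_identifier("key_value")         # every token is generic
assert not _is_generic_identifier("customer_id")   # "CUSTOMER" anchors the name
```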
@@ -354,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL…
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    …
-    return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 
 
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name…
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table)…
-    return …
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 
 
 def _levenshtein_distance(s1: str, s2: str) -> int:
@@ -977,6 +997,19 @@ def _calculate_relationship_confidence(
 
     confidence_score += name_confidence
 
+    generic_pair_count = sum(
+        1
+        for left_col, right_col in column_pairs
+        if _is_generic_identifier(left_col)
+        and _is_generic_identifier(right_col)
+    )
+    if generic_pair_count:
+        penalty = min(0.15 * generic_pair_count, 0.3)
+        confidence_score = max(confidence_score - penalty, 0.0)
+        reasoning_factors.append(
+            f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
+        )
+
     # Check for foreign key naming patterns
     fk_pattern_confidence = 0.0
     for left_col, right_col in column_pairs:
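The penalty grows with the number of column pairs that are generic on both sides but is capped, so a join inferred purely from names like `ID = ID` can lose at most 0.3 of its confidence score:

```python
def generic_penalty(generic_pair_count: int) -> float:
    # Mirrors the hunk above: 0.15 per all-generic pair, capped at 0.3;
    # the adjusted score is then floored at 0.0.
    return min(0.15 * generic_pair_count, 0.3)


assert generic_penalty(1) == 0.15
assert generic_penalty(2) == 0.3   # cap reached at two pairs
assert generic_penalty(5) == 0.3   # never exceeds the cap
assert max(0.25 - generic_penalty(2), 0.0) == 0.0  # score cannot go negative
```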
@@ -2326,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
+
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
 
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
+
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
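The new budget is measured from function entry with a monotonic clock, and the closure is consulted before each unit of work rather than interrupting anything mid-flight. The pattern in isolation:

```python
import time
from typing import Callable, Optional


def make_timed_out(timeout_seconds: Optional[float]) -> Callable[[], bool]:
    # Same shape as the _timed_out closure above: None disables the deadline.
    start_time = time.perf_counter()

    def _timed_out() -> bool:
        return (
            timeout_seconds is not None
            and (time.perf_counter() - start_time) >= timeout_seconds
        )

    return _timed_out


timed_out = make_timed_out(0.05)
assert not timed_out()      # fresh budget
time.sleep(0.06)
assert timed_out()          # deadline passed; callers set the status flag and break
no_deadline = make_timed_out(None)
assert not no_deadline()    # a None timeout never trips
```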
@@ -2392,14 +2445,39 @@ def _infer_relationships(
     def _record_pair(
         left_table: str, right_table: str, left_col: str, right_col: str
     ) -> None:
+        nonlocal limit_reached
+        if limit_reached:
+            return
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            limit_reached = True
+            return
+
         key = (left_table, right_table)
         value = (left_col, right_col)
-        …
-        …
+        bucket = pairs.setdefault(key, [])
+        if value not in bucket:
+            bucket.append(value)
+        if (
+            max_relationships is not None
+            and len(pairs) >= max_relationships
+        ):
+            status_dict["limited_by_max_relationships"] = True
+            limit_reached = True
 
     table_names = list(metadata.keys())
     for i in range(len(table_names)):
+        if limit_reached or status_dict["limited_by_timeout"]:
+            break
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
         for j in range(i + 1, len(table_names)):
+            if limit_reached or status_dict["limited_by_timeout"]:
+                break
+            if _timed_out():
+                status_dict["limited_by_timeout"] = True
+                break
             table_a_name = table_names[i]
             table_b_name = table_names[j]
             table_a = metadata[table_a_name]
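`_record_pair` now deduplicates column pairs per table pair and trips the cap flag as soon as the number of candidate table pairs reaches `max_relationships`. A reduced sketch of that bookkeeping (timeout handling omitted; names and data are illustrative):

```python
from typing import Dict, List, Tuple

pairs: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
status_dict = {"limited_by_max_relationships": False}
max_relationships = 2
limit_reached = False


def record_pair(left_table: str, right_table: str, left_col: str, right_col: str) -> None:
    global limit_reached
    if limit_reached:
        return  # once a limit trips, later candidates are dropped
    bucket = pairs.setdefault((left_table, right_table), [])
    value = (left_col, right_col)
    if value not in bucket:
        bucket.append(value)
    if len(pairs) >= max_relationships:
        status_dict["limited_by_max_relationships"] = True
        limit_reached = True


record_pair("orders", "customers", "customer_id", "id")
record_pair("orders", "customers", "customer_id", "id")  # duplicate pair, deduplicated
record_pair("orders", "products", "product_id", "id")    # second table pair hits the cap
record_pair("orders", "stores", "store_id", "id")        # dropped: limit already reached
assert len(pairs) == 2
assert status_dict["limited_by_max_relationships"]
```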
@@ -2575,6 +2653,15 @@ def _infer_relationships(
 
     # Build relationships with inferred cardinality
     for (left_table, right_table), column_pairs in pairs.items():
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
+        if (
+            max_relationships is not None
+            and len(relationships) >= max_relationships
+        ):
+            status_dict["limited_by_max_relationships"] = True
+            break
         # Infer cardinality based on available metadata
         left_meta = metadata[left_table]
         right_meta = metadata[right_table]
@@ -2777,6 +2864,16 @@ def _infer_relationships(
         for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
             logger.debug(f"  + {factor}")
 
+        if confidence_analysis["confidence_score"] < min_confidence:
+            logger.debug(
+                "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+                left_table,
+                right_table,
+                confidence_analysis["confidence_score"],
+                min_confidence,
+            )
+            continue
+
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2804,16 +2901,27 @@ def _infer_relationships(
         relationships.append(relationship)
 
     # Phase 2: Detect many-to-many relationships through bridge table analysis
-    many_to_many_relationships = …
-    …
-    …
-    …
-    …
-    …
-    logger.info(
-        f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+    many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+    if not status_dict["limited_by_timeout"] and (
+        max_relationships is None or len(relationships) < max_relationships
+    ):
+        many_to_many_relationships = _detect_many_to_many_relationships(
+            raw_tables, metadata, relationships
     )
 
+    if many_to_many_relationships and max_relationships is not None:
+        remaining = max_relationships - len(relationships)
+        if remaining <= 0:
+            many_to_many_relationships = []
+        else:
+            many_to_many_relationships = many_to_many_relationships[:remaining]
+
+    if many_to_many_relationships:
+        relationships.extend(many_to_many_relationships)
+        logger.info(
+            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+        )
+
     logger.info(
         f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
     )
semantic_model_generator/relationships/__init__.py:

@@ -4,6 +4,7 @@ from .discovery import (
     RelationshipDiscoveryResult,
     RelationshipSummary,
     discover_relationships_from_schema,
+    discover_relationships_from_table_definitions,
     discover_relationships_from_tables,
 )
 
@@ -11,5 +12,6 @@ __all__ = [
     "RelationshipDiscoveryResult",
     "RelationshipSummary",
     "discover_relationships_from_schema",
+    "discover_relationships_from_table_definitions",
     "discover_relationships_from_tables",
 ]
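With the re-export and `__all__` entry in place, the new entry point is importable alongside the existing helpers; its signature lives in the rewritten discovery.py, which this summary does not expand:

```python
from semantic_model_generator.relationships import (
    RelationshipDiscoveryResult,
    discover_relationships_from_table_definitions,
    discover_relationships_from_tables,
)
```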