PyPI - clickzetta-semantic-model-generator - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

semantic_model_generator/llm/dashscope_client.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-import json
 import os
 from dataclasses import dataclass
 from http import HTTPStatus
@@ -8,6 +7,7 @@ from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse, urlunparse
 from loguru import logger
 try:
     import dashscope  # type: ignore
     from dashscope import Generation  # type: ignore
@@ -144,7 +144,9 @@ class DashscopeClient:
         output = getattr(response, "output", None)
         if not output or not hasattr(output, "choices"):
-            raise DashscopeError(f"DashScope response missing output choices: {response}")
+            raise DashscopeError(
+                f"DashScope response missing output choices: {response}"
+            )
         choices = getattr(output, "choices")
         if not choices:

semantic_model_generator/llm/enrichment.py CHANGED Viewed

@@ -11,7 +11,7 @@ from semantic_model_generator.data_processing import data_types
 from semantic_model_generator.protos import semantic_model_pb2
 from .dashscope_client import DashscopeClient, DashscopeError
-from .progress_tracker import EnrichmentProgressTracker, EnrichmentStage, ProgressUpdate
+from .progress_tracker import EnrichmentProgressTracker, EnrichmentStage
 if TYPE_CHECKING:  # pragma: no cover
     from clickzetta.zettapark.session import Session
@@ -65,24 +65,27 @@ def enrich_semantic_model(
     # Initialize progress tracking
     total_tables = len(model.tables)
-    total_model_steps = 3  # model description, model metrics, verified queries
     if progress_tracker:
         progress_tracker.update_progress(
             EnrichmentStage.TABLE_ENRICHMENT,
             0,
             total_tables,
-            message="Starting table enrichment"
+            message="Starting table enrichment",
         )
-    raw_lookup: Dict[str, data_types.Table] = {tbl.name.upper(): tbl for _, tbl in raw_tables}
+    raw_lookup: Dict[str, data_types.Table] = {
+        tbl.name.upper(): tbl for _, tbl in raw_tables
+    }
     metric_notes: List[str] = []
     # Process each table with progress tracking
     for table_index, table in enumerate(model.tables):
         raw_table = raw_lookup.get(table.name.upper())
         if not raw_table:
-            logger.debug("No raw metadata for table {}; skipping enrichment.", table.name)
+            logger.debug(
+                "No raw metadata for table {}; skipping enrichment.", table.name
+            )
             continue
         # Update progress for current table
@@ -92,11 +95,13 @@ def enrich_semantic_model(
                 table_index + 1,
                 total_tables,
                 table_name=table.name,
-                message=f"Enriching table {table.name}"
+                message=f"Enriching table {table.name}",
             )
         try:
-            payload = _serialize_table_prompt(table, raw_table, model.description, placeholder, custom_prompt)
+            payload = _serialize_table_prompt(
+                table, raw_table, model.description, placeholder, custom_prompt
+            )
             response = client.chat_completion(payload["messages"])
             enrichment = _parse_llm_response(response.content)
             if enrichment:
@@ -108,10 +113,15 @@ def enrich_semantic_model(
                 if (
                     model_description
                     and isinstance(model_description, str)
-                    and (model.description == placeholder or not model.description.strip())
+                    and (
+                        model.description == placeholder
+                        or not model.description.strip()
+                    )
                 ):
                     model.description = model_description.strip()
-        except DashscopeError as exc:  # pragma: no cover - network failures or remote errors
+        except (
+            DashscopeError
+        ) as exc:  # pragma: no cover - network failures or remote errors
             logger.warning("DashScope enrichment failed for {}: {}", table.name, exc)
         except Exception as exc:  # pragma: no cover - defensive guard
             logger.exception("Unexpected error enriching table {}: {}", table.name, exc)
@@ -121,7 +131,7 @@ def enrich_semantic_model(
             EnrichmentStage.MODEL_DESCRIPTION,
             0,
             1,
-            message="Generating model description"
+            message="Generating model description",
         )
     if model.description == placeholder or not model.description.strip():
@@ -132,7 +142,7 @@ def enrich_semantic_model(
             EnrichmentStage.MODEL_DESCRIPTION,
             1,
             1,
-            message="Model description generated"
+            message="Model description generated",
         )
     # Model metrics generation
@@ -141,7 +151,7 @@ def enrich_semantic_model(
             EnrichmentStage.MODEL_METRICS,
             0,
             1,
-            message="Generating model-level metrics"
+            message="Generating model-level metrics",
         )
     overview = _build_model_overview(model, raw_lookup, raw_tables)
@@ -154,10 +164,7 @@ def enrich_semantic_model(
     if progress_tracker:
         progress_tracker.update_progress(
-            EnrichmentStage.MODEL_METRICS,
-            1,
-            1,
-            message="Model metrics generated"
+            EnrichmentStage.MODEL_METRICS, 1, 1, message="Model metrics generated"
         )
     # Verified queries generation
@@ -166,7 +173,7 @@ def enrich_semantic_model(
             EnrichmentStage.VERIFIED_QUERIES,
             0,
             1,
-            message="Generating verified queries"
+            message="Generating verified queries",
         )
     try:
@@ -185,10 +192,7 @@ def enrich_semantic_model(
     if progress_tracker:
         progress_tracker.update_progress(
-            EnrichmentStage.VERIFIED_QUERIES,
-            1,
-            1,
-            message="Verified queries generated"
+            EnrichmentStage.VERIFIED_QUERIES, 1, 1, message="Verified queries generated"
         )
     if metric_notes:
@@ -248,7 +252,9 @@ def _serialize_table_prompt(
                 "name": nf.name,
                 "expr": nf.expr,
                 "has_description": bool(nf.description.strip()),
-                "has_synonyms": any(s.strip() and s != placeholder for s in nf.synonyms),
+                "has_synonyms": any(
+                    s.strip() and s != placeholder for s in nf.synonyms
+                ),
             }
             for nf in table.filters
         ],
@@ -270,14 +276,14 @@ def _serialize_table_prompt(
         "{\n"
         '  "table_description": "Orders fact table that captures the status and finances of each order",\n'
         '  "columns": [\n'
-        '    {\n'
+        "    {\n"
         '      "name": "O_TOTALPRICE",\n'
         '      "description": "Total order value including tax",\n'
         '      "synonyms": ["Order amount", "Order total"]\n'
         "    }\n"
         "  ],\n"
         '  "business_metrics": [\n'
-        '    {\n'
+        "    {\n"
         '      "name": "Gross merchandise value",\n'
         '      "source_columns": ["O_TOTALPRICE"],\n'
         '      "description": "Used to measure GMV derived from the total order price."\n'
@@ -306,7 +312,9 @@ def _parse_llm_response(content: str) -> Optional[Dict[str, object]]:
     try:
         data = json.loads(json_text)
     except json.JSONDecodeError as exc:
-        logger.warning("Unable to parse DashScope response as JSON: {} | raw={}", exc, content)
+        logger.warning(
+            "Unable to parse DashScope response as JSON: {} | raw={}", exc, content
+        )
         return None
     if not isinstance(data, dict):
         return None
@@ -368,7 +376,10 @@ def _apply_column_enrichment(
             continue
         description = entry.get("description")
-        if isinstance(description, str) and getattr(target, "description", "") == placeholder:
+        if (
+            isinstance(description, str)
+            and getattr(target, "description", "") == placeholder
+        ):
             target.description = description.strip()
         synonyms = entry.get("synonyms")
@@ -376,7 +387,9 @@ def _apply_column_enrichment(
             _apply_synonyms(target, synonyms, placeholder)
-def _apply_synonyms(target: object, synonyms: Sequence[object], placeholder: str) -> None:
+def _apply_synonyms(
+    target: object, synonyms: Sequence[object], placeholder: str
+) -> None:
     clean_synonyms: List[str] = []
     for item in synonyms:
         if isinstance(item, str):
@@ -386,7 +399,11 @@ def _apply_synonyms(target: object, synonyms: Sequence[object], placeholder: str
     if not clean_synonyms:
         return
-    existing = [syn for syn in getattr(target, "synonyms", []) if syn.strip() and syn != placeholder]
+    existing = [
+        syn
+        for syn in getattr(target, "synonyms", [])
+        if syn.strip() and syn != placeholder
+    ]
     merged = _deduplicate(existing + clean_synonyms)
     if hasattr(target, "synonyms"):
@@ -452,7 +469,11 @@ def _apply_filter_enrichment(
             target.description = description.strip()
         synonyms = entry.get("synonyms")
         if isinstance(synonyms, list):
-            clean_synonyms = [str(item).strip() for item in synonyms if isinstance(item, (str, int, float))]
+            clean_synonyms = [
+                str(item).strip()
+                for item in synonyms
+                if isinstance(item, (str, int, float))
+            ]
             if clean_synonyms:
                 del target.synonyms[:]
                 target.synonyms.extend(clean_synonyms)
@@ -483,7 +504,15 @@ _COUNT_KEYWORDS = (
     "headcount",
 )
 _DISTINCT_KEYWORDS = ("distinct", "unique", "deduplicated")
-_AVERAGE_KEYWORDS = ("average", "avg", "mean", "typical", "expected", "per order", "per customer")
+_AVERAGE_KEYWORDS = (
+    "average",
+    "avg",
+    "mean",
+    "typical",
+    "expected",
+    "per order",
+    "per customer",
+)
 _SUM_KEYWORDS = (
     "total",
     "sum",
@@ -610,7 +639,9 @@ def _apply_metric_enrichment(
     business_metrics: Sequence[object],
     placeholder: str,
 ) -> tuple[Optional[str], bool]:
-    column_type_map = {col.column_name.upper(): col.column_type for col in raw_table.columns}
+    column_type_map = {
+        col.column_name.upper(): col.column_type for col in raw_table.columns
+    }
     existing_names: set[str] = {metric.name for metric in table.metrics}
     notes: List[Dict[str, object]] = []
     metrics_added = False
@@ -634,8 +665,12 @@ def _apply_metric_enrichment(
                 continue
         metric_name = _sanitize_metric_name(name, existing_names)
-        aggregation, use_product = _derive_metric_intent(entry, resolved_sources, column_type_map)
-        expression = _build_metric_expression(resolved_sources, column_type_map, aggregation, use_product)
+        aggregation, use_product = _derive_metric_intent(
+            entry, resolved_sources, column_type_map
+        )
+        expression = _build_metric_expression(
+            resolved_sources, column_type_map, aggregation, use_product
+        )
         metric = table.metrics.add()
         metric.name = metric_name
@@ -643,7 +678,9 @@ def _apply_metric_enrichment(
         description = entry.get("description")
         metric.description = (
-            description.strip() if isinstance(description, str) and description.strip() else placeholder
+            description.strip()
+            if isinstance(description, str) and description.strip()
+            else placeholder
         )
         synonyms = entry.get("synonyms")
@@ -661,8 +698,16 @@ def _apply_metric_enrichment(
         notes.append(
             {
                 "name": name.strip(),
-                "source_columns": raw_sources if isinstance(raw_sources, list) and raw_sources else resolved_sources,
-                "description": description.strip() if isinstance(description, str) and description.strip() else "",
+                "source_columns": (
+                    raw_sources
+                    if isinstance(raw_sources, list) and raw_sources
+                    else resolved_sources
+                ),
+                "description": (
+                    description.strip()
+                    if isinstance(description, str) and description.strip()
+                    else ""
+                ),
             }
         )
         metrics_added = True
@@ -683,7 +728,9 @@ def _summarize_model_description(
     table_lines = []
     for table in model.tables:
         role = "fact" if table.facts or table.metrics else "dimension"
-        desc = table.description.strip() if table.description.strip() else "No description"
+        desc = (
+            table.description.strip() if table.description.strip() else "No description"
+        )
         metrics = ", ".join(metric.name for metric in table.metrics) or "None"
         table_lines.append(f"- {table.name} ({role}): {desc}. Metrics: {metrics}")
@@ -692,7 +739,8 @@ def _summarize_model_description(
         parts = [f"{rel.left_table} -> {rel.right_table}"]
         if rel.relationship_columns:
             columns = ", ".join(
-                f"{col.left_column}={col.right_column}" for col in rel.relationship_columns
+                f"{col.left_column}={col.right_column}"
+                for col in rel.relationship_columns
             )
             parts.append(f"on {columns}")
         relationship_lines.append(" ".join(parts))
@@ -754,8 +802,12 @@ def _build_model_overview(
             "name": table.name,
             "description": (table.description or "").strip(),
             "base_table": {
-                "database": table.base_table.database if table.HasField("base_table") else "",
-                "schema": table.base_table.schema if table.HasField("base_table") else "",
+                "database": (
+                    table.base_table.database if table.HasField("base_table") else ""
+                ),
+                "schema": (
+                    table.base_table.schema if table.HasField("base_table") else ""
+                ),
                 "table": table.base_table.table if table.HasField("base_table") else "",
             },
             "dimensions": [
@@ -833,7 +885,9 @@ def _build_model_overview(
                 "left_table": rel.left_table,
                 "right_table": rel.right_table,
                 "join_type": semantic_model_pb2.JoinType.Name(rel.join_type),
-                "relationship_type": semantic_model_pb2.RelationshipType.Name(rel.relationship_type),
+                "relationship_type": semantic_model_pb2.RelationshipType.Name(
+                    rel.relationship_type
+                ),
                 "columns": [
                     {"left_column": col.left_column, "right_column": col.right_column}
                     for col in rel.relationship_columns
@@ -859,8 +913,10 @@ def _generate_model_metrics(
     metrics_accessible = False
     # Step 1: Check if metrics attribute exists
-    if not hasattr(model, 'metrics'):
-        logger.warning("Model object missing 'metrics' attribute, skipping model-level metrics generation")
+    if not hasattr(model, "metrics"):
+        logger.warning(
+            "Model object missing 'metrics' attribute, skipping model-level metrics generation"
+        )
         return
     # Step 2: Test basic read access
@@ -888,14 +944,22 @@ def _generate_model_metrics(
                 logger.debug("Metrics field write access verified and cleaned up")
             else:
                 # Metric add appeared to succeed but count didn't change - something is wrong
-                logger.warning("Metrics field write access inconsistent (count: {} -> {}), skipping model-level metrics", current_count, new_count)
+                logger.warning(
+                    "Metrics field write access inconsistent (count: {} -> {}), skipping model-level metrics",
+                    current_count,
+                    new_count,
+                )
                 return
         except Exception as exc:
             logger.warning("Cannot write to model.metrics field: {}", str(exc))
             # Try to provide diagnostic information without causing more errors
             try:
-                logger.debug("Model type: {}, metrics type: {}", type(model).__name__, type(getattr(model, 'metrics', None)))
+                logger.debug(
+                    "Model type: {}, metrics type: {}",
+                    type(model).__name__,
+                    type(getattr(model, "metrics", None)),
+                )
             except Exception:
                 pass
             return
@@ -920,12 +984,12 @@ def _generate_model_metrics(
         "Design up to three model-level business metrics (KPIs) using the semantic model summary below.\n"
         "Return JSON with the structure:\n"
         "{\n"
-        "  \"model_metrics\": [\n"
+        '  "model_metrics": [\n'
         "    {\n"
-        "      \"name\": \"...\",\n"
-        "      \"expr\": \"SUM(FACT_SALES.total_amount)\",\n"
-        "      \"description\": \"...\",\n"
-        "      \"synonyms\": [\"...\"]\n"
+        '      "name": "...",\n'
+        '      "expr": "SUM(FACT_SALES.total_amount)",\n'
+        '      "description": "...",\n'
+        '      "synonyms": ["..."]\n'
         "    }\n"
         "  ]\n"
         "}\n"
@@ -953,7 +1017,10 @@ def _generate_model_metrics(
     entries = payload.get("model_metrics")
     if not isinstance(entries, list):
-        logger.debug("No model_metrics list found in LLM response: {}", payload.keys() if payload else "None")
+        logger.debug(
+            "No model_metrics list found in LLM response: {}",
+            payload.keys() if payload else "None",
+        )
         return
     logger.debug("Found {} model metrics entries to process", len(entries))
@@ -981,8 +1048,14 @@ def _generate_model_metrics(
         try:
             metric = model.metrics.add()
         except Exception as exc:
-            logger.warning("Failed to add model-level metric '{}' despite pre-check: {}", name, str(exc))
-            logger.info("Aborting model-level metrics generation due to unexpected field access failure")
+            logger.warning(
+                "Failed to add model-level metric '{}' despite pre-check: {}",
+                name,
+                str(exc),
+            )
+            logger.info(
+                "Aborting model-level metrics generation due to unexpected field access failure"
+            )
             return
         metric.name = _sanitize_metric_name(name, existing_names)
@@ -996,7 +1069,11 @@ def _generate_model_metrics(
         synonyms = entry.get("synonyms")
         if isinstance(synonyms, list):
-            clean_synonyms = [str(item).strip() for item in synonyms if isinstance(item, (str, int, float)) and str(item).strip()]
+            clean_synonyms = [
+                str(item).strip()
+                for item in synonyms
+                if isinstance(item, (str, int, float)) and str(item).strip()
+            ]
             if clean_synonyms:
                 metric.synonyms.extend(clean_synonyms)
@@ -1033,7 +1110,9 @@ def _generate_verified_queries(
     max_items: int = 3,
 ) -> None:
     if session is None:
-        logger.debug("Skipping verified query generation because no ClickZetta session was provided.")
+        logger.debug(
+            "Skipping verified query generation because no ClickZetta session was provided."
+        )
         return
     prompt_json = json.dumps(overview, ensure_ascii=False, indent=2)
@@ -1079,7 +1158,11 @@ def _generate_verified_queries(
             continue
         query_name = entry.get("name")
         if not isinstance(query_name, str) or not query_name.strip():
-            query_name = question if isinstance(question, str) and question.strip() else "Verified query"
+            query_name = (
+                question
+                if isinstance(question, str) and question.strip()
+                else "Verified query"
+            )
         normalized_sql = _ensure_limit_clause(sql)
         if normalized_sql.strip().lower() in existing_sql:
@@ -1088,7 +1171,11 @@ def _generate_verified_queries(
         try:
             session.sql(normalized_sql).to_pandas()
         except Exception as exc:  # pragma: no cover - ClickZetta query failed
-            logger.warning("Skipping verified query '{}' due to validation failure: {}", query_name, exc)
+            logger.warning(
+                "Skipping verified query '{}' due to validation failure: {}",
+                query_name,
+                exc,
+            )
             continue
         verified_query = model.verified_queries.add()

semantic_model_generator/llm/progress_tracker.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Progress tracking system for semantic model enrichment process.
 """
 from __future__ import annotations
 from dataclasses import dataclass
@@ -41,7 +42,9 @@ class EnrichmentProgressTracker:
     to UI components via callback functions.
     """
-    def __init__(self, progress_callback: Optional[Callable[[ProgressUpdate], None]] = None):
+    def __init__(
+        self, progress_callback: Optional[Callable[[ProgressUpdate], None]] = None
+    ):
         """
         Initialize the progress tracker.
@@ -53,11 +56,11 @@ class EnrichmentProgressTracker:
         # Weight distribution across stages (should sum to 1.0)
         self.stage_weights = {
-            EnrichmentStage.METADATA_FETCH: 0.05,      # 5% - Quick metadata collection
-            EnrichmentStage.TABLE_ENRICHMENT: 0.70,    # 70% - Most time-consuming (multiple LLM calls)
-            EnrichmentStage.MODEL_DESCRIPTION: 0.05,   # 5% - Single LLM call
-            EnrichmentStage.MODEL_METRICS: 0.10,       # 10% - Single LLM call
-            EnrichmentStage.VERIFIED_QUERIES: 0.10,    # 10% - Single LLM call + validation
+            EnrichmentStage.METADATA_FETCH: 0.05,  # 5% - Quick metadata collection
+            EnrichmentStage.TABLE_ENRICHMENT: 0.70,  # 70% - Most time-consuming (multiple LLM calls)
+            EnrichmentStage.MODEL_DESCRIPTION: 0.05,  # 5% - Single LLM call
+            EnrichmentStage.MODEL_METRICS: 0.10,  # 10% - Single LLM call
+            EnrichmentStage.VERIFIED_QUERIES: 0.10,  # 10% - Single LLM call + validation
         }
         # Track accumulated progress from completed stages
@@ -70,7 +73,7 @@ class EnrichmentProgressTracker:
         total: int,
         table_name: Optional[str] = None,
         message: str = "",
-        details: Optional[Dict[str, Any]] = None
+        details: Optional[Dict[str, Any]] = None,
     ) -> None:
         """
         Update progress for the current enrichment stage.
@@ -99,7 +102,7 @@ class EnrichmentProgressTracker:
             table_name=table_name,
             message=message,
             percentage=percentage,
-            details=details or {}
+            details=details or {},
         )
         # Send update via callback
@@ -116,10 +119,7 @@ class EnrichmentProgressTracker:
             self.completed_stage_progress += self.stage_weights[stage]
     def _calculate_overall_percentage(
-        self,
-        stage: EnrichmentStage,
-        current: int,
-        total: int
+        self, stage: EnrichmentStage, current: int, total: int
     ) -> float:
         """
         Calculate overall progress percentage across all stages.
@@ -153,7 +153,7 @@ class EnrichmentProgressTracker:
             stage=EnrichmentStage.COMPLETE,
             current=1,
             total=1,
-            message="Enrichment complete"
+            message="Enrichment complete",
         )
@@ -164,6 +164,7 @@ def create_ui_progress_callback() -> Callable[[ProgressUpdate], None]:
     Returns:
         Callback function that formats progress updates for UI display.
     """
     def callback(update: ProgressUpdate) -> None:
         """Format and display progress update in UI."""
         # Build progress message
@@ -173,7 +174,7 @@ def create_ui_progress_callback() -> Callable[[ProgressUpdate], None]:
             EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
             EnrichmentStage.MODEL_METRICS: "Generating model metrics",
             EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
-            EnrichmentStage.COMPLETE: "Complete"
+            EnrichmentStage.COMPLETE: "Complete",
         }
         stage_label = stage_labels.get(update.stage, update.stage.value)
@@ -195,4 +196,4 @@ def create_ui_progress_callback() -> Callable[[ProgressUpdate], None]:
         # For now, we'll use the existing progress callback mechanism
         print(f"[{update.percentage:.1f}%] {full_message}")
-    return callback
+    return callback

semantic_model_generator/relationships/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Public APIs for relationship discovery."""
+from .discovery import (
+    RelationshipDiscoveryResult,
+    RelationshipSummary,
+    discover_relationships_from_schema,
+    discover_relationships_from_tables,
+)
+__all__ = [
+    "RelationshipDiscoveryResult",
+    "RelationshipSummary",
+    "discover_relationships_from_schema",
+    "discover_relationships_from_tables",
+]

clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl