clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/METADATA +5 -5
- clickzetta_semantic_model_generator-1.0.4.dist-info/RECORD +38 -0
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py +100 -48
- semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
- semantic_model_generator/clickzetta_utils/utils.py +44 -2
- semantic_model_generator/data_processing/cte_utils.py +44 -14
- semantic_model_generator/generate_model.py +711 -239
- semantic_model_generator/llm/dashscope_client.py +4 -2
- semantic_model_generator/llm/enrichment.py +144 -57
- semantic_model_generator/llm/progress_tracker.py +16 -15
- semantic_model_generator/relationships/__init__.py +2 -0
- semantic_model_generator/relationships/discovery.py +181 -16
- semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
- semantic_model_generator/tests/cte_utils_test.py +15 -14
- semantic_model_generator/tests/generate_model_classification_test.py +12 -2
- semantic_model_generator/tests/llm_enrichment_test.py +152 -46
- semantic_model_generator/tests/relationship_discovery_test.py +70 -3
- semantic_model_generator/tests/relationships_filters_test.py +166 -30
- semantic_model_generator/tests/utils_test.py +1 -1
- semantic_model_generator/validate/keywords.py +453 -53
- semantic_model_generator/validate/schema.py +4 -2
- clickzetta_semantic_model_generator-1.0.2.dist-info/RECORD +0 -38
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/WHEEL +0 -0
semantic_model_generator/generate_model.py
@@ -1,6 +1,7 @@
+import math
 import os
 import re
-import
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -8,8 +9,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from clickzetta.zettapark.session import Session
 from loguru import logger
 
-from semantic_model_generator.data_processing import data_types, proto_utils
-from semantic_model_generator.protos import semantic_model_pb2
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     AUTOGEN_TOKEN,
     DIMENSION_DATATYPES,
@@ -19,15 +18,25 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import
-
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
+from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
     DashscopeSettings,
     enrich_semantic_model,
     get_dashscope_settings,
 )
-from semantic_model_generator.llm.progress_tracker import
+from semantic_model_generator.llm.progress_tracker import (
+    EnrichmentProgressTracker,
+    EnrichmentStage,
+)
+from semantic_model_generator.protos import semantic_model_pb2
+from semantic_model_generator.validate.context_length import validate_context_length
 from semantic_model_generator.validate.keywords import CZ_RESERVED_WORDS
 
 _PLACEHOLDER_COMMENT = " "
@@ -38,6 +47,15 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
+
 
 def _singularize(token: str) -> str:
     if token.endswith("IES") and len(token) > 3:
@@ -68,7 +86,9 @@ def _base_type_from_type(column_type: str) -> str:
     return token.split("(")[0]
 
 
-def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -> List[str]:
+def _identifier_tokens(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> List[str]:
     name = name.replace("-", "_")
     raw_tokens = re.split(r"[^0-9A-Za-z]+", name)
     tokens: List[str] = []
@@ -84,7 +104,17 @@ def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -
     return tokens
 
 
-def _sanitize_identifier_name(name: str, prefixes_to_drop: Optional[set[str]] = None) -> str:
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
+
+
+def _sanitize_identifier_name(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> str:
     if not name:
         return ""
 
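The newly added `_is_generic_identifier` guard (used further down to penalize low-information join keys) returns True when every token in a name comes from `_GENERIC_IDENTIFIER_TOKENS`. A minimal standalone sketch of the same idea — the trivial tokenizer here is an assumption; the real `_identifier_tokens` also singularizes tokens and can drop table prefixes:

```python
import re

GENERIC_TOKENS = {"ID", "NAME", "CODE", "KEY", "VALUE", "NUMBER"}


def is_generic_identifier(name: str) -> bool:
    # Split on any non-alphanumeric run, as _identifier_tokens does.
    tokens = [t for t in re.split(r"[^0-9A-Za-z]+", name) if t]
    if not tokens:
        return True  # empty names carry no information
    return {t.upper() for t in tokens}.issubset(GENERIC_TOKENS)


assert is_generic_identifier("ID")               # single generic token
assert is_generic_identifier("key_value")        # every token is generic
assert not is_generic_identifier("CUSTOMER_ID")  # CUSTOMER is specific
```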
@@ -271,7 +301,9 @@ def _looks_like_primary_key(table_name: str, column_name: str) -> bool:
         "PRIMARY_KEY",
     }
     for variant in variants:
-        direct_matches.update({f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"})
+        direct_matches.update(
+            {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
+        )
     if upper_name in direct_matches:
         return True
 
@@ -344,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    if not name:
-        return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 
 
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table)
-    return
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 
 
 def _levenshtein_distance(s1: str, s2: str) -> int:
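Both helpers now delegate to the shared utilities imported from `clickzetta_utils/utils.py` (which also changed in this release). A rough sketch of the assumed behavior — backtick-quoted parts joined with dots; the real `quote_identifier` and `join_quoted_identifiers` may handle escaping and casing differently:

```python
def quote_identifier(name: str) -> str:
    # Assumed behavior: strip stray backticks, then wrap in backticks.
    return "`" + str(name).replace("`", "").strip() + "`"


def join_quoted_identifiers(*parts: str) -> str:
    # Assumed behavior: quote each part and join with dots.
    return ".".join(quote_identifier(part) for part in parts)


print(join_quoted_identifiers("analytics", "public", "orders"))
# prints: `analytics`.`public`.`orders`
```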
@@ -368,7 +398,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
         return _levenshtein_distance(s2, s1)
     if len(s2) == 0:
         return len(s1)
-
+
     previous_row = range(len(s2) + 1)
     for i, c1 in enumerate(s1):
         current_row = [i + 1]
@@ -378,7 +408,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
             substitutions = previous_row[j] + (c1 != c2)
             current_row.append(min(insertions, deletions, substitutions))
         previous_row = current_row
-
+
     return previous_row[-1]
 
 
@@ -389,26 +419,26 @@ def _name_similarity(name1: str, name2: str) -> float:
     """
     if not name1 or not name2:
         return 0.0
-
+
     # Exact match
     if name1.upper() == name2.upper():
         return 1.0
-
+
     # Normalize names for comparison
     norm1 = name1.upper().replace("_", "").replace("-", "")
     norm2 = name2.upper().replace("_", "").replace("-", "")
-
+
     if norm1 == norm2:
         return 0.95
-
+
     # Calculate Levenshtein-based similarity
     max_len = max(len(norm1), len(norm2))
     if max_len == 0:
         return 0.0
-
+
     distance = _levenshtein_distance(norm1, norm2)
     similarity = 1.0 - (distance / max_len)
-
+
     return max(0.0, similarity)
 
 
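A worked example of the score `_name_similarity` produces when neither the exact-match (1.0) nor the normalized-match (0.95) shortcut fires. For "CUSTOMER_ID" vs "CUST_ID" the normalized forms are CUSTOMERID and CUSTID, so the edit distance is 4 and the score is 1 - 4/10 = 0.6:

```python
def levenshtein(s1: str, s2: str) -> int:
    # Same dynamic-programming recurrence as _levenshtein_distance above.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


norm1, norm2 = "CUSTOMERID", "CUSTID"  # "CUSTOMER_ID" vs "CUST_ID", normalized
similarity = 1.0 - levenshtein(norm1, norm2) / max(len(norm1), len(norm2))
assert abs(similarity - 0.6) < 1e-9  # distance 4 (drop O, M, E, R) over length 10
```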
@@ -427,17 +457,24 @@ def _analyze_composite_key_patterns(
         Dict with composite key analysis results
     """
     pk_candidates = table_meta.get("pk_candidates", {})
-    columns_meta = table_meta.get("columns", {})
 
     # Check if all relationship columns form a composite key
-    relationship_cols = [pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs]
+    relationship_cols = [
+        pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs
+    ]
 
     # Normalize column names for comparison
     global_prefixes = set()  # This should come from context but we'll handle it locally
-    table_prefixes = _table_prefixes(list(table_meta.get("columns", {}).keys())[0] if table_meta.get("columns") else "")
+    table_prefixes = _table_prefixes(
+        list(table_meta.get("columns", {}).keys())[0]
+        if table_meta.get("columns")
+        else ""
+    )
 
     normalized_rel_cols = [
-        _sanitize_identifier_name(col, prefixes_to_drop=global_prefixes | table_prefixes)
+        _sanitize_identifier_name(
+            col, prefixes_to_drop=global_prefixes | table_prefixes
+        )
         for col in relationship_cols
     ]
 
@@ -448,7 +485,9 @@ def _analyze_composite_key_patterns(
     analysis = {
         "is_composite_pk": pk_col_count > 1 and pk_col_count == total_pk_candidates,
         "partial_pk": pk_col_count > 0 and pk_col_count < total_pk_candidates,
-        "pk_coverage_ratio": pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0,
+        "pk_coverage_ratio": (
+            pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0
+        ),
         "relationship_column_count": len(relationship_cols),
         "pk_column_count": pk_col_count,
     }
@@ -457,7 +496,10 @@ def _analyze_composite_key_patterns(
     if len(relationship_cols) > 1:
         sequential_patterns = []
         for col in relationship_cols:
-            if any(
+            if any(
+                pattern in col.upper()
+                for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]
+            ):
                 sequential_patterns.append(col)
 
     analysis["sequential_id_pattern"] = len(sequential_patterns) >= 2
@@ -504,9 +546,12 @@ def _infer_composite_cardinality(
     # Rule 3: Composite key uniqueness analysis (if we have sufficient samples)
     MIN_SAMPLE_SIZE = 20  # Lower threshold for composite keys
 
-    if (
-
-
+    if (
+        left_values_all
+        and right_values_all
+        and len(left_values_all) >= MIN_SAMPLE_SIZE
+        and len(right_values_all) >= MIN_SAMPLE_SIZE
+    ):
 
         # Create composite keys by concatenating values
         left_composite_keys = []
@@ -515,10 +560,12 @@ def _infer_composite_cardinality(
         sample_size = min(len(left_values_all), len(right_values_all))
 
         for i in range(sample_size):
-            left_key = "|".join(
-
-
-
+            left_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in left_values_all
+            )
+            right_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in right_values_all
+            )
 
             if left_key and not _is_nullish(left_key):
                 left_composite_keys.append(left_key)
@@ -527,7 +574,9 @@ def _infer_composite_cardinality(
 
         if left_composite_keys and right_composite_keys:
             left_unique_ratio = len(set(left_composite_keys)) / len(left_composite_keys)
-            right_unique_ratio = len(set(right_composite_keys)) / len(right_composite_keys)
+            right_unique_ratio = len(set(right_composite_keys)) / len(
+                right_composite_keys
+            )
 
             # Lower threshold for composite key uniqueness
             if right_unique_ratio > 0.9:
@@ -561,6 +610,7 @@ def _infer_composite_cardinality(
         adaptive_thresholds=adaptive_thresholds,
     )
 
+
 def _detect_bridge_table_pattern(
     table_meta: Dict[str, Any],
     all_tables_meta: Dict[str, Dict[str, Any]],
@@ -606,7 +656,9 @@ def _detect_bridge_table_pattern(
         base_type = col_info.get("base_type", "")
 
         # Check if column looks like an ID/foreign key
-        if any(pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]):
+        if any(
+            pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]
+        ):
             id_columns.append(original_name)
 
         # Check if this could be a foreign key to another table
@@ -615,11 +667,13 @@ def _detect_bridge_table_pattern(
                 continue
 
             if _looks_like_foreign_key(table_name, other_table_name, original_name):
-                fk_like_columns.append(
-
-
-
-
+                fk_like_columns.append(
+                    {
+                        "column": original_name,
+                        "references_table": other_table_name,
+                        "confidence": 0.8,
+                    }
+                )
                 break
 
         # Check if column name contains the other table name
@@ -628,11 +682,13 @@ def _detect_bridge_table_pattern(
 
         for variant in other_variants:
             if variant in col_tokens:
-                fk_like_columns.append(
-
-
-
-
+                fk_like_columns.append(
+                    {
+                        "column": original_name,
+                        "references_table": other_table_name,
+                        "confidence": 0.6,
+                    }
+                )
                 break
         else:
             # Count descriptive/non-ID columns
@@ -680,8 +736,18 @@ def _detect_bridge_table_pattern(
     # Name-based heuristics
     table_upper = table_name.upper()
    bridge_keywords = {
-        "BRIDGE",
-        "
+        "BRIDGE",
+        "JUNCTION",
+        "LINK",
+        "ASSOC",
+        "ASSOCIATION",
+        "REL",
+        "RELATIONSHIP",
+        "MAP",
+        "MAPPING",
+        "XREF",
+        "CROSS_REF",
+        "CONNECTOR",
     }
 
     for keyword in bridge_keywords:
@@ -708,7 +774,9 @@ def _detect_bridge_table_pattern(
 
     is_bridge = confidence >= 0.6  # Threshold for bridge table classification
 
-    connected_tables = [
+    connected_tables = [
+        fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5
+    ]
 
     return {
         "is_bridge": is_bridge,
@@ -718,14 +786,14 @@ def _detect_bridge_table_pattern(
         "fk_ratio": fk_ratio,
         "id_ratio": id_ratio,
         "total_columns": total_columns,
-        "descriptive_columns": descriptive_columns
+        "descriptive_columns": descriptive_columns,
     }
 
 
 def _detect_many_to_many_relationships(
     raw_tables: List[tuple[data_types.FQNParts, data_types.Table]],
     metadata: Dict[str, Dict[str, Any]],
-    existing_relationships: List[semantic_model_pb2.Relationship]
+    existing_relationships: List[semantic_model_pb2.Relationship],
 ) -> List[semantic_model_pb2.Relationship]:
     """
     Detect many-to-many relationships through bridge table analysis.
@@ -746,7 +814,10 @@ def _detect_many_to_many_relationships(
     for table_name, table_meta in metadata.items():
         bridge_analysis = _detect_bridge_table_pattern(table_meta, metadata)
 
-        if bridge_analysis["is_bridge"] and len(bridge_analysis["connected_tables"]) >= 2:
+        if (
+            bridge_analysis["is_bridge"]
+            and len(bridge_analysis["connected_tables"]) >= 2
+        ):
            bridge_tables[table_name] = bridge_analysis
 
    logger.debug(
@@ -780,9 +851,15 @@ def _detect_many_to_many_relationships(
         right_fk_cols = []
 
         for fk_info in bridge_info["fk_like_columns"]:
-            if fk_info["references_table"] == left_table and fk_info["confidence"] >= 0.5:
+            if (
+                fk_info["references_table"] == left_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 left_fk_cols.append(fk_info["column"])
-            elif fk_info["references_table"] == right_table and fk_info["confidence"] >= 0.5:
+            elif (
+                fk_info["references_table"] == right_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 right_fk_cols.append(fk_info["column"])
 
         if not left_fk_cols or not right_fk_cols:
@@ -806,8 +883,12 @@ def _detect_many_to_many_relationships(
         # Use the first detected FK columns as a representative
         relationship.relationship_columns.append(
             semantic_model_pb2.RelationKey(
-                left_column=left_fk_cols[0],  # This is actually in the bridge table
-                right_column=right_fk_cols[0],  # This is also in the bridge table
+                left_column=left_fk_cols[
+                    0
+                ],  # This is actually in the bridge table
+                right_column=right_fk_cols[
+                    0
+                ],  # This is also in the bridge table
             )
         )
 
@@ -863,13 +944,19 @@ def _calculate_relationship_confidence(
         pk_confidence = 0.4
         confidence_score += pk_confidence
         if left_has_pk and right_has_pk:
-            reasoning_factors.append("Both sides have primary key metadata (very strong evidence)")
+            reasoning_factors.append(
+                "Both sides have primary key metadata (very strong evidence)"
+            )
             evidence_details["pk_evidence"] = "both_pk"
         elif right_has_pk:
-            reasoning_factors.append("Right side has primary key metadata (strong evidence)")
+            reasoning_factors.append(
+                "Right side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "right_pk"
         elif left_has_pk:
-            reasoning_factors.append("Left side has primary key metadata (strong evidence)")
+            reasoning_factors.append(
+                "Left side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "left_pk"
 
     # Factor 2: Name similarity and pattern matching
@@ -884,28 +971,53 @@ def _calculate_relationship_confidence(
 
     if avg_name_similarity >= 0.9:
         name_confidence = 0.25
-        reasoning_factors.append(f"Very high column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Very high column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.7:
         name_confidence = 0.2
-        reasoning_factors.append(f"High column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"High column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.5:
         name_confidence = 0.15
-        reasoning_factors.append(f"Moderate column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Moderate column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.3:
         name_confidence = 0.1
-        reasoning_factors.append(f"Low column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Low column name similarity ({avg_name_similarity:.2f})"
+        )
     else:
         name_confidence = 0.05
-        reasoning_factors.append(f"Very low column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Very low column name similarity ({avg_name_similarity:.2f})"
+        )
 
     confidence_score += name_confidence
 
+    generic_pair_count = sum(
+        1
+        for left_col, right_col in column_pairs
+        if _is_generic_identifier(left_col)
+        and _is_generic_identifier(right_col)
+    )
+    if generic_pair_count:
+        penalty = min(0.15 * generic_pair_count, 0.3)
+        confidence_score = max(confidence_score - penalty, 0.0)
+        reasoning_factors.append(
+            f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
+        )
+
     # Check for foreign key naming patterns
     fk_pattern_confidence = 0.0
     for left_col, right_col in column_pairs:
         if _looks_like_foreign_key(left_table, right_table, left_col):
             fk_pattern_confidence += 0.1
-            reasoning_factors.append(f"Column '{left_col}' follows FK naming pattern")
+            reasoning_factors.append(
+                f"Column '{left_col}' follows FK naming pattern"
+            )
 
     confidence_score += min(fk_pattern_confidence, 0.2)
 
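The new penalty is easy to trace by hand: each column pair whose names are generic on both sides subtracts 0.15, the total deduction is capped at 0.3, and the score never drops below 0. For example, a raw score of 0.55 with two such pairs lands at 0.25:

```python
confidence_score = 0.55
generic_pair_count = 2  # e.g. hypothetical pairs ("ID", "ID") and ("CODE", "KEY")
penalty = min(0.15 * generic_pair_count, 0.3)  # 0.30 -- the cap kicks in
confidence_score = max(confidence_score - penalty, 0.0)
assert abs(confidence_score - 0.25) < 1e-9
```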
@@ -927,29 +1039,45 @@ def _calculate_relationship_confidence(
 
         # Check if uniqueness pattern matches inferred cardinality
         left_card, right_card = cardinality_result
-        uniqueness_threshold = adaptive_thresholds.get("uniqueness_threshold", 0.95) if adaptive_thresholds else 0.95
+        uniqueness_threshold = (
+            adaptive_thresholds.get("uniqueness_threshold", 0.95)
+            if adaptive_thresholds
+            else 0.95
+        )
 
         cardinality_consistency = False
         if left_card == "1" and left_unique_ratio > uniqueness_threshold:
             cardinality_consistency = True
-        elif left_card in ("*", "+") and left_unique_ratio <= uniqueness_threshold:
+        elif (
+            left_card in ("*", "+")
+            and left_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = True
 
         if right_card == "1" and right_unique_ratio > uniqueness_threshold:
             cardinality_consistency = cardinality_consistency and True
-        elif right_card in ("*", "+") and right_unique_ratio <= uniqueness_threshold:
+        elif (
+            right_card in ("*", "+")
+            and right_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = cardinality_consistency and True
 
         if cardinality_consistency:
             uniqueness_confidence = 0.2
-            reasoning_factors.append("Sample uniqueness patterns support inferred cardinality")
+            reasoning_factors.append(
+                "Sample uniqueness patterns support inferred cardinality"
+            )
         else:
             uniqueness_confidence = 0.1
-            reasoning_factors.append("Sample uniqueness patterns partially support cardinality")
+            reasoning_factors.append(
+                "Sample uniqueness patterns partially support cardinality"
+            )
 
         confidence_score += uniqueness_confidence
     else:
-        reasoning_factors.append(f"Limited sample size ({sample_size}) reduces confidence")
+        reasoning_factors.append(
+            f"Limited sample size ({sample_size}) reduces confidence"
+        )
 
     # Factor 4: Data type compatibility
     if column_pairs and left_meta and right_meta:
@@ -992,15 +1120,21 @@ def _calculate_relationship_confidence(
     evidence_details["left_table_role"] = left_role
     evidence_details["right_table_role"] = right_role
 
-    relationship_context = _get_business_relationship_context(left_table, right_table, left_role, right_role)
+    relationship_context = _get_business_relationship_context(
+        left_table, right_table, left_role, right_role
+    )
     evidence_details["relationship_context"] = relationship_context
 
     if relationship_context in ["fact_to_dimension", "dimension_to_fact"]:
         role_confidence = 0.15
-        reasoning_factors.append(f"Strong business relationship pattern: {relationship_context}")
+        reasoning_factors.append(
+            f"Strong business relationship pattern: {relationship_context}"
+        )
     elif relationship_context in ["dimension_hierarchy", "bridge_relationship"]:
         role_confidence = 0.1
-        reasoning_factors.append(f"Valid business relationship pattern: {relationship_context}")
+        reasoning_factors.append(
+            f"Valid business relationship pattern: {relationship_context}"
+        )
     elif relationship_context == "fact_to_fact":
         role_confidence = 0.05
         reasoning_factors.append("Unusual but possible fact-to-fact relationship")
@@ -1013,7 +1147,9 @@ def _calculate_relationship_confidence(
     # Factor 6: Multiple column relationships (composite keys)
     if len(column_pairs) > 1:
         composite_confidence = 0.1
-        reasoning_factors.append(f"Multi-column relationship ({len(column_pairs)} columns) increases confidence")
+        reasoning_factors.append(
+            f"Multi-column relationship ({len(column_pairs)} columns) increases confidence"
+        )
         confidence_score += composite_confidence
 
     # Normalize confidence score to 0-1 range
@@ -1043,7 +1179,9 @@ def _calculate_relationship_confidence(
         "reasoning_factors": reasoning_factors,
         "evidence_details": evidence_details,
         "inferred_cardinality": f"{cardinality_result[0]}:{cardinality_result[1]}",
-        "join_type": "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER",
+        "join_type": (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        ),
         "column_count": len(column_pairs),
     }
@@ -1059,101 +1197,196 @@ def _get_domain_knowledge_patterns() -> Dict[str, Any]:
     # Common business entity patterns
     "business_entities": {
         "customer": {
-            "table_patterns": [
-
-
-
+            "table_patterns": [
+                "CUSTOMER",
+                "CUST",
+                "CLIENT",
+                "ACCOUNT_HOLDER",
+                "USER",
+                "MEMBER",
+            ],
+            "pk_patterns": [
+                "CUSTOMER_ID",
+                "CUST_ID",
+                "CLIENT_ID",
+                "USER_ID",
+                "MEMBER_ID",
+            ],
+            "typical_attributes": [
+                "NAME",
+                "EMAIL",
+                "PHONE",
+                "ADDRESS",
+                "STATUS",
+                "TYPE",
+                "SEGMENT",
+            ],
+            "role": "dimension",
         },
         "product": {
             "table_patterns": ["PRODUCT", "ITEM", "SKU", "INVENTORY", "CATALOG"],
             "pk_patterns": ["PRODUCT_ID", "ITEM_ID", "SKU", "PRODUCT_KEY"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "NAME",
+                "DESCRIPTION",
+                "CATEGORY",
+                "PRICE",
+                "BRAND",
+                "STATUS",
+            ],
+            "role": "dimension",
         },
         "order": {
             "table_patterns": ["ORDER", "TRANSACTION", "SALE", "PURCHASE"],
-            "pk_patterns": [
+            "pk_patterns": [
+                "ORDER_ID",
+                "TRANSACTION_ID",
+                "SALE_ID",
+                "ORDER_NUMBER",
+            ],
             "typical_attributes": ["DATE", "AMOUNT", "STATUS", "QUANTITY", "TOTAL"],
-            "role": "fact"
+            "role": "fact",
         },
         "date": {
             "table_patterns": ["DATE", "TIME", "CALENDAR", "DIM_DATE"],
             "pk_patterns": ["DATE_ID", "DATE_KEY", "TIME_ID"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "YEAR",
+                "MONTH",
+                "DAY",
+                "QUARTER",
+                "WEEK",
+                "WEEKDAY",
+            ],
+            "role": "dimension",
         },
         "location": {
-            "table_patterns": [
+            "table_patterns": [
+                "LOCATION",
+                "GEOGRAPHY",
+                "ADDRESS",
+                "REGION",
+                "TERRITORY",
+            ],
             "pk_patterns": ["LOCATION_ID", "GEO_ID", "ADDRESS_ID", "REGION_ID"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "COUNTRY",
+                "STATE",
+                "CITY",
+                "ZIP",
+                "LATITUDE",
+                "LONGITUDE",
+            ],
+            "role": "dimension",
         },
         "employee": {
             "table_patterns": ["EMPLOYEE", "STAFF", "WORKER", "PERSONNEL"],
             "pk_patterns": ["EMPLOYEE_ID", "STAFF_ID", "EMP_ID"],
-            "typical_attributes": [
-
-
+            "typical_attributes": [
+                "NAME",
+                "DEPARTMENT",
+                "TITLE",
+                "MANAGER",
+                "HIRE_DATE",
+            ],
+            "role": "dimension",
+        },
     },
-
     # Common relationship patterns in data warehouses
     "relationship_patterns": {
         "star_schema": {
             "pattern": "fact_to_dimension",
             "confidence_boost": 0.2,
-            "description": "Standard star schema fact-to-dimension relationship"
+            "description": "Standard star schema fact-to-dimension relationship",
         },
         "snowflake_schema": {
             "pattern": "dimension_hierarchy",
             "confidence_boost": 0.15,
-            "description": "Snowflake schema dimension hierarchy"
+            "description": "Snowflake schema dimension hierarchy",
         },
         "bridge_table": {
             "pattern": "many_to_many_via_bridge",
             "confidence_boost": 0.1,
-            "description": "Many-to-many relationship through bridge table"
+            "description": "Many-to-many relationship through bridge table",
         },
         "time_dimension": {
             "pattern": "temporal_relationship",
             "confidence_boost": 0.25,
-            "description": "Time-based relationship (very common in warehouses)"
-        }
+            "description": "Time-based relationship (very common in warehouses)",
+        },
     },
-
     # Known FK patterns that often appear in real data warehouses
     "common_fk_patterns": {
         "customer_references": [
-            "CUSTOMER_ID",
+            "CUSTOMER_ID",
+            "CUST_ID",
+            "CLIENT_ID",
+            "ACCOUNT_ID",
+            "USER_ID",
         ],
         "product_references": [
-            "PRODUCT_ID",
+            "PRODUCT_ID",
+            "ITEM_ID",
+            "SKU",
+            "PROD_ID",
+            "CATALOG_ID",
         ],
         "date_references": [
-            "DATE_ID",
-            "
+            "DATE_ID",
+            "ORDER_DATE_ID",
+            "SHIP_DATE_ID",
+            "CREATE_DATE_ID",
+            "TRANSACTION_DATE_ID",
+            "DATE_KEY",
        ],
        "location_references": [
-            "LOCATION_ID",
-            "
-
+            "LOCATION_ID",
+            "ADDRESS_ID",
+            "SHIP_TO_ID",
+            "BILL_TO_ID",
+            "WAREHOUSE_ID",
+            "STORE_ID",
+        ],
    },
-
    # Table naming conventions that indicate specific patterns
    "naming_conventions": {
        "fact_indicators": [
-            "FACT_",
-            "
+            "FACT_",
+            "FCT_",
+            "F_",
+            "SALES_",
+            "ORDERS_",
+            "TRANSACTIONS_",
+            "REVENUE_",
+            "METRICS_",
+            "EVENTS_",
+            "ACTIVITY_",
        ],
        "dimension_indicators": [
-            "DIM_",
+            "DIM_",
+            "D_",
+            "REF_",
+            "LKP_",
+            "LOOKUP_",
+            "MASTER_",
        ],
        "bridge_indicators": [
-            "BRG_",
+            "BRG_",
+            "BRIDGE_",
+            "XREF_",
+            "MAP_",
+            "ASSOC_",
+            "LINK_",
        ],
        "staging_indicators": [
-            "STG_",
-
-
+            "STG_",
+            "STAGING_",
+            "TMP_",
+            "TEMP_",
+            "RAW_",
+            "LANDING_",
+        ],
+    },
    }
 
 
@@ -1204,18 +1437,26 @@ def _apply_domain_knowledge(
         if entity_pair in common_pairs:
             boost = common_pairs[entity_pair]
             confidence_boost += boost
-            enhancement_factors.append(f"Recognized common business pattern: {entity_pair} (+{boost:.2f})")
+            enhancement_factors.append(
+                f"Recognized common business pattern: {entity_pair} (+{boost:.2f})"
+            )
         elif f"{right_entity}-{left_entity}" in common_pairs:
             boost = common_pairs[f"{right_entity}-{left_entity}"]
             confidence_boost += boost
-            enhancement_factors.append(
+            enhancement_factors.append(
+                f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})"
+            )
 
     # Factor 2: Check for standard FK naming patterns
     for left_col, right_col in column_pairs:
-        fk_pattern_match = _check_standard_fk_patterns(left_col, right_col, domain_patterns)
+        fk_pattern_match = _check_standard_fk_patterns(
+            left_col, right_col, domain_patterns
+        )
         if fk_pattern_match:
             confidence_boost += 0.15
-            enhancement_factors.append(f"Standard FK pattern detected: {fk_pattern_match}")
+            enhancement_factors.append(
+                f"Standard FK pattern detected: {fk_pattern_match}"
+            )
 
     # Factor 3: Table naming convention analysis
     left_convention = _identify_naming_convention(left_table, domain_patterns)
@@ -1223,8 +1464,9 @@ def _apply_domain_knowledge(
 
     if left_convention and right_convention:
         # Boost confidence for expected patterns
-        if (left_convention == "fact" and right_convention == "dimension") or
-
+        if (left_convention == "fact" and right_convention == "dimension") or (
+            left_convention == "dimension" and right_convention == "fact"
+        ):
             confidence_boost += 0.2
             enhancement_factors.append("Standard fact-dimension naming pattern (+0.20)")
         elif left_convention == "dimension" and right_convention == "dimension":
@@ -1237,12 +1479,20 @@ def _apply_domain_knowledge(
         enhancement_factors.append("Time dimension relationship (very common) (+0.20)")
 
     # Factor 5: Schema pattern recognition (star vs snowflake)
-    schema_pattern = _detect_schema_pattern(left_table, right_table, left_meta, right_meta, domain_patterns)
+    schema_pattern = _detect_schema_pattern(
+        left_table, right_table, left_meta, right_meta, domain_patterns
+    )
     if schema_pattern:
-        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern]["confidence_boost"]
+        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
+            "confidence_boost"
+        ]
         confidence_boost += pattern_boost
-        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
-
+        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
+            "description"
+        ]
+        enhancement_factors.append(
+            f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})"
+        )
 
     # Apply the boost but cap the final confidence at 1.0
     enhanced_confidence = min(current_confidence + confidence_boost, 1.0)
@@ -1259,7 +1509,9 @@ def _apply_domain_knowledge(
     }
 
 
-def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _identify_business_entity(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify what business entity a table represents."""
     table_upper = table_name.upper()
     business_entities = domain_patterns["business_entities"]
@@ -1274,13 +1526,18 @@ def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domai
         pk_candidates = table_meta.get("pk_candidates", {})
         for pk_pattern in entity_info["pk_patterns"]:
             for pk_norm in pk_candidates.keys():
-                if
+                if (
+                    pk_pattern.replace("_", "").upper()
+                    in pk_norm.replace("_", "").upper()
+                ):
                     return entity_type
 
     return None
 
 
-def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _check_standard_fk_patterns(
+    left_col: str, right_col: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Check if column pair matches standard FK patterns."""
     common_fks = domain_patterns["common_fk_patterns"]
 
@@ -1295,7 +1552,9 @@ def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns:
     return None
 
 
-def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _identify_naming_convention(
+    table_name: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify the naming convention used for a table."""
     table_upper = table_name.upper()
     naming_conventions = domain_patterns["naming_conventions"]
@@ -1308,7 +1567,9 @@ def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]
     return None
 
 
-def _is_time_dimension_pattern(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> bool:
+def _is_time_dimension_pattern(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> bool:
     """Check if table follows time dimension patterns."""
     table_upper = table_name.upper()
     time_patterns = domain_patterns["business_entities"]["date"]["table_patterns"]
@@ -1344,15 +1605,16 @@ def _detect_schema_pattern(
     right_table: str,
     left_meta: Dict[str, Any],
     right_meta: Dict[str, Any],
-    domain_patterns: Dict[str, Any]
+    domain_patterns: Dict[str, Any],
 ) -> Optional[str]:
     """Detect common schema patterns (star, snowflake, etc.)."""
     left_role = _detect_table_role(left_table, left_meta)
     right_role = _detect_table_role(right_table, right_meta)
 
     # Star schema pattern: fact table to dimension
-    if (left_role == "fact" and right_role == "dimension") or
-
+    if (left_role == "fact" and right_role == "dimension") or (
+        left_role == "dimension" and right_role == "fact"
+    ):
         return "star_schema"
 
     # Snowflake schema pattern: dimension to dimension
@@ -1360,8 +1622,9 @@ def _detect_schema_pattern(
         return "snowflake_schema"
 
     # Time dimension pattern (very common)
-    if _is_time_dimension_pattern(
-
+    if _is_time_dimension_pattern(
+        right_table, right_meta, domain_patterns
+    ) or _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
         return "time_dimension"
 
     # Bridge table pattern
@@ -1397,7 +1660,9 @@ def _calculate_adaptive_thresholds(
     # Calculate sample statistics
     sample_sizes = [len(vals) for vals in values_list if vals]
     max_sample_size = max(sample_sizes) if sample_sizes else base_sample_size
-    avg_sample_size = sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
+    avg_sample_size = (
+        sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
+    )
 
     # Calculate data distribution characteristics
     total_unique_values = 0
@@ -1425,7 +1690,7 @@ def _calculate_adaptive_thresholds(
         if len(value_counts) > 1:
             max_freq = max(value_counts.values())
             min_freq = min(value_counts.values())
-            skew = max_freq / min_freq if min_freq > 0 else float(
+            skew = max_freq / min_freq if min_freq > 0 else float("inf")
             skew_ratios.append(skew)
 
     # Calculate overall uniqueness ratio
@@ -1459,7 +1724,9 @@ def _calculate_adaptive_thresholds(
         min_size_adj *= 1.1
 
     # Scale with base sample size from configuration
-    size_scale_factor = min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
+    size_scale_factor = (
+        min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
+    )
     min_size_adj *= size_scale_factor
 
     thresholds["min_sample_size"] = max(int(base_min_size * min_size_adj), 10)
@@ -1594,8 +1861,12 @@ def _infer_cardinality(
     left_non_null = [v for v in left_values if not _is_nullish(v)]
     right_non_null = [v for v in right_values if not _is_nullish(v)]
 
-    left_unique_ratio = len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
-    right_unique_ratio = len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
+    left_unique_ratio = (
+        len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
+    )
+    right_unique_ratio = (
+        len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
+    )
 
     # Apply adaptive uniqueness threshold
     left_is_unique = left_unique_ratio > uniqueness_threshold
@@ -1691,11 +1962,19 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     Returns:
         str: Table role ('fact', 'dimension', 'bridge', 'staging', 'unknown')
     """
-    upper_name = table_name.upper()
     tokens = _identifier_tokens(table_name)
 
     # Rule 1: Explicit prefixes/suffixes
-    fact_indicators = {
+    fact_indicators = {
+        "FACT",
+        "FCT",
+        "TXN",
+        "TRANSACTION",
+        "EVENT",
+        "LOG",
+        "SALES",
+        "ORDER",
+    }
     dim_indicators = {"DIM", "DIMENSION", "LOOKUP", "REF", "REFERENCE", "MASTER"}
     bridge_indicators = {"BRIDGE", "BRG", "LINK", "JUNCTION", "ASSOC", "ASSOCIATION"}
     staging_indicators = {"STG", "STAGING", "TMP", "TEMP", "WORK", "LANDING", "RAW"}
@@ -1734,9 +2013,22 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
             id_count += 1
 
         # Count measure-like columns (amounts, counts, quantities)
-        if any(word in col_name for word in ["AMOUNT", "QTY", "QUANTITY", "COUNT", "TOTAL", "SUM", "AVG"]):
+        if any(
+            word in col_name
+            for word in [
+                "AMOUNT",
+                "QTY",
+                "QUANTITY",
+                "COUNT",
+                "TOTAL",
+                "SUM",
+                "AVG",
+            ]
+        ):
             measure_like_count += 1
-        elif base_type in MEASURE_DATATYPES and not col_info.get("is_identifier", False):
+        elif base_type in MEASURE_DATATYPES and not col_info.get(
+            "is_identifier", False
+        ):
             measure_like_count += 1
         else:
             dimension_like_count += 1
@@ -1761,7 +2053,9 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     return "unknown"
 
 
-def _get_business_relationship_context(left_table: str, right_table: str, left_role: str, right_role: str) -> str:
+def _get_business_relationship_context(
+    left_table: str, right_table: str, left_role: str, right_role: str
+) -> str:
     """
     Determine business relationship context between tables based on their roles.
 
@@ -1833,7 +2127,7 @@ def _infer_join_type(
     4. Naming pattern heuristics
     5. Conservative INNER JOIN default
     """
-
+
     # RULE 1: Default to INNER JOIN (most common and safest)
     default_join = semantic_model_pb2.JoinType.inner
 
@@ -1861,9 +2155,17 @@ def _infer_join_type(
     # Apply business rules based on relationship context
     if relationship_context == "fact_to_dimension":
         # Fact → Dimension: usually INNER, but check for optional dimensions
-        if any(
-
-
+        if any(
+            keyword in right_table.upper()
+            for keyword in [
+                "PROMO",
+                "PROMOTION",
+                "DISCOUNT",
+                "COUPON",
+                "OPTIONAL",
+                "SECONDARY",
+            ]
+        ):
             logger.debug(
                 f"Join type inference for {left_table} -> {right_table}: "
                 f"LEFT_OUTER (fact to optional dimension: {right_role})"
@@ -1907,11 +2209,19 @@ def _infer_join_type(
         return semantic_model_pb2.JoinType.left_outer
 
     # RULE 5: Naming pattern heuristics for optional relationships
-    left_upper = left_table.upper()
     right_upper = right_table.upper()
     optional_keywords = {
-        "OPTIONAL",
-        "
+        "OPTIONAL",
+        "ALTERNATE",
+        "SECONDARY",
+        "BACKUP",
+        "FALLBACK",
+        "PROMO",
+        "PROMOTION",
+        "DISCOUNT",
+        "COUPON",
+        "TEMP",
+        "TMP",
    }
 
    for keyword in optional_keywords:
@@ -1946,12 +2256,17 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
     """
     fk_upper = fk_column.strip().upper()
     pk_table_variants = _table_variants(pk_table)
-
+
     # Pattern 1: {table_name}_id or {table_name}_key
     for variant in pk_table_variants:
-        if fk_upper in {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}:
+        if fk_upper in {
+            f"{variant}_ID",
+            f"{variant}ID",
+            f"{variant}_KEY",
+            f"{variant}KEY",
+        }:
             return True
-
+
     # Pattern 2: Column ends with table name variants
     tokens = _identifier_tokens(fk_column)
     if len(tokens) >= 2:
@@ -1961,21 +2276,23 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
         tail = tokens[-1]
         if tail in {"ID", "KEY"}:
             return True
-
+
     # Pattern 3: Similar to primary key column but with FK table prefix
     # e.g., order_id in order_items table referencing orders.id
     fk_table_variants = _table_variants(fk_table)
     for fk_variant in fk_table_variants:
         if fk_upper.startswith(fk_variant):
-            remainder = fk_upper[len(fk_variant):].lstrip("_")
+            remainder = fk_upper[len(fk_variant) :].lstrip("_")
             for pk_variant in pk_table_variants:
                 if remainder.startswith(pk_variant):
                     return True
-
+
     return False
 
 
-def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.NamedFilter]:
+def _suggest_filters(
+    raw_table: data_types.Table,
+) -> List[semantic_model_pb2.NamedFilter]:
     suggestions: List[semantic_model_pb2.NamedFilter] = []
     for col in raw_table.columns:
         base_type = _base_type_from_type(col.column_type)
@@ -2011,12 +2328,20 @@ def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.Nam
         )
         is_textual = base_type in {"STRING", "TEXT", "VARCHAR", "CHAR", "CHARACTER"}
         is_boolean = base_type in {"BOOLEAN"}
-        is_categorical_numeric = base_type in {
-
-
-
-
-
+        is_categorical_numeric = base_type in {
+            "INT",
+            "INTEGER",
+            "NUMBER",
+            "SMALLINT",
+            "BIGINT",
+        } and any(upper_name.endswith(suffix) for suffix in categorical_suffixes)
+
+        if not is_identifier_like and (
+            is_textual or is_boolean or is_categorical_numeric
+        ):
+            formatted = [
+                _format_literal(val, base_type) for val in distinct_values[:5]
+            ]
             expr = f"{col.column_name} IN ({', '.join(formatted)})"
             suggestions.append(
                 semantic_model_pb2.NamedFilter(
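The reshaped condition still yields the same kind of suggestion: a non-identifier column that is textual, boolean, or a categorical-looking small integer gets an `IN` filter built from up to five sampled distinct values. A sketch with hypothetical inputs (`_format_literal` is assumed to quote string literals):

```python
# Hypothetical column: STATUS with three sampled distinct values.
column_name = "STATUS"
distinct_values = ["ACTIVE", "INACTIVE", "PENDING"]
formatted = ["'" + v + "'" for v in distinct_values[:5]]
expr = f"{column_name} IN ({', '.join(formatted)})"
print(expr)  # STATUS IN ('ACTIVE', 'INACTIVE', 'PENDING')
```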
@@ -2034,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
+
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
 
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
+
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
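A sketch of how a caller might drive the new keyword arguments; the call site is hypothetical, `raw_tables` stands for the usual `(FQNParts, Table)` pairs, and the semantics are read off the diff (the `status` dict is populated with `limited_by_*` flags when inference is cut short):

```python
status: Dict[str, bool] = {}
relationships = _infer_relationships(
    raw_tables,
    strict_join_inference=False,
    status=status,            # filled with limited_by_* flags
    max_relationships=50,     # stop after 50 candidate table pairs
    min_confidence=0.2,       # clamped into [0.0, 1.0] internally
    timeout_seconds=30.0,     # wall-clock budget via time.perf_counter()
)
if status["limited_by_timeout"] or status["limited_by_max_relationships"]:
    logger.warning("Relationship discovery was truncated; results may be partial.")
```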
@@ -2060,7 +2405,9 @@ def _infer_relationships(
|
|
2060
2405
|
table_prefixes = global_prefixes | _table_prefixes(raw_table.name)
|
2061
2406
|
for column in raw_table.columns:
|
2062
2407
|
base_type = _base_type_from_type(column.column_type)
|
2063
|
-
normalized = _sanitize_identifier_name(
|
2408
|
+
normalized = _sanitize_identifier_name(
|
2409
|
+
column.column_name, prefixes_to_drop=table_prefixes
|
2410
|
+
)
|
2064
2411
|
entry = columns_meta.setdefault(
|
2065
2412
|
normalized,
|
2066
2413
|
{
|
@@ -2075,7 +2422,9 @@ def _infer_relationships(
|
|
2075
2422
|
entry["names"].append(column.column_name)
|
2076
2423
|
if column.values:
|
2077
2424
|
entry["values"].extend(column.values)
|
2078
|
-
entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
|
2425
|
+
entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
|
2426
|
+
column.column_name, base_type
|
2427
|
+
)
|
2079
2428
|
is_primary = getattr(column, "is_primary_key", False)
|
2080
2429
|
if is_primary:
|
2081
2430
|
entry["is_primary"] = True
|
@@ -2093,15 +2442,42 @@ def _infer_relationships(
|
|
2093
2442
|
pairs: dict[tuple[str, str], List[tuple[str, str]]] = {}
|
2094
2443
|
null_check_cache: Dict[Tuple[str, str, str, str], bool] = {}
|
2095
2444
|
|
2096
|
-
def _record_pair(
|
2445
|
+
def _record_pair(
|
2446
|
+
left_table: str, right_table: str, left_col: str, right_col: str
|
2447
|
+
) -> None:
|
2448
|
+
nonlocal limit_reached
|
2449
|
+
if limit_reached:
|
2450
|
+
return
|
2451
|
+
if _timed_out():
|
2452
|
+
status_dict["limited_by_timeout"] = True
|
2453
|
+
limit_reached = True
|
2454
|
+
return
|
2455
|
+
|
2097
2456
|
key = (left_table, right_table)
|
2098
2457
|
value = (left_col, right_col)
|
2099
|
-
|
2100
|
-
|
2458
|
+
bucket = pairs.setdefault(key, [])
|
2459
|
+
if value not in bucket:
|
2460
|
+
bucket.append(value)
|
2461
|
+
if (
|
2462
|
+
max_relationships is not None
|
2463
|
+
and len(pairs) >= max_relationships
|
2464
|
+
):
|
2465
|
+
status_dict["limited_by_max_relationships"] = True
|
2466
|
+
limit_reached = True
|
2101
2467
|
|
2102
2468
|
table_names = list(metadata.keys())
|
2103
2469
|
for i in range(len(table_names)):
|
2470
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2471
|
+
break
|
2472
|
+
if _timed_out():
|
2473
|
+
status_dict["limited_by_timeout"] = True
|
2474
|
+
break
|
2104
2475
|
for j in range(i + 1, len(table_names)):
|
2476
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2477
|
+
break
|
2478
|
+
if _timed_out():
|
2479
|
+
status_dict["limited_by_timeout"] = True
|
2480
|
+
break
|
2105
2481
|
table_a_name = table_names[i]
|
2106
2482
|
table_b_name = table_names[j]
|
2107
2483
|
table_a = metadata[table_a_name]
|
@@ -2158,7 +2534,7 @@ def _infer_relationships(
|
|
2158
2534
|
continue
|
2159
2535
|
if norm_b == pk_norm:
|
2160
2536
|
continue
|
2161
|
-
|
2537
|
+
|
2162
2538
|
# Direct suffix match
|
2163
2539
|
if norm_b.endswith(pk_norm):
|
2164
2540
|
_record_pair(
|
@@ -2168,23 +2544,34 @@ def _infer_relationships(
|
|
2168
2544
|
pk_cols[0],
|
2169
2545
|
)
|
2170
2546
|
continue
|
2171
|
-
|
2547
|
+
|
2172
2548
|
# Enhanced: Check if column looks like a foreign key to this table
|
2173
|
-
if _looks_like_foreign_key(
|
2549
|
+
if _looks_like_foreign_key(
|
2550
|
+
table_b_name, table_a_name, meta_b["names"][0]
|
2551
|
+
):
|
2174
2552
|
# Additional check: name similarity with adaptive threshold
|
2175
2553
|
similarity = _name_similarity(norm_b, pk_norm)
|
2176
2554
|
# Calculate adaptive threshold for this relationship
|
2177
2555
|
all_sample_values = []
|
2178
|
-
for col_values in [
|
2556
|
+
for col_values in [
|
2557
|
+
pk_meta.get("values", []),
|
2558
|
+
meta_b.get("values", []),
|
2559
|
+
]:
|
2179
2560
|
if col_values:
|
2180
2561
|
all_sample_values.append(col_values)
|
2181
2562
|
|
2182
2563
|
adaptive_thresholds = _calculate_adaptive_thresholds(
|
2183
2564
|
all_sample_values,
|
2184
2565
|
table_count=len(raw_tables),
|
2185
|
-
base_sample_size=
|
2566
|
+
base_sample_size=(
|
2567
|
+
len(pk_meta.get("values", []))
|
2568
|
+
if pk_meta.get("values")
|
2569
|
+
else 10
|
2570
|
+
),
|
2571
|
+
)
|
2572
|
+
similarity_threshold = adaptive_thresholds.get(
|
2573
|
+
"similarity_threshold", 0.6
|
2186
2574
|
)
|
2187
|
-
similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
|
2188
2575
|
|
2189
2576
|
if similarity >= similarity_threshold:
|
2190
2577
|
_record_pair(
|
@@ -2204,7 +2591,7 @@ def _infer_relationships(
|
|
2204
2591
|
continue
|
2205
2592
|
if norm_a == pk_norm:
|
2206
2593
|
continue
|
2207
|
-
|
2594
|
+
|
2208
2595
|
# Direct suffix match
|
2209
2596
|
if norm_a.endswith(pk_norm):
|
2210
2597
|
_record_pair(
|
@@ -2214,23 +2601,34 @@ def _infer_relationships(
|
|
2214
2601
|
pk_cols[0],
|
2215
2602
|
)
|
2216
2603
|
continue
|
2217
|
-
|
2604
|
+
|
2218
2605
|
# Enhanced: Check if column looks like a foreign key to this table
|
2219
|
-
if _looks_like_foreign_key(
|
2606
|
+
if _looks_like_foreign_key(
|
2607
|
+
table_a_name, table_b_name, meta_a["names"][0]
|
2608
|
+
):
|
2220
2609
|
# Additional check: name similarity with adaptive threshold
|
2221
2610
|
similarity = _name_similarity(norm_a, pk_norm)
|
2222
2611
|
# Calculate adaptive threshold for this relationship
|
2223
2612
|
all_sample_values = []
|
2224
|
-
for col_values in [
|
2613
|
+
for col_values in [
|
2614
|
+
pk_meta.get("values", []),
|
2615
|
+
meta_a.get("values", []),
|
2616
|
+
]:
|
2225
2617
|
if col_values:
|
2226
2618
|
all_sample_values.append(col_values)
|
2227
2619
|
|
2228
2620
|
adaptive_thresholds = _calculate_adaptive_thresholds(
|
2229
2621
|
all_sample_values,
|
2230
2622
|
table_count=len(raw_tables),
|
2231
|
-
base_sample_size=
|
2623
|
+
base_sample_size=(
|
2624
|
+
len(pk_meta.get("values", []))
|
2625
|
+
if pk_meta.get("values")
|
2626
|
+
else 10
|
2627
|
+
),
|
2628
|
+
)
|
2629
|
+
similarity_threshold = adaptive_thresholds.get(
|
2630
|
+
"similarity_threshold", 0.6
|
2232
2631
|
)
|
2233
|
-
similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
|
2234
2632
|
|
2235
2633
|
if similarity >= similarity_threshold:
|
2236
2634
|
_record_pair(
|
@@ -2255,10 +2653,19 @@ def _infer_relationships(
|
|
2255
2653
|
|
2256
2654
|
# Build relationships with inferred cardinality
|
2257
2655
|
for (left_table, right_table), column_pairs in pairs.items():
|
2656
|
+
if _timed_out():
|
2657
|
+
status_dict["limited_by_timeout"] = True
|
2658
|
+
break
|
2659
|
+
if (
|
2660
|
+
max_relationships is not None
|
2661
|
+
and len(relationships) >= max_relationships
|
2662
|
+
):
|
2663
|
+
status_dict["limited_by_max_relationships"] = True
|
2664
|
+
break
|
2258
2665
|
# Infer cardinality based on available metadata
|
2259
2666
|
left_meta = metadata[left_table]
|
2260
2667
|
right_meta = metadata[right_table]
|
2261
|
-
|
2668
|
+
|
2262
2669
|
# Determine if tables have primary keys in the relationship
|
2263
2670
|
left_has_pk = any(
|
2264
2671
|
col_name in [pair[0] for pair in column_pairs]
|
@@ -2270,7 +2677,7 @@ def _infer_relationships(
|
|
2270
2677
|
for pk_list in right_meta["pk_candidates"].values()
|
2271
2678
|
for col_name in pk_list
|
2272
2679
|
)
|
2273
|
-
|
2680
|
+
|
2274
2681
|
# Enhanced: Get sample values for all columns in the relationship (for composite key analysis)
|
2275
2682
|
left_values_all = []
|
2276
2683
|
right_values_all = []
|
@@ -2279,12 +2686,11 @@ def _infer_relationships(
 
         for left_col, right_col in column_pairs:
             left_col_key = _sanitize_identifier_name(
-                left_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
+                left_col, prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
             )
             right_col_key = _sanitize_identifier_name(
                 right_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(right_table)
+                prefixes_to_drop=global_prefixes | _table_prefixes(right_table),
             )
 
             left_col_values = []
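Both lookups key column metadata by _sanitize_identifier_name, which, per the surrounding code, normalizes the identifier and strips global and per-table prefixes so that, say, ORD_CUSTOMER_ID on one table and customer_id on another land on the same key. The helper's body is not part of this diff; a rough, hypothetical sketch of such a normalization:

    # Illustrative normalization only; the package's actual rules may differ.
    from typing import Set

    def sanitize_identifier_name(name: str, prefixes_to_drop: Set[str]) -> str:
        key = name.strip().strip('"').lower()
        # Drop the longest matching prefix, if any ("ord_" from "ord_customer_id").
        for prefix in sorted(prefixes_to_drop, key=len, reverse=True):
            if key.startswith(prefix + "_"):
                key = key[len(prefix) + 1 :]
                break
        return key

    assert sanitize_identifier_name("ORD_CUSTOMER_ID", {"ord"}) == "customer_id"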
@@ -2293,7 +2699,9 @@ def _infer_relationships(
             if left_col_key in left_meta["columns"]:
                 left_col_values = left_meta["columns"][left_col_key].get("values") or []
             if right_col_key in right_meta["columns"]:
-                right_col_values =
+                right_col_values = (
+                    right_meta["columns"][right_col_key].get("values") or []
+                )
 
             left_values_all.append(left_col_values)
             right_values_all.append(right_col_values)
@@ -2322,7 +2730,7 @@ def _infer_relationships(
             right_has_pk,
             adaptive_thresholds=global_adaptive_thresholds,
         )
-
+
         # Determine if SQL null probe should be executed for stricter inference
         strict_fk_detected = False
         if strict_join_inference and session:
@@ -2352,7 +2760,7 @@ def _infer_relationships(
                 left_table_meta=left_meta,
                 right_table_meta=right_meta,
             )
-
+
         # Calculate confidence and reasoning for this relationship
         confidence_analysis = _calculate_relationship_confidence(
             left_table=left_table,
@@ -2376,45 +2784,54 @@ def _infer_relationships(
             column_pairs=column_pairs,
             left_meta=left_meta,
             right_meta=right_meta,
-            current_confidence=confidence_analysis[
+            current_confidence=confidence_analysis["confidence_score"],
         )
 
         # Update confidence analysis with domain knowledge
-        if domain_enhancement[
-            confidence_analysis[
-
+        if domain_enhancement["confidence_boost"] > 0:
+            confidence_analysis["confidence_score"] = min(
+                1.0,
+                confidence_analysis["confidence_score"]
+                + domain_enhancement["confidence_boost"],
+            )
 
         # Add domain knowledge factors to reasoning
-        for domain_factor in domain_enhancement[
-            confidence_analysis[
+        for domain_factor in domain_enhancement["domain_factors"]:
+            confidence_analysis["reasoning_factors"].append(
+                f"Domain knowledge: {domain_factor}"
+            )
 
         # Update confidence level based on new score
-        if confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
+        if confidence_analysis["confidence_score"] >= 0.8:
+            confidence_analysis["confidence_level"] = "very_high"
+            confidence_analysis["confidence_description"] = "Very High Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.6:
+            confidence_analysis["confidence_level"] = "high"
+            confidence_analysis["confidence_description"] = "High Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.4:
+            confidence_analysis["confidence_level"] = "medium"
+            confidence_analysis["confidence_description"] = "Medium Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.2:
+            confidence_analysis["confidence_level"] = "low"
+            confidence_analysis["confidence_description"] = "Low Confidence"
         else:
-            confidence_analysis[
-            confidence_analysis[
+            confidence_analysis["confidence_level"] = "very_low"
+            confidence_analysis["confidence_description"] = "Very Low Confidence"
 
         # Enhanced logging with confidence and reasoning
         sample_info = f"samples: L={len(left_values)}, R={len(right_values)}"
         pk_info = f"PKs: L={left_has_pk}, R={right_has_pk}"
-        join_type_name =
+        join_type_name = (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        )
         confidence_info = f"confidence: {confidence_analysis['confidence_score']:.2f} ({confidence_analysis['confidence_level']})"
 
         # Add domain knowledge info if applied
         domain_info = ""
-        if domain_enhancement[
-            domain_info =
+        if domain_enhancement["confidence_boost"] > 0:
+            domain_info = (
+                f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
+            )
 
         logger.info(
             f"Relationship inference for {left_table} -> {right_table}: "
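Most of this hunk is the previously intraline-truncated logic re-rendered by a formatter, but it makes the scoring scheme explicit: a domain-knowledge boost is added to the confidence score (clamped at 1.0), and the score is then bucketed into five levels at the 0.8/0.6/0.4/0.2 boundaries. The thresholds below are copied from the hunk; the standalone helpers themselves are a sketch, not functions in the package:

    # Bucketing with the exact thresholds from the diff; names are illustrative.
    from typing import Tuple

    def apply_domain_boost(score: float, boost: float) -> float:
        # Mirrors the hunk: boosts are additive but the score is capped at 1.0.
        return min(1.0, score + boost) if boost > 0 else score

    def bucket_confidence(score: float) -> Tuple[str, str]:
        if score >= 0.8:
            return "very_high", "Very High Confidence"
        elif score >= 0.6:
            return "high", "High Confidence"
        elif score >= 0.4:
            return "medium", "Medium Confidence"
        elif score >= 0.2:
            return "low", "Low Confidence"
        return "very_low", "Very Low Confidence"

    print(bucket_confidence(apply_domain_boost(0.55, 0.10)))  # ('high', 'High Confidence')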
@@ -2423,22 +2840,40 @@ def _infer_relationships(
         )
 
         # Log domain knowledge patterns if detected
-        domain_factors = [
+        domain_factors = [
+            f
+            for f in confidence_analysis["reasoning_factors"]
+            if f.startswith("Domain knowledge:")
+        ]
         if domain_factors:
-            logger.debug(
+            logger.debug(
+                f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}"
+            )
 
         # Log detailed reasoning for medium or lower confidence relationships
-        if confidence_analysis[
+        if confidence_analysis["confidence_score"] < 0.6:
             logger.debug(f"Confidence reasoning for {left_table} -> {right_table}:")
-            for factor in confidence_analysis[
+            for factor in confidence_analysis["reasoning_factors"]:
                 logger.debug(f"  - {factor}")
 
         # Log very high confidence relationships with their evidence
-        elif confidence_analysis[
-            logger.debug(
-
+        elif confidence_analysis["confidence_score"] >= 0.8:
+            logger.debug(
+                f"High confidence relationship {left_table} -> {right_table} based on:"
+            )
+            for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
                 logger.debug(f"  + {factor}")
-
+
+        if confidence_analysis["confidence_score"] < min_confidence:
+            logger.debug(
+                "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+                left_table,
+                right_table,
+                confidence_analysis["confidence_score"],
+                min_confidence,
+            )
+            continue
+
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2449,7 +2884,7 @@ def _infer_relationships(
         else:
             # Default to many_to_one for backward compatibility
             rel_type = semantic_model_pb2.RelationshipType.many_to_one
-
+
         relationship = semantic_model_pb2.Relationship(
             name=f"{left_table}_to_{right_table}",
             left_table=left_table,
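Only the first and last arms of the cardinality mapping are visible across these two hunks: "1"/"1" yields one_to_one, and the else arm defaults to many_to_one for backward compatibility. Restated compactly, with the elided middle arms filled in as assumptions (they are not shown in the diff):

    # The "1"/"1" and default arms are from the diff; the middle arms are assumed.
    def relationship_type(left_card: str, right_card: str) -> str:
        if left_card == "1" and right_card == "1":
            return "one_to_one"
        if left_card == "1" and right_card == "N":  # assumed branch
            return "one_to_many"
        if left_card == "N" and right_card == "1":  # assumed branch
            return "many_to_one"
        # Default to many_to_one for backward compatibility (as in the diff).
        return "many_to_one"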
@@ -2466,15 +2901,30 @@ def _infer_relationships(
         relationships.append(relationship)
 
     # Phase 2: Detect many-to-many relationships through bridge table analysis
-    many_to_many_relationships =
-
-
+    many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+    if not status_dict["limited_by_timeout"] and (
+        max_relationships is None or len(relationships) < max_relationships
+    ):
+        many_to_many_relationships = _detect_many_to_many_relationships(
+            raw_tables, metadata, relationships
+        )
 
-
-
-
+    if many_to_many_relationships and max_relationships is not None:
+        remaining = max_relationships - len(relationships)
+        if remaining <= 0:
+            many_to_many_relationships = []
+        else:
+            many_to_many_relationships = many_to_many_relationships[:remaining]
 
-
+    if many_to_many_relationships:
+        relationships.extend(many_to_many_relationships)
+        logger.info(
+            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+        )
+
+    logger.info(
+        f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
+    )
     return relationships
 
 
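Phase 2 now runs only when neither the timeout nor the relationship cap has already fired, and its output is trimmed to whatever budget remains before being merged. _detect_many_to_many_relationships is internal to the module; conceptually, a bridge (junction) table is one that carries many-to-one links into at least two other tables. A toy illustration of that shape, under heavily simplified assumptions:

    # Toy bridge-table detection: any table with many-to-one links into two or
    # more distinct tables is treated as evidence of an M:N pair between them.
    from collections import defaultdict
    from typing import Dict, List, Tuple

    def detect_many_to_many(
        relationships: List[Tuple[str, str]],  # (child_table, parent_table)
    ) -> List[Tuple[str, str]]:
        parents: Dict[str, set] = defaultdict(set)
        for child, parent in relationships:
            parents[child].add(parent)
        found = []
        for bridge, linked in parents.items():
            if len(linked) >= 2:
                a, b = sorted(linked)[:2]
                found.append((a, b))  # the bridge table implies an M:N pair
        return found

    print(detect_many_to_many([("order_items", "orders"), ("order_items", "products")]))
    # [('orders', 'products')]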
@@ -2512,7 +2962,14 @@ def _raw_table_to_semantic_context_table(
         base_type = _base_type_from_type(col.column_type)
         if _is_time_like_column(col):
             time_data_type = col.column_type
-            if time_data_type.split("(")[0].upper() in {
+            if time_data_type.split("(")[0].upper() in {
+                "STRING",
+                "VARCHAR",
+                "TEXT",
+                "CHAR",
+                "CHARACTER",
+                "NVARCHAR",
+            }:
                 time_data_type = "TIMESTAMP_NTZ"
             time_dimension_name = _safe_semantic_identifier(
                 col.column_name,
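The reflowed set literal makes the coercion rule readable: when a time-like column is declared with a textual type, the semantic model records it as TIMESTAMP_NTZ instead, with the base type taken as everything before the first parenthesis. That extraction step in isolation (the function name here is illustrative):

    # Base-type extraction as used in the hunk: "VARCHAR(255)" -> "VARCHAR".
    STRINGY_TYPES = {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}

    def coerce_time_type(column_type: str) -> str:
        base = column_type.split("(")[0].upper()
        return "TIMESTAMP_NTZ" if base in STRINGY_TYPES else column_type

    assert coerce_time_type("varchar(64)") == "TIMESTAMP_NTZ"
    assert coerce_time_type("DATE") == "DATE"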
@@ -2564,7 +3021,9 @@ def _raw_table_to_semantic_context_table(
                     data_type=col.column_type,
                     sample_values=col.values,
                     synonyms=[_PLACEHOLDER_COMMENT],
-                    description=
+                    description=(
+                        col.comment if col.comment else _PLACEHOLDER_COMMENT
+                    ),
                 )
             )
             continue
@@ -2685,7 +3144,9 @@ def raw_schema_to_semantic_context(
         unique_database_schema.append(fqn_databse_schema)
 
         logger.info(f"Pulling column information from {fqn_table}")
-        _notify(
+        _notify(
+            f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}..."
+        )
         valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
             session=conn,
             workspace=fqn_table.database,
@@ -2751,7 +3212,9 @@ def raw_schema_to_semantic_context(
             semantic_model_name,
             actual_model,
         )
-        _notify(
+        _notify(
+            "Running DashScope enrichment to enhance descriptions and metrics..."
+        )
 
         # Create progress tracker for enrichment
         def enrichment_progress_callback(update):
@@ -2760,14 +3223,16 @@ def raw_schema_to_semantic_context(
                 EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
                 EnrichmentStage.MODEL_METRICS: "Generating model-level metrics",
                 EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
-                EnrichmentStage.COMPLETE: "Enrichment complete"
+                EnrichmentStage.COMPLETE: "Enrichment complete",
             }
 
             base_message = stage_messages.get(update.stage, "Processing")
             if update.table_name:
                 message = f"{base_message} - {update.table_name} ({update.current_step}/{update.total_steps})"
             elif update.total_steps > 1:
-                message =
+                message = (
+                    f"{base_message} ({update.current_step}/{update.total_steps})"
+                )
             else:
                 message = base_message
 
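The only functional change in this hunk is the trailing comma after the COMPLETE entry; the rest is line wrapping. The message logic itself is simple enough to restate standalone, with a simplified dataclass standing in for the package's progress-update type:

    # Simplified stand-in for the enrichment progress update object.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Update:
        stage: str
        table_name: Optional[str]
        current_step: int
        total_steps: int

    STAGE_MESSAGES = {"tables": "Enriching table metadata", "complete": "Enrichment complete"}

    def format_progress(update: Update) -> str:
        base = STAGE_MESSAGES.get(update.stage, "Processing")
        if update.table_name:
            return f"{base} - {update.table_name} ({update.current_step}/{update.total_steps})"
        if update.total_steps > 1:
            return f"{base} ({update.current_step}/{update.total_steps})"
        return base

    print(format_progress(Update("tables", "ORDERS", 2, 5)))
    # Enriching table metadata - ORDERS (2/5)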
@@ -2801,7 +3266,9 @@ def raw_schema_to_semantic_context(
         )
         _notify("DashScope enrichment complete.")
     else:
-        logger.warning(
+        logger.warning(
+            "LLM enrichment was requested but DashScope is not configured; skipping enrichment."
+        )
         _notify("DashScope configuration missing; skipped enrichment.")
     return context
 
@@ -2938,6 +3405,7 @@ def generate_model_str_from_clickzetta(
     Returns:
         str: The raw string of the semantic context.
     """
+
     def _notify(message: str) -> None:
         if progress_callback:
             try:
@@ -2946,7 +3414,11 @@ def generate_model_str_from_clickzetta(
                 logger.debug("Progress callback failed for message: {}", message)
 
     table_list = ", ".join(base_tables)
-    logger.info(
+    logger.info(
+        "Generating semantic model '{}' from tables: {}",
+        semantic_model_name,
+        table_list,
+    )
     _notify("Collecting metadata from ClickZetta tables...")
 
     context = raw_schema_to_semantic_context(
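Two small patterns recur in these last hunks: _notify swallows exceptions from the caller-supplied progress callback so a faulty UI hook cannot abort generation, and logger.info("... {} ...", a, b) uses loguru's deferred {} formatting instead of an f-string. A combined sketch of both, assuming loguru is available as it is in the package:

    # Guarded progress callback plus loguru-style deferred formatting.
    from typing import Callable, Optional

    from loguru import logger

    def make_notify(progress_callback: Optional[Callable[[str], None]]):
        def _notify(message: str) -> None:
            if progress_callback:
                try:
                    progress_callback(message)
                except Exception:
                    # Never let a UI callback break model generation.
                    logger.debug("Progress callback failed for message: {}", message)

        return _notify

    notify = make_notify(print)
    logger.info("Generating semantic model '{}' from tables: {}", "demo", "orders, users")
    notify("Collecting metadata from ClickZetta tables...")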