clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (21)
  1. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
  2. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +21 -21
  3. semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
  4. semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. semantic_model_generator/data_processing/cte_utils.py +1 -1
  6. semantic_model_generator/generate_model.py +588 -224
  7. semantic_model_generator/llm/dashscope_client.py +4 -2
  8. semantic_model_generator/llm/enrichment.py +144 -57
  9. semantic_model_generator/llm/progress_tracker.py +16 -15
  10. semantic_model_generator/relationships/discovery.py +1 -6
  11. semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  12. semantic_model_generator/tests/cte_utils_test.py +1 -1
  13. semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  14. semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  15. semantic_model_generator/tests/relationship_discovery_test.py +6 -3
  16. semantic_model_generator/tests/relationships_filters_test.py +166 -30
  17. semantic_model_generator/tests/utils_test.py +1 -1
  18. semantic_model_generator/validate/keywords.py +453 -53
  19. semantic_model_generator/validate/schema.py +4 -2
  20. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
  21. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
semantic_model_generator/generate_model.py
@@ -1,6 +1,6 @@
1
+ import math
1
2
  import os
2
3
  import re
3
- import math
4
4
  from collections import defaultdict
5
5
  from datetime import datetime
6
6
  from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -8,8 +8,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
8
8
  from clickzetta.zettapark.session import Session
9
9
  from loguru import logger
10
10
 
11
- from semantic_model_generator.data_processing import data_types, proto_utils
12
- from semantic_model_generator.protos import semantic_model_pb2
13
11
  from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
14
12
  AUTOGEN_TOKEN,
15
13
  DIMENSION_DATATYPES,
@@ -20,14 +18,19 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
20
18
  get_valid_schemas_tables_columns_df,
21
19
  )
22
20
  from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
23
- from semantic_model_generator.validate.context_length import validate_context_length
21
+ from semantic_model_generator.data_processing import data_types, proto_utils
24
22
  from semantic_model_generator.llm import (
25
23
  DashscopeClient,
26
24
  DashscopeSettings,
27
25
  enrich_semantic_model,
28
26
  get_dashscope_settings,
29
27
  )
30
- from semantic_model_generator.llm.progress_tracker import EnrichmentProgressTracker, EnrichmentStage
28
+ from semantic_model_generator.llm.progress_tracker import (
29
+ EnrichmentProgressTracker,
30
+ EnrichmentStage,
31
+ )
32
+ from semantic_model_generator.protos import semantic_model_pb2
33
+ from semantic_model_generator.validate.context_length import validate_context_length
31
34
  from semantic_model_generator.validate.keywords import CZ_RESERVED_WORDS
32
35
 
33
36
  _PLACEHOLDER_COMMENT = " "
@@ -39,6 +42,7 @@ _AUTOGEN_COMMENT_TOKEN = (
39
42
  _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
40
43
  _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
41
44
 
45
+
42
46
  def _singularize(token: str) -> str:
43
47
  if token.endswith("IES") and len(token) > 3:
44
48
  return token[:-3] + "Y"
@@ -68,7 +72,9 @@ def _base_type_from_type(column_type: str) -> str:
68
72
  return token.split("(")[0]
69
73
 
70
74
 
71
- def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -> List[str]:
75
+ def _identifier_tokens(
76
+ name: str, prefixes_to_drop: Optional[set[str]] = None
77
+ ) -> List[str]:
72
78
  name = name.replace("-", "_")
73
79
  raw_tokens = re.split(r"[^0-9A-Za-z]+", name)
74
80
  tokens: List[str] = []
@@ -84,7 +90,9 @@ def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -
84
90
  return tokens
85
91
 
86
92
 
87
- def _sanitize_identifier_name(name: str, prefixes_to_drop: Optional[set[str]] = None) -> str:
93
+ def _sanitize_identifier_name(
94
+ name: str, prefixes_to_drop: Optional[set[str]] = None
95
+ ) -> str:
88
96
  if not name:
89
97
  return ""
90
98
 
@@ -271,7 +279,9 @@ def _looks_like_primary_key(table_name: str, column_name: str) -> bool:
271
279
  "PRIMARY_KEY",
272
280
  }
273
281
  for variant in variants:
274
- direct_matches.update({f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"})
282
+ direct_matches.update(
283
+ {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
284
+ )
275
285
  if upper_name in direct_matches:
276
286
  return True
277
287
 
@@ -368,7 +378,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
368
378
  return _levenshtein_distance(s2, s1)
369
379
  if len(s2) == 0:
370
380
  return len(s1)
371
-
381
+
372
382
  previous_row = range(len(s2) + 1)
373
383
  for i, c1 in enumerate(s1):
374
384
  current_row = [i + 1]
@@ -378,7 +388,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
378
388
  substitutions = previous_row[j] + (c1 != c2)
379
389
  current_row.append(min(insertions, deletions, substitutions))
380
390
  previous_row = current_row
381
-
391
+
382
392
  return previous_row[-1]
383
393
 
384
394
 
@@ -389,26 +399,26 @@ def _name_similarity(name1: str, name2: str) -> float:
389
399
  """
390
400
  if not name1 or not name2:
391
401
  return 0.0
392
-
402
+
393
403
  # Exact match
394
404
  if name1.upper() == name2.upper():
395
405
  return 1.0
396
-
406
+
397
407
  # Normalize names for comparison
398
408
  norm1 = name1.upper().replace("_", "").replace("-", "")
399
409
  norm2 = name2.upper().replace("_", "").replace("-", "")
400
-
410
+
401
411
  if norm1 == norm2:
402
412
  return 0.95
403
-
413
+
404
414
  # Calculate Levenshtein-based similarity
405
415
  max_len = max(len(norm1), len(norm2))
406
416
  if max_len == 0:
407
417
  return 0.0
408
-
418
+
409
419
  distance = _levenshtein_distance(norm1, norm2)
410
420
  similarity = 1.0 - (distance / max_len)
411
-
421
+
412
422
  return max(0.0, similarity)
413
423
 
414
424
 
@@ -427,17 +437,24 @@ def _analyze_composite_key_patterns(
427
437
  Dict with composite key analysis results
428
438
  """
429
439
  pk_candidates = table_meta.get("pk_candidates", {})
430
- columns_meta = table_meta.get("columns", {})
431
440
 
432
441
  # Check if all relationship columns form a composite key
433
- relationship_cols = [pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs]
442
+ relationship_cols = [
443
+ pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs
444
+ ]
434
445
 
435
446
  # Normalize column names for comparison
436
447
  global_prefixes = set() # This should come from context but we'll handle it locally
437
- table_prefixes = _table_prefixes(list(table_meta.get("columns", {}).keys())[0] if table_meta.get("columns") else "")
448
+ table_prefixes = _table_prefixes(
449
+ list(table_meta.get("columns", {}).keys())[0]
450
+ if table_meta.get("columns")
451
+ else ""
452
+ )
438
453
 
439
454
  normalized_rel_cols = [
440
- _sanitize_identifier_name(col, prefixes_to_drop=global_prefixes | table_prefixes)
455
+ _sanitize_identifier_name(
456
+ col, prefixes_to_drop=global_prefixes | table_prefixes
457
+ )
441
458
  for col in relationship_cols
442
459
  ]
443
460
 
@@ -448,7 +465,9 @@ def _analyze_composite_key_patterns(
448
465
  analysis = {
449
466
  "is_composite_pk": pk_col_count > 1 and pk_col_count == total_pk_candidates,
450
467
  "partial_pk": pk_col_count > 0 and pk_col_count < total_pk_candidates,
451
- "pk_coverage_ratio": pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0,
468
+ "pk_coverage_ratio": (
469
+ pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0
470
+ ),
452
471
  "relationship_column_count": len(relationship_cols),
453
472
  "pk_column_count": pk_col_count,
454
473
  }
@@ -457,7 +476,10 @@ def _analyze_composite_key_patterns(
457
476
  if len(relationship_cols) > 1:
458
477
  sequential_patterns = []
459
478
  for col in relationship_cols:
460
- if any(pattern in col.upper() for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]):
479
+ if any(
480
+ pattern in col.upper()
481
+ for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]
482
+ ):
461
483
  sequential_patterns.append(col)
462
484
 
463
485
  analysis["sequential_id_pattern"] = len(sequential_patterns) >= 2
@@ -504,9 +526,12 @@ def _infer_composite_cardinality(
504
526
  # Rule 3: Composite key uniqueness analysis (if we have sufficient samples)
505
527
  MIN_SAMPLE_SIZE = 20 # Lower threshold for composite keys
506
528
 
507
- if (left_values_all and right_values_all and
508
- len(left_values_all) >= MIN_SAMPLE_SIZE and
509
- len(right_values_all) >= MIN_SAMPLE_SIZE):
529
+ if (
530
+ left_values_all
531
+ and right_values_all
532
+ and len(left_values_all) >= MIN_SAMPLE_SIZE
533
+ and len(right_values_all) >= MIN_SAMPLE_SIZE
534
+ ):
510
535
 
511
536
  # Create composite keys by concatenating values
512
537
  left_composite_keys = []
@@ -515,10 +540,12 @@ def _infer_composite_cardinality(
515
540
  sample_size = min(len(left_values_all), len(right_values_all))
516
541
 
517
542
  for i in range(sample_size):
518
- left_key = "|".join(str(vals[i]) if i < len(vals) else ""
519
- for vals in left_values_all)
520
- right_key = "|".join(str(vals[i]) if i < len(vals) else ""
521
- for vals in right_values_all)
543
+ left_key = "|".join(
544
+ str(vals[i]) if i < len(vals) else "" for vals in left_values_all
545
+ )
546
+ right_key = "|".join(
547
+ str(vals[i]) if i < len(vals) else "" for vals in right_values_all
548
+ )
522
549
 
523
550
  if left_key and not _is_nullish(left_key):
524
551
  left_composite_keys.append(left_key)
@@ -527,7 +554,9 @@ def _infer_composite_cardinality(
527
554
 
528
555
  if left_composite_keys and right_composite_keys:
529
556
  left_unique_ratio = len(set(left_composite_keys)) / len(left_composite_keys)
530
- right_unique_ratio = len(set(right_composite_keys)) / len(right_composite_keys)
557
+ right_unique_ratio = len(set(right_composite_keys)) / len(
558
+ right_composite_keys
559
+ )
531
560
 
532
561
  # Lower threshold for composite key uniqueness
533
562
  if right_unique_ratio > 0.9:
@@ -561,6 +590,7 @@ def _infer_composite_cardinality(
561
590
  adaptive_thresholds=adaptive_thresholds,
562
591
  )
563
592
 
593
+
564
594
  def _detect_bridge_table_pattern(
565
595
  table_meta: Dict[str, Any],
566
596
  all_tables_meta: Dict[str, Dict[str, Any]],
@@ -606,7 +636,9 @@ def _detect_bridge_table_pattern(
606
636
  base_type = col_info.get("base_type", "")
607
637
 
608
638
  # Check if column looks like an ID/foreign key
609
- if any(pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]):
639
+ if any(
640
+ pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]
641
+ ):
610
642
  id_columns.append(original_name)
611
643
 
612
644
  # Check if this could be a foreign key to another table
@@ -615,11 +647,13 @@ def _detect_bridge_table_pattern(
615
647
  continue
616
648
 
617
649
  if _looks_like_foreign_key(table_name, other_table_name, original_name):
618
- fk_like_columns.append({
619
- "column": original_name,
620
- "references_table": other_table_name,
621
- "confidence": 0.8
622
- })
650
+ fk_like_columns.append(
651
+ {
652
+ "column": original_name,
653
+ "references_table": other_table_name,
654
+ "confidence": 0.8,
655
+ }
656
+ )
623
657
  break
624
658
 
625
659
  # Check if column name contains the other table name
@@ -628,11 +662,13 @@ def _detect_bridge_table_pattern(
628
662
 
629
663
  for variant in other_variants:
630
664
  if variant in col_tokens:
631
- fk_like_columns.append({
632
- "column": original_name,
633
- "references_table": other_table_name,
634
- "confidence": 0.6
635
- })
665
+ fk_like_columns.append(
666
+ {
667
+ "column": original_name,
668
+ "references_table": other_table_name,
669
+ "confidence": 0.6,
670
+ }
671
+ )
636
672
  break
637
673
  else:
638
674
  # Count descriptive/non-ID columns
@@ -680,8 +716,18 @@ def _detect_bridge_table_pattern(
680
716
  # Name-based heuristics
681
717
  table_upper = table_name.upper()
682
718
  bridge_keywords = {
683
- "BRIDGE", "JUNCTION", "LINK", "ASSOC", "ASSOCIATION", "REL", "RELATIONSHIP",
684
- "MAP", "MAPPING", "XREF", "CROSS_REF", "CONNECTOR"
719
+ "BRIDGE",
720
+ "JUNCTION",
721
+ "LINK",
722
+ "ASSOC",
723
+ "ASSOCIATION",
724
+ "REL",
725
+ "RELATIONSHIP",
726
+ "MAP",
727
+ "MAPPING",
728
+ "XREF",
729
+ "CROSS_REF",
730
+ "CONNECTOR",
685
731
  }
686
732
 
687
733
  for keyword in bridge_keywords:
@@ -708,7 +754,9 @@ def _detect_bridge_table_pattern(
708
754
 
709
755
  is_bridge = confidence >= 0.6 # Threshold for bridge table classification
710
756
 
711
- connected_tables = [fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5]
757
+ connected_tables = [
758
+ fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5
759
+ ]
712
760
 
713
761
  return {
714
762
  "is_bridge": is_bridge,
@@ -718,14 +766,14 @@ def _detect_bridge_table_pattern(
718
766
  "fk_ratio": fk_ratio,
719
767
  "id_ratio": id_ratio,
720
768
  "total_columns": total_columns,
721
- "descriptive_columns": descriptive_columns
769
+ "descriptive_columns": descriptive_columns,
722
770
  }
723
771
 
724
772
 
725
773
  def _detect_many_to_many_relationships(
726
774
  raw_tables: List[tuple[data_types.FQNParts, data_types.Table]],
727
775
  metadata: Dict[str, Dict[str, Any]],
728
- existing_relationships: List[semantic_model_pb2.Relationship]
776
+ existing_relationships: List[semantic_model_pb2.Relationship],
729
777
  ) -> List[semantic_model_pb2.Relationship]:
730
778
  """
731
779
  Detect many-to-many relationships through bridge table analysis.
@@ -746,7 +794,10 @@ def _detect_many_to_many_relationships(
746
794
  for table_name, table_meta in metadata.items():
747
795
  bridge_analysis = _detect_bridge_table_pattern(table_meta, metadata)
748
796
 
749
- if bridge_analysis["is_bridge"] and len(bridge_analysis["connected_tables"]) >= 2:
797
+ if (
798
+ bridge_analysis["is_bridge"]
799
+ and len(bridge_analysis["connected_tables"]) >= 2
800
+ ):
750
801
  bridge_tables[table_name] = bridge_analysis
751
802
 
752
803
  logger.debug(
@@ -780,9 +831,15 @@ def _detect_many_to_many_relationships(
780
831
  right_fk_cols = []
781
832
 
782
833
  for fk_info in bridge_info["fk_like_columns"]:
783
- if fk_info["references_table"] == left_table and fk_info["confidence"] >= 0.5:
834
+ if (
835
+ fk_info["references_table"] == left_table
836
+ and fk_info["confidence"] >= 0.5
837
+ ):
784
838
  left_fk_cols.append(fk_info["column"])
785
- elif fk_info["references_table"] == right_table and fk_info["confidence"] >= 0.5:
839
+ elif (
840
+ fk_info["references_table"] == right_table
841
+ and fk_info["confidence"] >= 0.5
842
+ ):
786
843
  right_fk_cols.append(fk_info["column"])
787
844
 
788
845
  if not left_fk_cols or not right_fk_cols:
@@ -806,8 +863,12 @@ def _detect_many_to_many_relationships(
806
863
  # Use the first detected FK columns as a representative
807
864
  relationship.relationship_columns.append(
808
865
  semantic_model_pb2.RelationKey(
809
- left_column=left_fk_cols[0], # This is actually in the bridge table
810
- right_column=right_fk_cols[0], # This is also in the bridge table
866
+ left_column=left_fk_cols[
867
+ 0
868
+ ], # This is actually in the bridge table
869
+ right_column=right_fk_cols[
870
+ 0
871
+ ], # This is also in the bridge table
811
872
  )
812
873
  )
813
874
 
@@ -863,13 +924,19 @@ def _calculate_relationship_confidence(
863
924
  pk_confidence = 0.4
864
925
  confidence_score += pk_confidence
865
926
  if left_has_pk and right_has_pk:
866
- reasoning_factors.append("Both sides have primary key metadata (very strong evidence)")
927
+ reasoning_factors.append(
928
+ "Both sides have primary key metadata (very strong evidence)"
929
+ )
867
930
  evidence_details["pk_evidence"] = "both_pk"
868
931
  elif right_has_pk:
869
- reasoning_factors.append("Right side has primary key metadata (strong evidence)")
932
+ reasoning_factors.append(
933
+ "Right side has primary key metadata (strong evidence)"
934
+ )
870
935
  evidence_details["pk_evidence"] = "right_pk"
871
936
  elif left_has_pk:
872
- reasoning_factors.append("Left side has primary key metadata (strong evidence)")
937
+ reasoning_factors.append(
938
+ "Left side has primary key metadata (strong evidence)"
939
+ )
873
940
  evidence_details["pk_evidence"] = "left_pk"
874
941
 
875
942
  # Factor 2: Name similarity and pattern matching
@@ -884,19 +951,29 @@ def _calculate_relationship_confidence(
884
951
 
885
952
  if avg_name_similarity >= 0.9:
886
953
  name_confidence = 0.25
887
- reasoning_factors.append(f"Very high column name similarity ({avg_name_similarity:.2f})")
954
+ reasoning_factors.append(
955
+ f"Very high column name similarity ({avg_name_similarity:.2f})"
956
+ )
888
957
  elif avg_name_similarity >= 0.7:
889
958
  name_confidence = 0.2
890
- reasoning_factors.append(f"High column name similarity ({avg_name_similarity:.2f})")
959
+ reasoning_factors.append(
960
+ f"High column name similarity ({avg_name_similarity:.2f})"
961
+ )
891
962
  elif avg_name_similarity >= 0.5:
892
963
  name_confidence = 0.15
893
- reasoning_factors.append(f"Moderate column name similarity ({avg_name_similarity:.2f})")
964
+ reasoning_factors.append(
965
+ f"Moderate column name similarity ({avg_name_similarity:.2f})"
966
+ )
894
967
  elif avg_name_similarity >= 0.3:
895
968
  name_confidence = 0.1
896
- reasoning_factors.append(f"Low column name similarity ({avg_name_similarity:.2f})")
969
+ reasoning_factors.append(
970
+ f"Low column name similarity ({avg_name_similarity:.2f})"
971
+ )
897
972
  else:
898
973
  name_confidence = 0.05
899
- reasoning_factors.append(f"Very low column name similarity ({avg_name_similarity:.2f})")
974
+ reasoning_factors.append(
975
+ f"Very low column name similarity ({avg_name_similarity:.2f})"
976
+ )
900
977
 
901
978
  confidence_score += name_confidence
902
979
 
@@ -905,7 +982,9 @@ def _calculate_relationship_confidence(
905
982
  for left_col, right_col in column_pairs:
906
983
  if _looks_like_foreign_key(left_table, right_table, left_col):
907
984
  fk_pattern_confidence += 0.1
908
- reasoning_factors.append(f"Column '{left_col}' follows FK naming pattern")
985
+ reasoning_factors.append(
986
+ f"Column '{left_col}' follows FK naming pattern"
987
+ )
909
988
 
910
989
  confidence_score += min(fk_pattern_confidence, 0.2)
911
990
 
@@ -927,29 +1006,45 @@ def _calculate_relationship_confidence(
927
1006
 
928
1007
  # Check if uniqueness pattern matches inferred cardinality
929
1008
  left_card, right_card = cardinality_result
930
- uniqueness_threshold = adaptive_thresholds.get("uniqueness_threshold", 0.95) if adaptive_thresholds else 0.95
1009
+ uniqueness_threshold = (
1010
+ adaptive_thresholds.get("uniqueness_threshold", 0.95)
1011
+ if adaptive_thresholds
1012
+ else 0.95
1013
+ )
931
1014
 
932
1015
  cardinality_consistency = False
933
1016
  if left_card == "1" and left_unique_ratio > uniqueness_threshold:
934
1017
  cardinality_consistency = True
935
- elif left_card in ("*", "+") and left_unique_ratio <= uniqueness_threshold:
1018
+ elif (
1019
+ left_card in ("*", "+")
1020
+ and left_unique_ratio <= uniqueness_threshold
1021
+ ):
936
1022
  cardinality_consistency = True
937
1023
 
938
1024
  if right_card == "1" and right_unique_ratio > uniqueness_threshold:
939
1025
  cardinality_consistency = cardinality_consistency and True
940
- elif right_card in ("*", "+") and right_unique_ratio <= uniqueness_threshold:
1026
+ elif (
1027
+ right_card in ("*", "+")
1028
+ and right_unique_ratio <= uniqueness_threshold
1029
+ ):
941
1030
  cardinality_consistency = cardinality_consistency and True
942
1031
 
943
1032
  if cardinality_consistency:
944
1033
  uniqueness_confidence = 0.2
945
- reasoning_factors.append("Sample uniqueness patterns support inferred cardinality")
1034
+ reasoning_factors.append(
1035
+ "Sample uniqueness patterns support inferred cardinality"
1036
+ )
946
1037
  else:
947
1038
  uniqueness_confidence = 0.1
948
- reasoning_factors.append("Sample uniqueness patterns partially support cardinality")
1039
+ reasoning_factors.append(
1040
+ "Sample uniqueness patterns partially support cardinality"
1041
+ )
949
1042
 
950
1043
  confidence_score += uniqueness_confidence
951
1044
  else:
952
- reasoning_factors.append(f"Limited sample size ({sample_size}) reduces confidence")
1045
+ reasoning_factors.append(
1046
+ f"Limited sample size ({sample_size}) reduces confidence"
1047
+ )
953
1048
 
954
1049
  # Factor 4: Data type compatibility
955
1050
  if column_pairs and left_meta and right_meta:
@@ -992,15 +1087,21 @@ def _calculate_relationship_confidence(
992
1087
  evidence_details["left_table_role"] = left_role
993
1088
  evidence_details["right_table_role"] = right_role
994
1089
 
995
- relationship_context = _get_business_relationship_context(left_table, right_table, left_role, right_role)
1090
+ relationship_context = _get_business_relationship_context(
1091
+ left_table, right_table, left_role, right_role
1092
+ )
996
1093
  evidence_details["relationship_context"] = relationship_context
997
1094
 
998
1095
  if relationship_context in ["fact_to_dimension", "dimension_to_fact"]:
999
1096
  role_confidence = 0.15
1000
- reasoning_factors.append(f"Strong business relationship pattern: {relationship_context}")
1097
+ reasoning_factors.append(
1098
+ f"Strong business relationship pattern: {relationship_context}"
1099
+ )
1001
1100
  elif relationship_context in ["dimension_hierarchy", "bridge_relationship"]:
1002
1101
  role_confidence = 0.1
1003
- reasoning_factors.append(f"Valid business relationship pattern: {relationship_context}")
1102
+ reasoning_factors.append(
1103
+ f"Valid business relationship pattern: {relationship_context}"
1104
+ )
1004
1105
  elif relationship_context == "fact_to_fact":
1005
1106
  role_confidence = 0.05
1006
1107
  reasoning_factors.append("Unusual but possible fact-to-fact relationship")
@@ -1013,7 +1114,9 @@ def _calculate_relationship_confidence(
1013
1114
  # Factor 6: Multiple column relationships (composite keys)
1014
1115
  if len(column_pairs) > 1:
1015
1116
  composite_confidence = 0.1
1016
- reasoning_factors.append(f"Multi-column relationship ({len(column_pairs)} columns) increases confidence")
1117
+ reasoning_factors.append(
1118
+ f"Multi-column relationship ({len(column_pairs)} columns) increases confidence"
1119
+ )
1017
1120
  confidence_score += composite_confidence
1018
1121
 
1019
1122
  # Normalize confidence score to 0-1 range
@@ -1043,7 +1146,9 @@ def _calculate_relationship_confidence(
1043
1146
  "reasoning_factors": reasoning_factors,
1044
1147
  "evidence_details": evidence_details,
1045
1148
  "inferred_cardinality": f"{cardinality_result[0]}:{cardinality_result[1]}",
1046
- "join_type": "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER",
1149
+ "join_type": (
1150
+ "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
1151
+ ),
1047
1152
  "column_count": len(column_pairs),
1048
1153
  }
1049
1154
 
@@ -1059,101 +1164,196 @@ def _get_domain_knowledge_patterns() -> Dict[str, Any]:
1059
1164
  # Common business entity patterns
1060
1165
  "business_entities": {
1061
1166
  "customer": {
1062
- "table_patterns": ["CUSTOMER", "CUST", "CLIENT", "ACCOUNT_HOLDER", "USER", "MEMBER"],
1063
- "pk_patterns": ["CUSTOMER_ID", "CUST_ID", "CLIENT_ID", "USER_ID", "MEMBER_ID"],
1064
- "typical_attributes": ["NAME", "EMAIL", "PHONE", "ADDRESS", "STATUS", "TYPE", "SEGMENT"],
1065
- "role": "dimension"
1167
+ "table_patterns": [
1168
+ "CUSTOMER",
1169
+ "CUST",
1170
+ "CLIENT",
1171
+ "ACCOUNT_HOLDER",
1172
+ "USER",
1173
+ "MEMBER",
1174
+ ],
1175
+ "pk_patterns": [
1176
+ "CUSTOMER_ID",
1177
+ "CUST_ID",
1178
+ "CLIENT_ID",
1179
+ "USER_ID",
1180
+ "MEMBER_ID",
1181
+ ],
1182
+ "typical_attributes": [
1183
+ "NAME",
1184
+ "EMAIL",
1185
+ "PHONE",
1186
+ "ADDRESS",
1187
+ "STATUS",
1188
+ "TYPE",
1189
+ "SEGMENT",
1190
+ ],
1191
+ "role": "dimension",
1066
1192
  },
1067
1193
  "product": {
1068
1194
  "table_patterns": ["PRODUCT", "ITEM", "SKU", "INVENTORY", "CATALOG"],
1069
1195
  "pk_patterns": ["PRODUCT_ID", "ITEM_ID", "SKU", "PRODUCT_KEY"],
1070
- "typical_attributes": ["NAME", "DESCRIPTION", "CATEGORY", "PRICE", "BRAND", "STATUS"],
1071
- "role": "dimension"
1196
+ "typical_attributes": [
1197
+ "NAME",
1198
+ "DESCRIPTION",
1199
+ "CATEGORY",
1200
+ "PRICE",
1201
+ "BRAND",
1202
+ "STATUS",
1203
+ ],
1204
+ "role": "dimension",
1072
1205
  },
1073
1206
  "order": {
1074
1207
  "table_patterns": ["ORDER", "TRANSACTION", "SALE", "PURCHASE"],
1075
- "pk_patterns": ["ORDER_ID", "TRANSACTION_ID", "SALE_ID", "ORDER_NUMBER"],
1208
+ "pk_patterns": [
1209
+ "ORDER_ID",
1210
+ "TRANSACTION_ID",
1211
+ "SALE_ID",
1212
+ "ORDER_NUMBER",
1213
+ ],
1076
1214
  "typical_attributes": ["DATE", "AMOUNT", "STATUS", "QUANTITY", "TOTAL"],
1077
- "role": "fact"
1215
+ "role": "fact",
1078
1216
  },
1079
1217
  "date": {
1080
1218
  "table_patterns": ["DATE", "TIME", "CALENDAR", "DIM_DATE"],
1081
1219
  "pk_patterns": ["DATE_ID", "DATE_KEY", "TIME_ID"],
1082
- "typical_attributes": ["YEAR", "MONTH", "DAY", "QUARTER", "WEEK", "WEEKDAY"],
1083
- "role": "dimension"
1220
+ "typical_attributes": [
1221
+ "YEAR",
1222
+ "MONTH",
1223
+ "DAY",
1224
+ "QUARTER",
1225
+ "WEEK",
1226
+ "WEEKDAY",
1227
+ ],
1228
+ "role": "dimension",
1084
1229
  },
1085
1230
  "location": {
1086
- "table_patterns": ["LOCATION", "GEOGRAPHY", "ADDRESS", "REGION", "TERRITORY"],
1231
+ "table_patterns": [
1232
+ "LOCATION",
1233
+ "GEOGRAPHY",
1234
+ "ADDRESS",
1235
+ "REGION",
1236
+ "TERRITORY",
1237
+ ],
1087
1238
  "pk_patterns": ["LOCATION_ID", "GEO_ID", "ADDRESS_ID", "REGION_ID"],
1088
- "typical_attributes": ["COUNTRY", "STATE", "CITY", "ZIP", "LATITUDE", "LONGITUDE"],
1089
- "role": "dimension"
1239
+ "typical_attributes": [
1240
+ "COUNTRY",
1241
+ "STATE",
1242
+ "CITY",
1243
+ "ZIP",
1244
+ "LATITUDE",
1245
+ "LONGITUDE",
1246
+ ],
1247
+ "role": "dimension",
1090
1248
  },
1091
1249
  "employee": {
1092
1250
  "table_patterns": ["EMPLOYEE", "STAFF", "WORKER", "PERSONNEL"],
1093
1251
  "pk_patterns": ["EMPLOYEE_ID", "STAFF_ID", "EMP_ID"],
1094
- "typical_attributes": ["NAME", "DEPARTMENT", "TITLE", "MANAGER", "HIRE_DATE"],
1095
- "role": "dimension"
1096
- }
1252
+ "typical_attributes": [
1253
+ "NAME",
1254
+ "DEPARTMENT",
1255
+ "TITLE",
1256
+ "MANAGER",
1257
+ "HIRE_DATE",
1258
+ ],
1259
+ "role": "dimension",
1260
+ },
1097
1261
  },
1098
-
1099
1262
  # Common relationship patterns in data warehouses
1100
1263
  "relationship_patterns": {
1101
1264
  "star_schema": {
1102
1265
  "pattern": "fact_to_dimension",
1103
1266
  "confidence_boost": 0.2,
1104
- "description": "Standard star schema fact-to-dimension relationship"
1267
+ "description": "Standard star schema fact-to-dimension relationship",
1105
1268
  },
1106
1269
  "snowflake_schema": {
1107
1270
  "pattern": "dimension_hierarchy",
1108
1271
  "confidence_boost": 0.15,
1109
- "description": "Snowflake schema dimension hierarchy"
1272
+ "description": "Snowflake schema dimension hierarchy",
1110
1273
  },
1111
1274
  "bridge_table": {
1112
1275
  "pattern": "many_to_many_via_bridge",
1113
1276
  "confidence_boost": 0.1,
1114
- "description": "Many-to-many relationship through bridge table"
1277
+ "description": "Many-to-many relationship through bridge table",
1115
1278
  },
1116
1279
  "time_dimension": {
1117
1280
  "pattern": "temporal_relationship",
1118
1281
  "confidence_boost": 0.25,
1119
- "description": "Time-based relationship (very common in warehouses)"
1120
- }
1282
+ "description": "Time-based relationship (very common in warehouses)",
1283
+ },
1121
1284
  },
1122
-
1123
1285
  # Known FK patterns that often appear in real data warehouses
1124
1286
  "common_fk_patterns": {
1125
1287
  "customer_references": [
1126
- "CUSTOMER_ID", "CUST_ID", "CLIENT_ID", "ACCOUNT_ID", "USER_ID"
1288
+ "CUSTOMER_ID",
1289
+ "CUST_ID",
1290
+ "CLIENT_ID",
1291
+ "ACCOUNT_ID",
1292
+ "USER_ID",
1127
1293
  ],
1128
1294
  "product_references": [
1129
- "PRODUCT_ID", "ITEM_ID", "SKU", "PROD_ID", "CATALOG_ID"
1295
+ "PRODUCT_ID",
1296
+ "ITEM_ID",
1297
+ "SKU",
1298
+ "PROD_ID",
1299
+ "CATALOG_ID",
1130
1300
  ],
1131
1301
  "date_references": [
1132
- "DATE_ID", "ORDER_DATE_ID", "SHIP_DATE_ID", "CREATE_DATE_ID",
1133
- "TRANSACTION_DATE_ID", "DATE_KEY"
1302
+ "DATE_ID",
1303
+ "ORDER_DATE_ID",
1304
+ "SHIP_DATE_ID",
1305
+ "CREATE_DATE_ID",
1306
+ "TRANSACTION_DATE_ID",
1307
+ "DATE_KEY",
1134
1308
  ],
1135
1309
  "location_references": [
1136
- "LOCATION_ID", "ADDRESS_ID", "SHIP_TO_ID", "BILL_TO_ID",
1137
- "WAREHOUSE_ID", "STORE_ID"
1138
- ]
1310
+ "LOCATION_ID",
1311
+ "ADDRESS_ID",
1312
+ "SHIP_TO_ID",
1313
+ "BILL_TO_ID",
1314
+ "WAREHOUSE_ID",
1315
+ "STORE_ID",
1316
+ ],
1139
1317
  },
1140
-
1141
1318
  # Table naming conventions that indicate specific patterns
1142
1319
  "naming_conventions": {
1143
1320
  "fact_indicators": [
1144
- "FACT_", "FCT_", "F_", "SALES_", "ORDERS_", "TRANSACTIONS_",
1145
- "REVENUE_", "METRICS_", "EVENTS_", "ACTIVITY_"
1321
+ "FACT_",
1322
+ "FCT_",
1323
+ "F_",
1324
+ "SALES_",
1325
+ "ORDERS_",
1326
+ "TRANSACTIONS_",
1327
+ "REVENUE_",
1328
+ "METRICS_",
1329
+ "EVENTS_",
1330
+ "ACTIVITY_",
1146
1331
  ],
1147
1332
  "dimension_indicators": [
1148
- "DIM_", "D_", "REF_", "LKP_", "LOOKUP_", "MASTER_"
1333
+ "DIM_",
1334
+ "D_",
1335
+ "REF_",
1336
+ "LKP_",
1337
+ "LOOKUP_",
1338
+ "MASTER_",
1149
1339
  ],
1150
1340
  "bridge_indicators": [
1151
- "BRG_", "BRIDGE_", "XREF_", "MAP_", "ASSOC_", "LINK_"
1341
+ "BRG_",
1342
+ "BRIDGE_",
1343
+ "XREF_",
1344
+ "MAP_",
1345
+ "ASSOC_",
1346
+ "LINK_",
1152
1347
  ],
1153
1348
  "staging_indicators": [
1154
- "STG_", "STAGING_", "TMP_", "TEMP_", "RAW_", "LANDING_"
1155
- ]
1156
- }
1349
+ "STG_",
1350
+ "STAGING_",
1351
+ "TMP_",
1352
+ "TEMP_",
1353
+ "RAW_",
1354
+ "LANDING_",
1355
+ ],
1356
+ },
1157
1357
  }
1158
1358
 
1159
1359
 
@@ -1204,18 +1404,26 @@ def _apply_domain_knowledge(
1204
1404
  if entity_pair in common_pairs:
1205
1405
  boost = common_pairs[entity_pair]
1206
1406
  confidence_boost += boost
1207
- enhancement_factors.append(f"Recognized common business pattern: {entity_pair} (+{boost:.2f})")
1407
+ enhancement_factors.append(
1408
+ f"Recognized common business pattern: {entity_pair} (+{boost:.2f})"
1409
+ )
1208
1410
  elif f"{right_entity}-{left_entity}" in common_pairs:
1209
1411
  boost = common_pairs[f"{right_entity}-{left_entity}"]
1210
1412
  confidence_boost += boost
1211
- enhancement_factors.append(f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})")
1413
+ enhancement_factors.append(
1414
+ f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})"
1415
+ )
1212
1416
 
1213
1417
  # Factor 2: Check for standard FK naming patterns
1214
1418
  for left_col, right_col in column_pairs:
1215
- fk_pattern_match = _check_standard_fk_patterns(left_col, right_col, domain_patterns)
1419
+ fk_pattern_match = _check_standard_fk_patterns(
1420
+ left_col, right_col, domain_patterns
1421
+ )
1216
1422
  if fk_pattern_match:
1217
1423
  confidence_boost += 0.15
1218
- enhancement_factors.append(f"Standard FK pattern detected: {fk_pattern_match}")
1424
+ enhancement_factors.append(
1425
+ f"Standard FK pattern detected: {fk_pattern_match}"
1426
+ )
1219
1427
 
1220
1428
  # Factor 3: Table naming convention analysis
1221
1429
  left_convention = _identify_naming_convention(left_table, domain_patterns)
@@ -1223,8 +1431,9 @@ def _apply_domain_knowledge(
1223
1431
 
1224
1432
  if left_convention and right_convention:
1225
1433
  # Boost confidence for expected patterns
1226
- if (left_convention == "fact" and right_convention == "dimension") or \
1227
- (left_convention == "dimension" and right_convention == "fact"):
1434
+ if (left_convention == "fact" and right_convention == "dimension") or (
1435
+ left_convention == "dimension" and right_convention == "fact"
1436
+ ):
1228
1437
  confidence_boost += 0.2
1229
1438
  enhancement_factors.append("Standard fact-dimension naming pattern (+0.20)")
1230
1439
  elif left_convention == "dimension" and right_convention == "dimension":
@@ -1237,12 +1446,20 @@ def _apply_domain_knowledge(
1237
1446
  enhancement_factors.append("Time dimension relationship (very common) (+0.20)")
1238
1447
 
1239
1448
  # Factor 5: Schema pattern recognition (star vs snowflake)
1240
- schema_pattern = _detect_schema_pattern(left_table, right_table, left_meta, right_meta, domain_patterns)
1449
+ schema_pattern = _detect_schema_pattern(
1450
+ left_table, right_table, left_meta, right_meta, domain_patterns
1451
+ )
1241
1452
  if schema_pattern:
1242
- pattern_boost = domain_patterns["relationship_patterns"][schema_pattern]["confidence_boost"]
1453
+ pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
1454
+ "confidence_boost"
1455
+ ]
1243
1456
  confidence_boost += pattern_boost
1244
- pattern_desc = domain_patterns["relationship_patterns"][schema_pattern]["description"]
1245
- enhancement_factors.append(f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})")
1457
+ pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
1458
+ "description"
1459
+ ]
1460
+ enhancement_factors.append(
1461
+ f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})"
1462
+ )
1246
1463
 
1247
1464
  # Apply the boost but cap the final confidence at 1.0
1248
1465
  enhanced_confidence = min(current_confidence + confidence_boost, 1.0)
@@ -1259,7 +1476,9 @@ def _apply_domain_knowledge(
1259
1476
  }
1260
1477
 
1261
1478
 
1262
- def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> Optional[str]:
1479
+ def _identify_business_entity(
1480
+ table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
1481
+ ) -> Optional[str]:
1263
1482
  """Identify what business entity a table represents."""
1264
1483
  table_upper = table_name.upper()
1265
1484
  business_entities = domain_patterns["business_entities"]
@@ -1274,13 +1493,18 @@ def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domai
1274
1493
  pk_candidates = table_meta.get("pk_candidates", {})
1275
1494
  for pk_pattern in entity_info["pk_patterns"]:
1276
1495
  for pk_norm in pk_candidates.keys():
1277
- if pk_pattern.replace("_", "").upper() in pk_norm.replace("_", "").upper():
1496
+ if (
1497
+ pk_pattern.replace("_", "").upper()
1498
+ in pk_norm.replace("_", "").upper()
1499
+ ):
1278
1500
  return entity_type
1279
1501
 
1280
1502
  return None
1281
1503
 
1282
1504
 
1283
- def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
1505
+ def _check_standard_fk_patterns(
1506
+ left_col: str, right_col: str, domain_patterns: Dict[str, Any]
1507
+ ) -> Optional[str]:
1284
1508
  """Check if column pair matches standard FK patterns."""
1285
1509
  common_fks = domain_patterns["common_fk_patterns"]
1286
1510
 
@@ -1295,7 +1519,9 @@ def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns:
1295
1519
  return None
1296
1520
 
1297
1521
 
1298
- def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
1522
+ def _identify_naming_convention(
1523
+ table_name: str, domain_patterns: Dict[str, Any]
1524
+ ) -> Optional[str]:
1299
1525
  """Identify the naming convention used for a table."""
1300
1526
  table_upper = table_name.upper()
1301
1527
  naming_conventions = domain_patterns["naming_conventions"]
@@ -1308,7 +1534,9 @@ def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]
1308
1534
  return None
1309
1535
 
1310
1536
 
1311
- def _is_time_dimension_pattern(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> bool:
1537
+ def _is_time_dimension_pattern(
1538
+ table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
1539
+ ) -> bool:
1312
1540
  """Check if table follows time dimension patterns."""
1313
1541
  table_upper = table_name.upper()
1314
1542
  time_patterns = domain_patterns["business_entities"]["date"]["table_patterns"]
@@ -1344,15 +1572,16 @@ def _detect_schema_pattern(
1344
1572
  right_table: str,
1345
1573
  left_meta: Dict[str, Any],
1346
1574
  right_meta: Dict[str, Any],
1347
- domain_patterns: Dict[str, Any]
1575
+ domain_patterns: Dict[str, Any],
1348
1576
  ) -> Optional[str]:
1349
1577
  """Detect common schema patterns (star, snowflake, etc.)."""
1350
1578
  left_role = _detect_table_role(left_table, left_meta)
1351
1579
  right_role = _detect_table_role(right_table, right_meta)
1352
1580
 
1353
1581
  # Star schema pattern: fact table to dimension
1354
- if (left_role == "fact" and right_role == "dimension") or \
1355
- (left_role == "dimension" and right_role == "fact"):
1582
+ if (left_role == "fact" and right_role == "dimension") or (
1583
+ left_role == "dimension" and right_role == "fact"
1584
+ ):
1356
1585
  return "star_schema"
1357
1586
 
1358
1587
  # Snowflake schema pattern: dimension to dimension
@@ -1360,8 +1589,9 @@ def _detect_schema_pattern(
1360
1589
  return "snowflake_schema"
1361
1590
 
1362
1591
  # Time dimension pattern (very common)
1363
- if _is_time_dimension_pattern(right_table, right_meta, domain_patterns) or \
1364
- _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
1592
+ if _is_time_dimension_pattern(
1593
+ right_table, right_meta, domain_patterns
1594
+ ) or _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
1365
1595
  return "time_dimension"
1366
1596
 
1367
1597
  # Bridge table pattern
@@ -1397,7 +1627,9 @@ def _calculate_adaptive_thresholds(
1397
1627
  # Calculate sample statistics
1398
1628
  sample_sizes = [len(vals) for vals in values_list if vals]
1399
1629
  max_sample_size = max(sample_sizes) if sample_sizes else base_sample_size
1400
- avg_sample_size = sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
1630
+ avg_sample_size = (
1631
+ sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
1632
+ )
1401
1633
 
1402
1634
  # Calculate data distribution characteristics
1403
1635
  total_unique_values = 0
@@ -1425,7 +1657,7 @@ def _calculate_adaptive_thresholds(
1425
1657
  if len(value_counts) > 1:
1426
1658
  max_freq = max(value_counts.values())
1427
1659
  min_freq = min(value_counts.values())
1428
- skew = max_freq / min_freq if min_freq > 0 else float('inf')
1660
+ skew = max_freq / min_freq if min_freq > 0 else float("inf")
1429
1661
  skew_ratios.append(skew)
1430
1662
 
1431
1663
  # Calculate overall uniqueness ratio
@@ -1459,7 +1691,9 @@ def _calculate_adaptive_thresholds(
1459
1691
  min_size_adj *= 1.1
1460
1692
 
1461
1693
  # Scale with base sample size from configuration
1462
- size_scale_factor = min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
1694
+ size_scale_factor = (
1695
+ min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
1696
+ )
1463
1697
  min_size_adj *= size_scale_factor
1464
1698
 
1465
1699
  thresholds["min_sample_size"] = max(int(base_min_size * min_size_adj), 10)
@@ -1594,8 +1828,12 @@ def _infer_cardinality(
1594
1828
  left_non_null = [v for v in left_values if not _is_nullish(v)]
1595
1829
  right_non_null = [v for v in right_values if not _is_nullish(v)]
1596
1830
 
1597
- left_unique_ratio = len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
1598
- right_unique_ratio = len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
1831
+ left_unique_ratio = (
1832
+ len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
1833
+ )
1834
+ right_unique_ratio = (
1835
+ len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
1836
+ )
1599
1837
 
1600
1838
  # Apply adaptive uniqueness threshold
1601
1839
  left_is_unique = left_unique_ratio > uniqueness_threshold
@@ -1691,11 +1929,19 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1691
1929
  Returns:
1692
1930
  str: Table role ('fact', 'dimension', 'bridge', 'staging', 'unknown')
1693
1931
  """
1694
- upper_name = table_name.upper()
1695
1932
  tokens = _identifier_tokens(table_name)
1696
1933
 
1697
1934
  # Rule 1: Explicit prefixes/suffixes
1698
- fact_indicators = {"FACT", "FCT", "TXN", "TRANSACTION", "EVENT", "LOG", "SALES", "ORDER"}
1935
+ fact_indicators = {
1936
+ "FACT",
1937
+ "FCT",
1938
+ "TXN",
1939
+ "TRANSACTION",
1940
+ "EVENT",
1941
+ "LOG",
1942
+ "SALES",
1943
+ "ORDER",
1944
+ }
1699
1945
  dim_indicators = {"DIM", "DIMENSION", "LOOKUP", "REF", "REFERENCE", "MASTER"}
1700
1946
  bridge_indicators = {"BRIDGE", "BRG", "LINK", "JUNCTION", "ASSOC", "ASSOCIATION"}
1701
1947
  staging_indicators = {"STG", "STAGING", "TMP", "TEMP", "WORK", "LANDING", "RAW"}
@@ -1734,9 +1980,22 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1734
1980
  id_count += 1
1735
1981
 
1736
1982
  # Count measure-like columns (amounts, counts, quantities)
1737
- if any(word in col_name for word in ["AMOUNT", "QTY", "QUANTITY", "COUNT", "TOTAL", "SUM", "AVG"]):
1983
+ if any(
1984
+ word in col_name
1985
+ for word in [
1986
+ "AMOUNT",
1987
+ "QTY",
1988
+ "QUANTITY",
1989
+ "COUNT",
1990
+ "TOTAL",
1991
+ "SUM",
1992
+ "AVG",
1993
+ ]
1994
+ ):
1738
1995
  measure_like_count += 1
1739
- elif base_type in MEASURE_DATATYPES and not col_info.get("is_identifier", False):
1996
+ elif base_type in MEASURE_DATATYPES and not col_info.get(
1997
+ "is_identifier", False
1998
+ ):
1740
1999
  measure_like_count += 1
1741
2000
  else:
1742
2001
  dimension_like_count += 1
@@ -1761,7 +2020,9 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1761
2020
  return "unknown"
1762
2021
 
1763
2022
 
1764
- def _get_business_relationship_context(left_table: str, right_table: str, left_role: str, right_role: str) -> str:
2023
+ def _get_business_relationship_context(
2024
+ left_table: str, right_table: str, left_role: str, right_role: str
2025
+ ) -> str:
1765
2026
  """
1766
2027
  Determine business relationship context between tables based on their roles.
1767
2028
 
@@ -1833,7 +2094,7 @@ def _infer_join_type(
1833
2094
  4. Naming pattern heuristics
1834
2095
  5. Conservative INNER JOIN default
1835
2096
  """
1836
-
2097
+
1837
2098
  # RULE 1: Default to INNER JOIN (most common and safest)
1838
2099
  default_join = semantic_model_pb2.JoinType.inner
1839
2100
 
@@ -1861,9 +2122,17 @@ def _infer_join_type(
1861
2122
  # Apply business rules based on relationship context
1862
2123
  if relationship_context == "fact_to_dimension":
1863
2124
  # Fact → Dimension: usually INNER, but check for optional dimensions
1864
- if any(keyword in right_table.upper() for keyword in [
1865
- "PROMO", "PROMOTION", "DISCOUNT", "COUPON", "OPTIONAL", "SECONDARY"
1866
- ]):
2125
+ if any(
2126
+ keyword in right_table.upper()
2127
+ for keyword in [
2128
+ "PROMO",
2129
+ "PROMOTION",
2130
+ "DISCOUNT",
2131
+ "COUPON",
2132
+ "OPTIONAL",
2133
+ "SECONDARY",
2134
+ ]
2135
+ ):
1867
2136
  logger.debug(
1868
2137
  f"Join type inference for {left_table} -> {right_table}: "
1869
2138
  f"LEFT_OUTER (fact to optional dimension: {right_role})"
@@ -1907,11 +2176,19 @@ def _infer_join_type(
1907
2176
  return semantic_model_pb2.JoinType.left_outer
1908
2177
 
1909
2178
  # RULE 5: Naming pattern heuristics for optional relationships
1910
- left_upper = left_table.upper()
1911
2179
  right_upper = right_table.upper()
1912
2180
  optional_keywords = {
1913
- "OPTIONAL", "ALTERNATE", "SECONDARY", "BACKUP", "FALLBACK",
1914
- "PROMO", "PROMOTION", "DISCOUNT", "COUPON", "TEMP", "TMP"
2181
+ "OPTIONAL",
2182
+ "ALTERNATE",
2183
+ "SECONDARY",
2184
+ "BACKUP",
2185
+ "FALLBACK",
2186
+ "PROMO",
2187
+ "PROMOTION",
2188
+ "DISCOUNT",
2189
+ "COUPON",
2190
+ "TEMP",
2191
+ "TMP",
1915
2192
  }
1916
2193
 
1917
2194
  for keyword in optional_keywords:
@@ -1946,12 +2223,17 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
1946
2223
  """
1947
2224
  fk_upper = fk_column.strip().upper()
1948
2225
  pk_table_variants = _table_variants(pk_table)
1949
-
2226
+
1950
2227
  # Pattern 1: {table_name}_id or {table_name}_key
1951
2228
  for variant in pk_table_variants:
1952
- if fk_upper in {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}:
2229
+ if fk_upper in {
2230
+ f"{variant}_ID",
2231
+ f"{variant}ID",
2232
+ f"{variant}_KEY",
2233
+ f"{variant}KEY",
2234
+ }:
1953
2235
  return True
1954
-
2236
+
1955
2237
  # Pattern 2: Column ends with table name variants
1956
2238
  tokens = _identifier_tokens(fk_column)
1957
2239
  if len(tokens) >= 2:
@@ -1961,21 +2243,23 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
1961
2243
  tail = tokens[-1]
1962
2244
  if tail in {"ID", "KEY"}:
1963
2245
  return True
1964
-
2246
+
1965
2247
  # Pattern 3: Similar to primary key column but with FK table prefix
1966
2248
  # e.g., order_id in order_items table referencing orders.id
1967
2249
  fk_table_variants = _table_variants(fk_table)
1968
2250
  for fk_variant in fk_table_variants:
1969
2251
  if fk_upper.startswith(fk_variant):
1970
- remainder = fk_upper[len(fk_variant):].lstrip("_")
2252
+ remainder = fk_upper[len(fk_variant) :].lstrip("_")
1971
2253
  for pk_variant in pk_table_variants:
1972
2254
  if remainder.startswith(pk_variant):
1973
2255
  return True
1974
-
2256
+
1975
2257
  return False
1976
2258
 
1977
2259
 
1978
- def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.NamedFilter]:
2260
+ def _suggest_filters(
2261
+ raw_table: data_types.Table,
2262
+ ) -> List[semantic_model_pb2.NamedFilter]:
1979
2263
  suggestions: List[semantic_model_pb2.NamedFilter] = []
1980
2264
  for col in raw_table.columns:
1981
2265
  base_type = _base_type_from_type(col.column_type)
@@ -2011,12 +2295,20 @@ def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.Nam
2011
2295
  )
2012
2296
  is_textual = base_type in {"STRING", "TEXT", "VARCHAR", "CHAR", "CHARACTER"}
2013
2297
  is_boolean = base_type in {"BOOLEAN"}
2014
- is_categorical_numeric = base_type in {"INT", "INTEGER", "NUMBER", "SMALLINT", "BIGINT"} and any(
2015
- upper_name.endswith(suffix) for suffix in categorical_suffixes
2016
- )
2017
-
2018
- if not is_identifier_like and (is_textual or is_boolean or is_categorical_numeric):
2019
- formatted = [_format_literal(val, base_type) for val in distinct_values[:5]]
2298
+ is_categorical_numeric = base_type in {
2299
+ "INT",
2300
+ "INTEGER",
2301
+ "NUMBER",
2302
+ "SMALLINT",
2303
+ "BIGINT",
2304
+ } and any(upper_name.endswith(suffix) for suffix in categorical_suffixes)
2305
+
2306
+ if not is_identifier_like and (
2307
+ is_textual or is_boolean or is_categorical_numeric
2308
+ ):
2309
+ formatted = [
2310
+ _format_literal(val, base_type) for val in distinct_values[:5]
2311
+ ]
2020
2312
  expr = f"{col.column_name} IN ({', '.join(formatted)})"
2021
2313
  suggestions.append(
2022
2314
  semantic_model_pb2.NamedFilter(
@@ -2060,7 +2352,9 @@ def _infer_relationships(
2060
2352
  table_prefixes = global_prefixes | _table_prefixes(raw_table.name)
2061
2353
  for column in raw_table.columns:
2062
2354
  base_type = _base_type_from_type(column.column_type)
2063
- normalized = _sanitize_identifier_name(column.column_name, prefixes_to_drop=table_prefixes)
2355
+ normalized = _sanitize_identifier_name(
2356
+ column.column_name, prefixes_to_drop=table_prefixes
2357
+ )
2064
2358
  entry = columns_meta.setdefault(
2065
2359
  normalized,
2066
2360
  {
@@ -2075,7 +2369,9 @@ def _infer_relationships(
2075
2369
  entry["names"].append(column.column_name)
2076
2370
  if column.values:
2077
2371
  entry["values"].extend(column.values)
2078
- entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(column.column_name, base_type)
2372
+ entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
2373
+ column.column_name, base_type
2374
+ )
2079
2375
  is_primary = getattr(column, "is_primary_key", False)
2080
2376
  if is_primary:
2081
2377
  entry["is_primary"] = True
@@ -2093,7 +2389,9 @@ def _infer_relationships(
2093
2389
  pairs: dict[tuple[str, str], List[tuple[str, str]]] = {}
2094
2390
  null_check_cache: Dict[Tuple[str, str, str, str], bool] = {}
2095
2391
 
2096
- def _record_pair(left_table: str, right_table: str, left_col: str, right_col: str) -> None:
2392
+ def _record_pair(
2393
+ left_table: str, right_table: str, left_col: str, right_col: str
2394
+ ) -> None:
2097
2395
  key = (left_table, right_table)
2098
2396
  value = (left_col, right_col)
2099
2397
  if value not in pairs.setdefault(key, []):
@@ -2158,7 +2456,7 @@ def _infer_relationships(
2158
2456
  continue
2159
2457
  if norm_b == pk_norm:
2160
2458
  continue
2161
-
2459
+
2162
2460
  # Direct suffix match
2163
2461
  if norm_b.endswith(pk_norm):
2164
2462
  _record_pair(
@@ -2168,23 +2466,34 @@ def _infer_relationships(
2168
2466
  pk_cols[0],
2169
2467
  )
2170
2468
  continue
2171
-
2469
+
2172
2470
  # Enhanced: Check if column looks like a foreign key to this table
2173
- if _looks_like_foreign_key(table_b_name, table_a_name, meta_b["names"][0]):
2471
+ if _looks_like_foreign_key(
2472
+ table_b_name, table_a_name, meta_b["names"][0]
2473
+ ):
2174
2474
  # Additional check: name similarity with adaptive threshold
2175
2475
  similarity = _name_similarity(norm_b, pk_norm)
2176
2476
  # Calculate adaptive threshold for this relationship
2177
2477
  all_sample_values = []
2178
- for col_values in [pk_meta.get("values", []), meta_b.get("values", [])]:
2478
+ for col_values in [
2479
+ pk_meta.get("values", []),
2480
+ meta_b.get("values", []),
2481
+ ]:
2179
2482
  if col_values:
2180
2483
  all_sample_values.append(col_values)
2181
2484
 
2182
2485
  adaptive_thresholds = _calculate_adaptive_thresholds(
2183
2486
  all_sample_values,
2184
2487
  table_count=len(raw_tables),
2185
- base_sample_size=len(pk_meta.get("values", [])) if pk_meta.get("values") else 10,
2488
+ base_sample_size=(
2489
+ len(pk_meta.get("values", []))
2490
+ if pk_meta.get("values")
2491
+ else 10
2492
+ ),
2493
+ )
2494
+ similarity_threshold = adaptive_thresholds.get(
2495
+ "similarity_threshold", 0.6
2186
2496
  )
2187
- similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
2188
2497
 
2189
2498
  if similarity >= similarity_threshold:
2190
2499
  _record_pair(
@@ -2204,7 +2513,7 @@ def _infer_relationships(
2204
2513
  continue
2205
2514
  if norm_a == pk_norm:
2206
2515
  continue
2207
-
2516
+
2208
2517
  # Direct suffix match
2209
2518
  if norm_a.endswith(pk_norm):
2210
2519
  _record_pair(
@@ -2214,23 +2523,34 @@ def _infer_relationships(
2214
2523
  pk_cols[0],
2215
2524
  )
2216
2525
  continue
2217
-
2526
+
2218
2527
  # Enhanced: Check if column looks like a foreign key to this table
2219
- if _looks_like_foreign_key(table_a_name, table_b_name, meta_a["names"][0]):
2528
+ if _looks_like_foreign_key(
2529
+ table_a_name, table_b_name, meta_a["names"][0]
2530
+ ):
2220
2531
  # Additional check: name similarity with adaptive threshold
2221
2532
  similarity = _name_similarity(norm_a, pk_norm)
2222
2533
  # Calculate adaptive threshold for this relationship
2223
2534
  all_sample_values = []
2224
- for col_values in [pk_meta.get("values", []), meta_a.get("values", [])]:
2535
+ for col_values in [
2536
+ pk_meta.get("values", []),
2537
+ meta_a.get("values", []),
2538
+ ]:
2225
2539
  if col_values:
2226
2540
  all_sample_values.append(col_values)
2227
2541
 
2228
2542
  adaptive_thresholds = _calculate_adaptive_thresholds(
2229
2543
  all_sample_values,
2230
2544
  table_count=len(raw_tables),
2231
- base_sample_size=len(pk_meta.get("values", [])) if pk_meta.get("values") else 10,
2545
+ base_sample_size=(
2546
+ len(pk_meta.get("values", []))
2547
+ if pk_meta.get("values")
2548
+ else 10
2549
+ ),
2550
+ )
2551
+ similarity_threshold = adaptive_thresholds.get(
2552
+ "similarity_threshold", 0.6
2232
2553
  )
2233
- similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
2234
2554
 
2235
2555
  if similarity >= similarity_threshold:
2236
2556
  _record_pair(
@@ -2258,7 +2578,7 @@ def _infer_relationships(
2258
2578
  # Infer cardinality based on available metadata
2259
2579
  left_meta = metadata[left_table]
2260
2580
  right_meta = metadata[right_table]
2261
-
2581
+
2262
2582
  # Determine if tables have primary keys in the relationship
2263
2583
  left_has_pk = any(
2264
2584
  col_name in [pair[0] for pair in column_pairs]
@@ -2270,7 +2590,7 @@ def _infer_relationships(
2270
2590
  for pk_list in right_meta["pk_candidates"].values()
2271
2591
  for col_name in pk_list
2272
2592
  )
2273
-
2593
+
2274
2594
  # Enhanced: Get sample values for all columns in the relationship (for composite key analysis)
2275
2595
  left_values_all = []
2276
2596
  right_values_all = []
@@ -2279,12 +2599,11 @@ def _infer_relationships(
2279
2599
 
2280
2600
  for left_col, right_col in column_pairs:
2281
2601
  left_col_key = _sanitize_identifier_name(
2282
- left_col,
2283
- prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
2602
+ left_col, prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
2284
2603
  )
2285
2604
  right_col_key = _sanitize_identifier_name(
2286
2605
  right_col,
2287
- prefixes_to_drop=global_prefixes | _table_prefixes(right_table)
2606
+ prefixes_to_drop=global_prefixes | _table_prefixes(right_table),
2288
2607
  )
2289
2608
 
2290
2609
  left_col_values = []
@@ -2293,7 +2612,9 @@ def _infer_relationships(
2293
2612
  if left_col_key in left_meta["columns"]:
2294
2613
  left_col_values = left_meta["columns"][left_col_key].get("values") or []
2295
2614
  if right_col_key in right_meta["columns"]:
2296
- right_col_values = right_meta["columns"][right_col_key].get("values") or []
2615
+ right_col_values = (
2616
+ right_meta["columns"][right_col_key].get("values") or []
2617
+ )
2297
2618
 
2298
2619
  left_values_all.append(left_col_values)
2299
2620
  right_values_all.append(right_col_values)
@@ -2322,7 +2643,7 @@ def _infer_relationships(
2322
2643
  right_has_pk,
2323
2644
  adaptive_thresholds=global_adaptive_thresholds,
2324
2645
  )
2325
-
2646
+
2326
2647
  # Determine if SQL null probe should be executed for stricter inference
2327
2648
  strict_fk_detected = False
2328
2649
  if strict_join_inference and session:
@@ -2352,7 +2673,7 @@ def _infer_relationships(
2352
2673
  left_table_meta=left_meta,
2353
2674
  right_table_meta=right_meta,
2354
2675
  )
2355
-
2676
+
2356
2677
  # Calculate confidence and reasoning for this relationship
2357
2678
  confidence_analysis = _calculate_relationship_confidence(
2358
2679
  left_table=left_table,
@@ -2376,45 +2697,54 @@ def _infer_relationships(
2376
2697
  column_pairs=column_pairs,
2377
2698
  left_meta=left_meta,
2378
2699
  right_meta=right_meta,
2379
- current_confidence=confidence_analysis['confidence_score']
2700
+ current_confidence=confidence_analysis["confidence_score"],
2380
2701
  )
2381
2702
 
2382
2703
  # Update confidence analysis with domain knowledge
2383
- if domain_enhancement['confidence_boost'] > 0:
2384
- confidence_analysis['confidence_score'] = min(1.0,
2385
- confidence_analysis['confidence_score'] + domain_enhancement['confidence_boost'])
2704
+ if domain_enhancement["confidence_boost"] > 0:
2705
+ confidence_analysis["confidence_score"] = min(
2706
+ 1.0,
2707
+ confidence_analysis["confidence_score"]
2708
+ + domain_enhancement["confidence_boost"],
2709
+ )
2386
2710
 
2387
2711
  # Add domain knowledge factors to reasoning
2388
- for domain_factor in domain_enhancement['domain_factors']:
2389
- confidence_analysis['reasoning_factors'].append(f"Domain knowledge: {domain_factor}")
2712
+ for domain_factor in domain_enhancement["domain_factors"]:
2713
+ confidence_analysis["reasoning_factors"].append(
2714
+ f"Domain knowledge: {domain_factor}"
2715
+ )
2390
2716
 
2391
2717
  # Update confidence level based on new score
2392
- if confidence_analysis['confidence_score'] >= 0.8:
2393
- confidence_analysis['confidence_level'] = 'very_high'
2394
- confidence_analysis['confidence_description'] = 'Very High Confidence'
2395
- elif confidence_analysis['confidence_score'] >= 0.6:
2396
- confidence_analysis['confidence_level'] = 'high'
2397
- confidence_analysis['confidence_description'] = 'High Confidence'
2398
- elif confidence_analysis['confidence_score'] >= 0.4:
2399
- confidence_analysis['confidence_level'] = 'medium'
2400
- confidence_analysis['confidence_description'] = 'Medium Confidence'
2401
- elif confidence_analysis['confidence_score'] >= 0.2:
2402
- confidence_analysis['confidence_level'] = 'low'
2403
- confidence_analysis['confidence_description'] = 'Low Confidence'
2718
+ if confidence_analysis["confidence_score"] >= 0.8:
2719
+ confidence_analysis["confidence_level"] = "very_high"
2720
+ confidence_analysis["confidence_description"] = "Very High Confidence"
2721
+ elif confidence_analysis["confidence_score"] >= 0.6:
2722
+ confidence_analysis["confidence_level"] = "high"
2723
+ confidence_analysis["confidence_description"] = "High Confidence"
2724
+ elif confidence_analysis["confidence_score"] >= 0.4:
2725
+ confidence_analysis["confidence_level"] = "medium"
2726
+ confidence_analysis["confidence_description"] = "Medium Confidence"
2727
+ elif confidence_analysis["confidence_score"] >= 0.2:
2728
+ confidence_analysis["confidence_level"] = "low"
2729
+ confidence_analysis["confidence_description"] = "Low Confidence"
2404
2730
  else:
2405
- confidence_analysis['confidence_level'] = 'very_low'
2406
- confidence_analysis['confidence_description'] = 'Very Low Confidence'
2731
+ confidence_analysis["confidence_level"] = "very_low"
2732
+ confidence_analysis["confidence_description"] = "Very Low Confidence"
2407
2733
 
2408
2734
  # Enhanced logging with confidence and reasoning
2409
2735
  sample_info = f"samples: L={len(left_values)}, R={len(right_values)}"
2410
2736
  pk_info = f"PKs: L={left_has_pk}, R={right_has_pk}"
2411
- join_type_name = "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
2737
+ join_type_name = (
2738
+ "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
2739
+ )
2412
2740
  confidence_info = f"confidence: {confidence_analysis['confidence_score']:.2f} ({confidence_analysis['confidence_level']})"
2413
2741
 
2414
2742
  # Add domain knowledge info if applied
2415
2743
  domain_info = ""
2416
- if domain_enhancement['confidence_boost'] > 0:
2417
- domain_info = f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
2744
+ if domain_enhancement["confidence_boost"] > 0:
2745
+ domain_info = (
2746
+ f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
2747
+ )
2418
2748
 
2419
2749
  logger.info(
2420
2750
  f"Relationship inference for {left_table} -> {right_table}: "
@@ -2423,22 +2753,30 @@ def _infer_relationships(
2423
2753
  )
2424
2754
 
2425
2755
  # Log domain knowledge patterns if detected
2426
- domain_factors = [f for f in confidence_analysis['reasoning_factors'] if f.startswith("Domain knowledge:")]
2756
+ domain_factors = [
2757
+ f
2758
+ for f in confidence_analysis["reasoning_factors"]
2759
+ if f.startswith("Domain knowledge:")
2760
+ ]
2427
2761
  if domain_factors:
2428
- logger.debug(f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}")
2762
+ logger.debug(
2763
+ f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}"
2764
+ )
2429
2765
 
2430
2766
  # Log detailed reasoning for medium or lower confidence relationships
2431
- if confidence_analysis['confidence_score'] < 0.6:
2767
+ if confidence_analysis["confidence_score"] < 0.6:
2432
2768
  logger.debug(f"Confidence reasoning for {left_table} -> {right_table}:")
2433
- for factor in confidence_analysis['reasoning_factors']:
2769
+ for factor in confidence_analysis["reasoning_factors"]:
2434
2770
  logger.debug(f" - {factor}")
2435
2771
 
2436
2772
  # Log very high confidence relationships with their evidence
2437
- elif confidence_analysis['confidence_score'] >= 0.8:
2438
- logger.debug(f"High confidence relationship {left_table} -> {right_table} based on:")
2439
- for factor in confidence_analysis['reasoning_factors'][:3]: # Top 3 factors
2773
+ elif confidence_analysis["confidence_score"] >= 0.8:
2774
+ logger.debug(
2775
+ f"High confidence relationship {left_table} -> {right_table} based on:"
2776
+ )
2777
+ for factor in confidence_analysis["reasoning_factors"][:3]: # Top 3 factors
2440
2778
  logger.debug(f" + {factor}")
2441
-
2779
+
2442
2780
  # Determine relationship type based on cardinality
2443
2781
  if left_card == "1" and right_card == "1":
2444
2782
  rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2449,7 +2787,7 @@ def _infer_relationships(
2449
2787
  else:
2450
2788
  # Default to many_to_one for backward compatibility
2451
2789
  rel_type = semantic_model_pb2.RelationshipType.many_to_one
2452
-
2790
+
2453
2791
  relationship = semantic_model_pb2.Relationship(
2454
2792
  name=f"{left_table}_to_{right_table}",
2455
2793
  left_table=left_table,
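The hunk above chooses a protobuf relationship type from the inferred cardinalities, defaulting to many-to-one for backward compatibility. A standalone sketch of that mapping follows, using string stand-ins for the `semantic_model_pb2.RelationshipType` members; the intermediate branch is an assumption, since the diff only shows the one-to-one case and the default.

def relationship_type(left_card: str, right_card: str) -> str:
    # String stand-ins for semantic_model_pb2.RelationshipType members.
    if left_card == "1" and right_card == "1":
        return "one_to_one"
    if left_card == "many" and right_card == "1":  # assumed intermediate branch
        return "many_to_one"
    # Default to many_to_one for backward compatibility, as noted in the diff.
    return "many_to_one"


print(relationship_type("1", "1"))     # one_to_one
print(relationship_type("many", "1"))  # many_to_one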
@@ -2472,9 +2810,13 @@ def _infer_relationships(
2472
2810
 
2473
2811
  if many_to_many_relationships:
2474
2812
  relationships.extend(many_to_many_relationships)
2475
- logger.info(f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables")
2813
+ logger.info(
2814
+ f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
2815
+ )
2476
2816
 
2477
- logger.info(f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables")
2817
+ logger.info(
2818
+ f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
2819
+ )
2478
2820
  return relationships
2479
2821
 
2480
2822
 
@@ -2512,7 +2854,14 @@ def _raw_table_to_semantic_context_table(
2512
2854
  base_type = _base_type_from_type(col.column_type)
2513
2855
  if _is_time_like_column(col):
2514
2856
  time_data_type = col.column_type
2515
- if time_data_type.split("(")[0].upper() in {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}:
2857
+ if time_data_type.split("(")[0].upper() in {
2858
+ "STRING",
2859
+ "VARCHAR",
2860
+ "TEXT",
2861
+ "CHAR",
2862
+ "CHARACTER",
2863
+ "NVARCHAR",
2864
+ }:
2516
2865
  time_data_type = "TIMESTAMP_NTZ"
2517
2866
  time_dimension_name = _safe_semantic_identifier(
2518
2867
  col.column_name,
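The hunk above reformats the check that coerces time-like columns stored as string types to TIMESTAMP_NTZ when building the time dimension. A compact sketch of that base-type check follows; the set of string type names is copied from the diff, while the function name is illustrative.

_STRING_TYPES = {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}


def time_dimension_type(column_type: str) -> str:
    # Strip any length/precision suffix, e.g. "VARCHAR(64)" -> "VARCHAR".
    base = column_type.split("(")[0].strip().upper()
    # Time-like values stored as strings are surfaced as TIMESTAMP_NTZ.
    return "TIMESTAMP_NTZ" if base in _STRING_TYPES else column_type


print(time_dimension_type("VARCHAR(64)"))  # TIMESTAMP_NTZ
print(time_dimension_type("DATE"))         # DATE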
@@ -2564,7 +2913,9 @@ def _raw_table_to_semantic_context_table(
2564
2913
  data_type=col.column_type,
2565
2914
  sample_values=col.values,
2566
2915
  synonyms=[_PLACEHOLDER_COMMENT],
2567
- description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
2916
+ description=(
2917
+ col.comment if col.comment else _PLACEHOLDER_COMMENT
2918
+ ),
2568
2919
  )
2569
2920
  )
2570
2921
  continue
@@ -2685,7 +3036,9 @@ def raw_schema_to_semantic_context(
2685
3036
  unique_database_schema.append(fqn_databse_schema)
2686
3037
 
2687
3038
  logger.info(f"Pulling column information from {fqn_table}")
2688
- _notify(f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}...")
3039
+ _notify(
3040
+ f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}..."
3041
+ )
2689
3042
  valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
2690
3043
  session=conn,
2691
3044
  workspace=fqn_table.database,
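The hunk above wraps the per-table progress notification emitted before column metadata is pulled for each fully qualified table. A trivial sketch of composing that message; the `FQNTable` dataclass is a stand-in with the attribute names used in the diff, not the package's actual FQN type.

from dataclasses import dataclass


@dataclass
class FQNTable:
    # Stand-in with the attribute names used in the diff.
    database: str
    schema_name: str
    table: str


def metadata_progress_message(fqn: FQNTable) -> str:
    return f"Fetching metadata for {fqn.database}.{fqn.schema_name}.{fqn.table}..."


print(metadata_progress_message(FQNTable("sales_db", "public", "orders")))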
@@ -2751,7 +3104,9 @@ def raw_schema_to_semantic_context(
2751
3104
  semantic_model_name,
2752
3105
  actual_model,
2753
3106
  )
2754
- _notify("Running DashScope enrichment to enhance descriptions and metrics...")
3107
+ _notify(
3108
+ "Running DashScope enrichment to enhance descriptions and metrics..."
3109
+ )
2755
3110
 
2756
3111
  # Create progress tracker for enrichment
2757
3112
  def enrichment_progress_callback(update):
@@ -2760,14 +3115,16 @@ def raw_schema_to_semantic_context(
2760
3115
  EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
2761
3116
  EnrichmentStage.MODEL_METRICS: "Generating model-level metrics",
2762
3117
  EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
2763
- EnrichmentStage.COMPLETE: "Enrichment complete"
3118
+ EnrichmentStage.COMPLETE: "Enrichment complete",
2764
3119
  }
2765
3120
 
2766
3121
  base_message = stage_messages.get(update.stage, "Processing")
2767
3122
  if update.table_name:
2768
3123
  message = f"{base_message} - {update.table_name} ({update.current_step}/{update.total_steps})"
2769
3124
  elif update.total_steps > 1:
2770
- message = f"{base_message} ({update.current_step}/{update.total_steps})"
3125
+ message = (
3126
+ f"{base_message} ({update.current_step}/{update.total_steps})"
3127
+ )
2771
3128
  else:
2772
3129
  message = base_message
2773
3130
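The hunk above builds human-readable enrichment progress messages from a stage label plus step counters and an optional table name. The sketch below reproduces that formatting with a simplified update object and string stage keys standing in for the package's `EnrichmentStage` values.

from dataclasses import dataclass
from typing import Optional


@dataclass
class Update:
    # Simplified stand-in for the enrichment progress update object.
    stage: str
    current_step: int
    total_steps: int
    table_name: Optional[str] = None


STAGE_MESSAGES = {
    "MODEL_DESCRIPTION": "Generating model description",
    "MODEL_METRICS": "Generating model-level metrics",
    "VERIFIED_QUERIES": "Generating verified queries",
    "COMPLETE": "Enrichment complete",
}


def format_progress(update: Update) -> str:
    base = STAGE_MESSAGES.get(update.stage, "Processing")
    if update.table_name:
        return f"{base} - {update.table_name} ({update.current_step}/{update.total_steps})"
    if update.total_steps > 1:
        return f"{base} ({update.current_step}/{update.total_steps})"
    return base


print(format_progress(Update("MODEL_METRICS", 2, 5)))
# Generating model-level metrics (2/5)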
 
@@ -2801,7 +3158,9 @@ def raw_schema_to_semantic_context(
2801
3158
  )
2802
3159
  _notify("DashScope enrichment complete.")
2803
3160
  else:
2804
- logger.warning("LLM enrichment was requested but DashScope is not configured; skipping enrichment.")
3161
+ logger.warning(
3162
+ "LLM enrichment was requested but DashScope is not configured; skipping enrichment."
3163
+ )
2805
3164
  _notify("DashScope configuration missing; skipped enrichment.")
2806
3165
  return context
2807
3166
 
@@ -2938,6 +3297,7 @@ def generate_model_str_from_clickzetta(
2938
3297
  Returns:
2939
3298
  str: The raw string of the semantic context.
2940
3299
  """
3300
+
2941
3301
  def _notify(message: str) -> None:
2942
3302
  if progress_callback:
2943
3303
  try:
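The hunks in this function show a `_notify` helper that forwards progress messages to an optional callback and logs (rather than raises) when the callback fails. A self-contained sketch of that guard follows; the broad `except Exception` is an assumption, since the diff elides the exact except clause.

from typing import Callable, Optional

from loguru import logger


def make_notifier(
    progress_callback: Optional[Callable[[str], None]],
) -> Callable[[str], None]:
    def _notify(message: str) -> None:
        if progress_callback:
            try:
                progress_callback(message)
            except Exception:
                # Mirror the diff: a failing callback must not abort generation.
                logger.debug("Progress callback failed for message: {}", message)

    return _notify


notify = make_notifier(print)
notify("Collecting metadata from ClickZetta tables...")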
@@ -2946,7 +3306,11 @@ def generate_model_str_from_clickzetta(
2946
3306
  logger.debug("Progress callback failed for message: {}", message)
2947
3307
 
2948
3308
  table_list = ", ".join(base_tables)
2949
- logger.info("Generating semantic model '{}' from tables: {}", semantic_model_name, table_list)
3309
+ logger.info(
3310
+ "Generating semantic model '{}' from tables: {}",
3311
+ semantic_model_name,
3312
+ table_list,
3313
+ )
2950
3314
  _notify("Collecting metadata from ClickZetta tables...")
2951
3315
 
2952
3316
  context = raw_schema_to_semantic_context(