clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/METADATA +5 -5
  2. clickzetta_semantic_model_generator-1.0.4.dist-info/RECORD +38 -0
  3. semantic_model_generator/clickzetta_utils/clickzetta_connector.py +100 -48
  4. semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. semantic_model_generator/clickzetta_utils/utils.py +44 -2
  6. semantic_model_generator/data_processing/cte_utils.py +44 -14
  7. semantic_model_generator/generate_model.py +711 -239
  8. semantic_model_generator/llm/dashscope_client.py +4 -2
  9. semantic_model_generator/llm/enrichment.py +144 -57
  10. semantic_model_generator/llm/progress_tracker.py +16 -15
  11. semantic_model_generator/relationships/__init__.py +2 -0
  12. semantic_model_generator/relationships/discovery.py +181 -16
  13. semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  14. semantic_model_generator/tests/cte_utils_test.py +15 -14
  15. semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  16. semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  17. semantic_model_generator/tests/relationship_discovery_test.py +70 -3
  18. semantic_model_generator/tests/relationships_filters_test.py +166 -30
  19. semantic_model_generator/tests/utils_test.py +1 -1
  20. semantic_model_generator/validate/keywords.py +453 -53
  21. semantic_model_generator/validate/schema.py +4 -2
  22. clickzetta_semantic_model_generator-1.0.2.dist-info/RECORD +0 -38
  23. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/LICENSE +0 -0
  24. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/WHEEL +0 -0
@@ -1,6 +1,7 @@
+import math
 import os
 import re
-import math
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -8,8 +9,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from clickzetta.zettapark.session import Session
 from loguru import logger
 
-from semantic_model_generator.data_processing import data_types, proto_utils
-from semantic_model_generator.protos import semantic_model_pb2
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     AUTOGEN_TOKEN,
     DIMENSION_DATATYPES,
@@ -19,15 +18,25 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
-from semantic_model_generator.validate.context_length import validate_context_length
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
+from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
     DashscopeSettings,
     enrich_semantic_model,
     get_dashscope_settings,
 )
-from semantic_model_generator.llm.progress_tracker import EnrichmentProgressTracker, EnrichmentStage
+from semantic_model_generator.llm.progress_tracker import (
+    EnrichmentProgressTracker,
+    EnrichmentStage,
+)
+from semantic_model_generator.protos import semantic_model_pb2
+from semantic_model_generator.validate.context_length import validate_context_length
 from semantic_model_generator.validate.keywords import CZ_RESERVED_WORDS
 
 _PLACEHOLDER_COMMENT = " "
@@ -38,6 +47,15 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
+
 
 def _singularize(token: str) -> str:
     if token.endswith("IES") and len(token) > 3:
@@ -68,7 +86,9 @@ def _base_type_from_type(column_type: str) -> str:
     return token.split("(")[0]
 
 
-def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -> List[str]:
+def _identifier_tokens(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> List[str]:
     name = name.replace("-", "_")
     raw_tokens = re.split(r"[^0-9A-Za-z]+", name)
     tokens: List[str] = []
@@ -84,7 +104,17 @@ def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -
     return tokens
 
 
-def _sanitize_identifier_name(name: str, prefixes_to_drop: Optional[set[str]] = None) -> str:
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
+
+
+def _sanitize_identifier_name(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> str:
     if not name:
         return ""
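To make the intent of the new _is_generic_identifier helper concrete, here is a small illustrative sketch (the sample names are hypothetical, and the expected results assume _identifier_tokens simply splits on non-alphanumeric characters):

    # Columns built only from generic tokens ("ID", "KEY", "VALUE", ...) are
    # flagged so that a join matched purely on such names can be penalised
    # later: _calculate_relationship_confidence subtracts
    # min(0.15 * generic_pair_count, 0.3) when both sides are generic.
    for name in ("ID", "KEY_VALUE", "CUSTOMER_ID", "ORDER_NUMBER"):
        print(name, _is_generic_identifier(name))
    # Expected: ID -> True, KEY_VALUE -> True,
    #           CUSTOMER_ID -> False, ORDER_NUMBER -> False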
 
@@ -271,7 +301,9 @@ def _looks_like_primary_key(table_name: str, column_name: str) -> bool:
         "PRIMARY_KEY",
     }
     for variant in variants:
-        direct_matches.update({f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"})
+        direct_matches.update(
+            {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
+        )
     if upper_name in direct_matches:
         return True
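As a concrete walk-through of the direct-match logic above, consider a hypothetical table whose variants include CUSTOMER (the seed entries shown below are an assumption; only "PRIMARY_KEY" is visible in this hunk):

    direct_matches = {"ID", "KEY", "PK", "PRIMARY_KEY"}
    for variant in {"CUSTOMER"}:
        direct_matches.update(
            {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
        )
    # CUSTOMER_ID, CUSTOMERID, CUSTOMER_KEY and CUSTOMERKEY now all count
    # as primary-key candidates.
    assert {"CUSTOMER_ID", "CUSTOMERID", "CUSTOMER_KEY", "CUSTOMERKEY"} <= direct_matches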
 
@@ -344,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL (without quoting) by stripping quotes and uppercasing.
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    if not name:
-        return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 
 
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name without quoting.
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table) if part]
-    return ".".join(_format_sql_identifier(part) for part in parts if part)
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 
 
 def _levenshtein_distance(s1: str, s2: str) -> int:
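Before the edit-distance helpers below, a quick sketch of what the quoting change above is expected to produce. The helpers come from clickzetta_utils/utils.py, which also changed in this release but is not shown here, so the exact escaping rules are an assumption:

    # Assumed behaviour of the imported helpers (sketch only):
    assert quote_identifier("orders") == "`orders`"
    assert join_quoted_identifiers("db", "sales", "orders") == "`db`.`sales`.`orders`"
    # Net effect: _qualified_table_name for database "db", schema "sales",
    # table "orders" now yields `db`.`sales`.`orders` rather than the
    # previous unquoted, uppercased DB.SALES.ORDERS.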
@@ -368,7 +398,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
         return _levenshtein_distance(s2, s1)
     if len(s2) == 0:
         return len(s1)
-
+
     previous_row = range(len(s2) + 1)
     for i, c1 in enumerate(s1):
         current_row = [i + 1]
@@ -378,7 +408,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
             substitutions = previous_row[j] + (c1 != c2)
             current_row.append(min(insertions, deletions, substitutions))
         previous_row = current_row
-
+
     return previous_row[-1]
 
 
@@ -389,26 +419,26 @@ def _name_similarity(name1: str, name2: str) -> float:
     """
     if not name1 or not name2:
         return 0.0
-
+
     # Exact match
     if name1.upper() == name2.upper():
         return 1.0
-
+
     # Normalize names for comparison
     norm1 = name1.upper().replace("_", "").replace("-", "")
    norm2 = name2.upper().replace("_", "").replace("-", "")
-
+
     if norm1 == norm2:
         return 0.95
-
+
     # Calculate Levenshtein-based similarity
     max_len = max(len(norm1), len(norm2))
     if max_len == 0:
         return 0.0
-
+
     distance = _levenshtein_distance(norm1, norm2)
     similarity = 1.0 - (distance / max_len)
-
+
     return max(0.0, similarity)
 
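A worked example of the score this function assigns; the numbers follow directly from the formula above rather than from running the package:

    # "ORDER_ID" vs "ORDERID": not equal case-insensitively, but both
    # normalise to "ORDERID", so the near-exact score 0.95 is returned.
    print(_name_similarity("ORDER_ID", "ORDERID"))      # 0.95
    # "CUST_ID" vs "CUSTOMER_ID": normalised to "CUSTID" and "CUSTOMERID",
    # Levenshtein distance 4, max length 10 -> 1 - 4/10 = 0.6.
    print(_name_similarity("CUST_ID", "CUSTOMER_ID"))   # 0.6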
 
@@ -427,17 +457,24 @@ def _analyze_composite_key_patterns(
427
457
  Dict with composite key analysis results
428
458
  """
429
459
  pk_candidates = table_meta.get("pk_candidates", {})
430
- columns_meta = table_meta.get("columns", {})
431
460
 
432
461
  # Check if all relationship columns form a composite key
433
- relationship_cols = [pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs]
462
+ relationship_cols = [
463
+ pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs
464
+ ]
434
465
 
435
466
  # Normalize column names for comparison
436
467
  global_prefixes = set() # This should come from context but we'll handle it locally
437
- table_prefixes = _table_prefixes(list(table_meta.get("columns", {}).keys())[0] if table_meta.get("columns") else "")
468
+ table_prefixes = _table_prefixes(
469
+ list(table_meta.get("columns", {}).keys())[0]
470
+ if table_meta.get("columns")
471
+ else ""
472
+ )
438
473
 
439
474
  normalized_rel_cols = [
440
- _sanitize_identifier_name(col, prefixes_to_drop=global_prefixes | table_prefixes)
475
+ _sanitize_identifier_name(
476
+ col, prefixes_to_drop=global_prefixes | table_prefixes
477
+ )
441
478
  for col in relationship_cols
442
479
  ]
443
480
 
@@ -448,7 +485,9 @@ def _analyze_composite_key_patterns(
448
485
  analysis = {
449
486
  "is_composite_pk": pk_col_count > 1 and pk_col_count == total_pk_candidates,
450
487
  "partial_pk": pk_col_count > 0 and pk_col_count < total_pk_candidates,
451
- "pk_coverage_ratio": pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0,
488
+ "pk_coverage_ratio": (
489
+ pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0
490
+ ),
452
491
  "relationship_column_count": len(relationship_cols),
453
492
  "pk_column_count": pk_col_count,
454
493
  }
@@ -457,7 +496,10 @@ def _analyze_composite_key_patterns(
457
496
  if len(relationship_cols) > 1:
458
497
  sequential_patterns = []
459
498
  for col in relationship_cols:
460
- if any(pattern in col.upper() for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]):
499
+ if any(
500
+ pattern in col.upper()
501
+ for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]
502
+ ):
461
503
  sequential_patterns.append(col)
462
504
 
463
505
  analysis["sequential_id_pattern"] = len(sequential_patterns) >= 2
@@ -504,9 +546,12 @@ def _infer_composite_cardinality(
504
546
  # Rule 3: Composite key uniqueness analysis (if we have sufficient samples)
505
547
  MIN_SAMPLE_SIZE = 20 # Lower threshold for composite keys
506
548
 
507
- if (left_values_all and right_values_all and
508
- len(left_values_all) >= MIN_SAMPLE_SIZE and
509
- len(right_values_all) >= MIN_SAMPLE_SIZE):
549
+ if (
550
+ left_values_all
551
+ and right_values_all
552
+ and len(left_values_all) >= MIN_SAMPLE_SIZE
553
+ and len(right_values_all) >= MIN_SAMPLE_SIZE
554
+ ):
510
555
 
511
556
  # Create composite keys by concatenating values
512
557
  left_composite_keys = []
@@ -515,10 +560,12 @@ def _infer_composite_cardinality(
515
560
  sample_size = min(len(left_values_all), len(right_values_all))
516
561
 
517
562
  for i in range(sample_size):
518
- left_key = "|".join(str(vals[i]) if i < len(vals) else ""
519
- for vals in left_values_all)
520
- right_key = "|".join(str(vals[i]) if i < len(vals) else ""
521
- for vals in right_values_all)
563
+ left_key = "|".join(
564
+ str(vals[i]) if i < len(vals) else "" for vals in left_values_all
565
+ )
566
+ right_key = "|".join(
567
+ str(vals[i]) if i < len(vals) else "" for vals in right_values_all
568
+ )
522
569
 
523
570
  if left_key and not _is_nullish(left_key):
524
571
  left_composite_keys.append(left_key)
@@ -527,7 +574,9 @@ def _infer_composite_cardinality(
527
574
 
528
575
  if left_composite_keys and right_composite_keys:
529
576
  left_unique_ratio = len(set(left_composite_keys)) / len(left_composite_keys)
530
- right_unique_ratio = len(set(right_composite_keys)) / len(right_composite_keys)
577
+ right_unique_ratio = len(set(right_composite_keys)) / len(
578
+ right_composite_keys
579
+ )
531
580
 
532
581
  # Lower threshold for composite key uniqueness
533
582
  if right_unique_ratio > 0.9:
@@ -561,6 +610,7 @@ def _infer_composite_cardinality(
561
610
  adaptive_thresholds=adaptive_thresholds,
562
611
  )
563
612
 
613
+
564
614
  def _detect_bridge_table_pattern(
565
615
  table_meta: Dict[str, Any],
566
616
  all_tables_meta: Dict[str, Dict[str, Any]],
@@ -606,7 +656,9 @@ def _detect_bridge_table_pattern(
606
656
  base_type = col_info.get("base_type", "")
607
657
 
608
658
  # Check if column looks like an ID/foreign key
609
- if any(pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]):
659
+ if any(
660
+ pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]
661
+ ):
610
662
  id_columns.append(original_name)
611
663
 
612
664
  # Check if this could be a foreign key to another table
@@ -615,11 +667,13 @@ def _detect_bridge_table_pattern(
615
667
  continue
616
668
 
617
669
  if _looks_like_foreign_key(table_name, other_table_name, original_name):
618
- fk_like_columns.append({
619
- "column": original_name,
620
- "references_table": other_table_name,
621
- "confidence": 0.8
622
- })
670
+ fk_like_columns.append(
671
+ {
672
+ "column": original_name,
673
+ "references_table": other_table_name,
674
+ "confidence": 0.8,
675
+ }
676
+ )
623
677
  break
624
678
 
625
679
  # Check if column name contains the other table name
@@ -628,11 +682,13 @@ def _detect_bridge_table_pattern(
628
682
 
629
683
  for variant in other_variants:
630
684
  if variant in col_tokens:
631
- fk_like_columns.append({
632
- "column": original_name,
633
- "references_table": other_table_name,
634
- "confidence": 0.6
635
- })
685
+ fk_like_columns.append(
686
+ {
687
+ "column": original_name,
688
+ "references_table": other_table_name,
689
+ "confidence": 0.6,
690
+ }
691
+ )
636
692
  break
637
693
  else:
638
694
  # Count descriptive/non-ID columns
@@ -680,8 +736,18 @@ def _detect_bridge_table_pattern(
680
736
  # Name-based heuristics
681
737
  table_upper = table_name.upper()
682
738
  bridge_keywords = {
683
- "BRIDGE", "JUNCTION", "LINK", "ASSOC", "ASSOCIATION", "REL", "RELATIONSHIP",
684
- "MAP", "MAPPING", "XREF", "CROSS_REF", "CONNECTOR"
739
+ "BRIDGE",
740
+ "JUNCTION",
741
+ "LINK",
742
+ "ASSOC",
743
+ "ASSOCIATION",
744
+ "REL",
745
+ "RELATIONSHIP",
746
+ "MAP",
747
+ "MAPPING",
748
+ "XREF",
749
+ "CROSS_REF",
750
+ "CONNECTOR",
685
751
  }
686
752
 
687
753
  for keyword in bridge_keywords:
@@ -708,7 +774,9 @@ def _detect_bridge_table_pattern(
708
774
 
709
775
  is_bridge = confidence >= 0.6 # Threshold for bridge table classification
710
776
 
711
- connected_tables = [fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5]
777
+ connected_tables = [
778
+ fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5
779
+ ]
712
780
 
713
781
  return {
714
782
  "is_bridge": is_bridge,
@@ -718,14 +786,14 @@ def _detect_bridge_table_pattern(
718
786
  "fk_ratio": fk_ratio,
719
787
  "id_ratio": id_ratio,
720
788
  "total_columns": total_columns,
721
- "descriptive_columns": descriptive_columns
789
+ "descriptive_columns": descriptive_columns,
722
790
  }
723
791
 
724
792
 
725
793
  def _detect_many_to_many_relationships(
726
794
  raw_tables: List[tuple[data_types.FQNParts, data_types.Table]],
727
795
  metadata: Dict[str, Dict[str, Any]],
728
- existing_relationships: List[semantic_model_pb2.Relationship]
796
+ existing_relationships: List[semantic_model_pb2.Relationship],
729
797
  ) -> List[semantic_model_pb2.Relationship]:
730
798
  """
731
799
  Detect many-to-many relationships through bridge table analysis.
@@ -746,7 +814,10 @@ def _detect_many_to_many_relationships(
746
814
  for table_name, table_meta in metadata.items():
747
815
  bridge_analysis = _detect_bridge_table_pattern(table_meta, metadata)
748
816
 
749
- if bridge_analysis["is_bridge"] and len(bridge_analysis["connected_tables"]) >= 2:
817
+ if (
818
+ bridge_analysis["is_bridge"]
819
+ and len(bridge_analysis["connected_tables"]) >= 2
820
+ ):
750
821
  bridge_tables[table_name] = bridge_analysis
751
822
 
752
823
  logger.debug(
@@ -780,9 +851,15 @@ def _detect_many_to_many_relationships(
780
851
  right_fk_cols = []
781
852
 
782
853
  for fk_info in bridge_info["fk_like_columns"]:
783
- if fk_info["references_table"] == left_table and fk_info["confidence"] >= 0.5:
854
+ if (
855
+ fk_info["references_table"] == left_table
856
+ and fk_info["confidence"] >= 0.5
857
+ ):
784
858
  left_fk_cols.append(fk_info["column"])
785
- elif fk_info["references_table"] == right_table and fk_info["confidence"] >= 0.5:
859
+ elif (
860
+ fk_info["references_table"] == right_table
861
+ and fk_info["confidence"] >= 0.5
862
+ ):
786
863
  right_fk_cols.append(fk_info["column"])
787
864
 
788
865
  if not left_fk_cols or not right_fk_cols:
@@ -806,8 +883,12 @@ def _detect_many_to_many_relationships(
806
883
  # Use the first detected FK columns as a representative
807
884
  relationship.relationship_columns.append(
808
885
  semantic_model_pb2.RelationKey(
809
- left_column=left_fk_cols[0], # This is actually in the bridge table
810
- right_column=right_fk_cols[0], # This is also in the bridge table
886
+ left_column=left_fk_cols[
887
+ 0
888
+ ], # This is actually in the bridge table
889
+ right_column=right_fk_cols[
890
+ 0
891
+ ], # This is also in the bridge table
811
892
  )
812
893
  )
813
894
 
@@ -863,13 +944,19 @@ def _calculate_relationship_confidence(
863
944
  pk_confidence = 0.4
864
945
  confidence_score += pk_confidence
865
946
  if left_has_pk and right_has_pk:
866
- reasoning_factors.append("Both sides have primary key metadata (very strong evidence)")
947
+ reasoning_factors.append(
948
+ "Both sides have primary key metadata (very strong evidence)"
949
+ )
867
950
  evidence_details["pk_evidence"] = "both_pk"
868
951
  elif right_has_pk:
869
- reasoning_factors.append("Right side has primary key metadata (strong evidence)")
952
+ reasoning_factors.append(
953
+ "Right side has primary key metadata (strong evidence)"
954
+ )
870
955
  evidence_details["pk_evidence"] = "right_pk"
871
956
  elif left_has_pk:
872
- reasoning_factors.append("Left side has primary key metadata (strong evidence)")
957
+ reasoning_factors.append(
958
+ "Left side has primary key metadata (strong evidence)"
959
+ )
873
960
  evidence_details["pk_evidence"] = "left_pk"
874
961
 
875
962
  # Factor 2: Name similarity and pattern matching
@@ -884,28 +971,53 @@ def _calculate_relationship_confidence(
884
971
 
885
972
  if avg_name_similarity >= 0.9:
886
973
  name_confidence = 0.25
887
- reasoning_factors.append(f"Very high column name similarity ({avg_name_similarity:.2f})")
974
+ reasoning_factors.append(
975
+ f"Very high column name similarity ({avg_name_similarity:.2f})"
976
+ )
888
977
  elif avg_name_similarity >= 0.7:
889
978
  name_confidence = 0.2
890
- reasoning_factors.append(f"High column name similarity ({avg_name_similarity:.2f})")
979
+ reasoning_factors.append(
980
+ f"High column name similarity ({avg_name_similarity:.2f})"
981
+ )
891
982
  elif avg_name_similarity >= 0.5:
892
983
  name_confidence = 0.15
893
- reasoning_factors.append(f"Moderate column name similarity ({avg_name_similarity:.2f})")
984
+ reasoning_factors.append(
985
+ f"Moderate column name similarity ({avg_name_similarity:.2f})"
986
+ )
894
987
  elif avg_name_similarity >= 0.3:
895
988
  name_confidence = 0.1
896
- reasoning_factors.append(f"Low column name similarity ({avg_name_similarity:.2f})")
989
+ reasoning_factors.append(
990
+ f"Low column name similarity ({avg_name_similarity:.2f})"
991
+ )
897
992
  else:
898
993
  name_confidence = 0.05
899
- reasoning_factors.append(f"Very low column name similarity ({avg_name_similarity:.2f})")
994
+ reasoning_factors.append(
995
+ f"Very low column name similarity ({avg_name_similarity:.2f})"
996
+ )
900
997
 
901
998
  confidence_score += name_confidence
902
999
 
1000
+ generic_pair_count = sum(
1001
+ 1
1002
+ for left_col, right_col in column_pairs
1003
+ if _is_generic_identifier(left_col)
1004
+ and _is_generic_identifier(right_col)
1005
+ )
1006
+ if generic_pair_count:
1007
+ penalty = min(0.15 * generic_pair_count, 0.3)
1008
+ confidence_score = max(confidence_score - penalty, 0.0)
1009
+ reasoning_factors.append(
1010
+ f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
1011
+ )
1012
+
903
1013
  # Check for foreign key naming patterns
904
1014
  fk_pattern_confidence = 0.0
905
1015
  for left_col, right_col in column_pairs:
906
1016
  if _looks_like_foreign_key(left_table, right_table, left_col):
907
1017
  fk_pattern_confidence += 0.1
908
- reasoning_factors.append(f"Column '{left_col}' follows FK naming pattern")
1018
+ reasoning_factors.append(
1019
+ f"Column '{left_col}' follows FK naming pattern"
1020
+ )
909
1021
 
910
1022
  confidence_score += min(fk_pattern_confidence, 0.2)
911
1023
 
@@ -927,29 +1039,45 @@ def _calculate_relationship_confidence(
927
1039
 
928
1040
  # Check if uniqueness pattern matches inferred cardinality
929
1041
  left_card, right_card = cardinality_result
930
- uniqueness_threshold = adaptive_thresholds.get("uniqueness_threshold", 0.95) if adaptive_thresholds else 0.95
1042
+ uniqueness_threshold = (
1043
+ adaptive_thresholds.get("uniqueness_threshold", 0.95)
1044
+ if adaptive_thresholds
1045
+ else 0.95
1046
+ )
931
1047
 
932
1048
  cardinality_consistency = False
933
1049
  if left_card == "1" and left_unique_ratio > uniqueness_threshold:
934
1050
  cardinality_consistency = True
935
- elif left_card in ("*", "+") and left_unique_ratio <= uniqueness_threshold:
1051
+ elif (
1052
+ left_card in ("*", "+")
1053
+ and left_unique_ratio <= uniqueness_threshold
1054
+ ):
936
1055
  cardinality_consistency = True
937
1056
 
938
1057
  if right_card == "1" and right_unique_ratio > uniqueness_threshold:
939
1058
  cardinality_consistency = cardinality_consistency and True
940
- elif right_card in ("*", "+") and right_unique_ratio <= uniqueness_threshold:
1059
+ elif (
1060
+ right_card in ("*", "+")
1061
+ and right_unique_ratio <= uniqueness_threshold
1062
+ ):
941
1063
  cardinality_consistency = cardinality_consistency and True
942
1064
 
943
1065
  if cardinality_consistency:
944
1066
  uniqueness_confidence = 0.2
945
- reasoning_factors.append("Sample uniqueness patterns support inferred cardinality")
1067
+ reasoning_factors.append(
1068
+ "Sample uniqueness patterns support inferred cardinality"
1069
+ )
946
1070
  else:
947
1071
  uniqueness_confidence = 0.1
948
- reasoning_factors.append("Sample uniqueness patterns partially support cardinality")
1072
+ reasoning_factors.append(
1073
+ "Sample uniqueness patterns partially support cardinality"
1074
+ )
949
1075
 
950
1076
  confidence_score += uniqueness_confidence
951
1077
  else:
952
- reasoning_factors.append(f"Limited sample size ({sample_size}) reduces confidence")
1078
+ reasoning_factors.append(
1079
+ f"Limited sample size ({sample_size}) reduces confidence"
1080
+ )
953
1081
 
954
1082
  # Factor 4: Data type compatibility
955
1083
  if column_pairs and left_meta and right_meta:
@@ -992,15 +1120,21 @@ def _calculate_relationship_confidence(
992
1120
  evidence_details["left_table_role"] = left_role
993
1121
  evidence_details["right_table_role"] = right_role
994
1122
 
995
- relationship_context = _get_business_relationship_context(left_table, right_table, left_role, right_role)
1123
+ relationship_context = _get_business_relationship_context(
1124
+ left_table, right_table, left_role, right_role
1125
+ )
996
1126
  evidence_details["relationship_context"] = relationship_context
997
1127
 
998
1128
  if relationship_context in ["fact_to_dimension", "dimension_to_fact"]:
999
1129
  role_confidence = 0.15
1000
- reasoning_factors.append(f"Strong business relationship pattern: {relationship_context}")
1130
+ reasoning_factors.append(
1131
+ f"Strong business relationship pattern: {relationship_context}"
1132
+ )
1001
1133
  elif relationship_context in ["dimension_hierarchy", "bridge_relationship"]:
1002
1134
  role_confidence = 0.1
1003
- reasoning_factors.append(f"Valid business relationship pattern: {relationship_context}")
1135
+ reasoning_factors.append(
1136
+ f"Valid business relationship pattern: {relationship_context}"
1137
+ )
1004
1138
  elif relationship_context == "fact_to_fact":
1005
1139
  role_confidence = 0.05
1006
1140
  reasoning_factors.append("Unusual but possible fact-to-fact relationship")
@@ -1013,7 +1147,9 @@ def _calculate_relationship_confidence(
1013
1147
  # Factor 6: Multiple column relationships (composite keys)
1014
1148
  if len(column_pairs) > 1:
1015
1149
  composite_confidence = 0.1
1016
- reasoning_factors.append(f"Multi-column relationship ({len(column_pairs)} columns) increases confidence")
1150
+ reasoning_factors.append(
1151
+ f"Multi-column relationship ({len(column_pairs)} columns) increases confidence"
1152
+ )
1017
1153
  confidence_score += composite_confidence
1018
1154
 
1019
1155
  # Normalize confidence score to 0-1 range
@@ -1043,7 +1179,9 @@ def _calculate_relationship_confidence(
1043
1179
  "reasoning_factors": reasoning_factors,
1044
1180
  "evidence_details": evidence_details,
1045
1181
  "inferred_cardinality": f"{cardinality_result[0]}:{cardinality_result[1]}",
1046
- "join_type": "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER",
1182
+ "join_type": (
1183
+ "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
1184
+ ),
1047
1185
  "column_count": len(column_pairs),
1048
1186
  }
1049
1187
 
@@ -1059,101 +1197,196 @@ def _get_domain_knowledge_patterns() -> Dict[str, Any]:
1059
1197
  # Common business entity patterns
1060
1198
  "business_entities": {
1061
1199
  "customer": {
1062
- "table_patterns": ["CUSTOMER", "CUST", "CLIENT", "ACCOUNT_HOLDER", "USER", "MEMBER"],
1063
- "pk_patterns": ["CUSTOMER_ID", "CUST_ID", "CLIENT_ID", "USER_ID", "MEMBER_ID"],
1064
- "typical_attributes": ["NAME", "EMAIL", "PHONE", "ADDRESS", "STATUS", "TYPE", "SEGMENT"],
1065
- "role": "dimension"
1200
+ "table_patterns": [
1201
+ "CUSTOMER",
1202
+ "CUST",
1203
+ "CLIENT",
1204
+ "ACCOUNT_HOLDER",
1205
+ "USER",
1206
+ "MEMBER",
1207
+ ],
1208
+ "pk_patterns": [
1209
+ "CUSTOMER_ID",
1210
+ "CUST_ID",
1211
+ "CLIENT_ID",
1212
+ "USER_ID",
1213
+ "MEMBER_ID",
1214
+ ],
1215
+ "typical_attributes": [
1216
+ "NAME",
1217
+ "EMAIL",
1218
+ "PHONE",
1219
+ "ADDRESS",
1220
+ "STATUS",
1221
+ "TYPE",
1222
+ "SEGMENT",
1223
+ ],
1224
+ "role": "dimension",
1066
1225
  },
1067
1226
  "product": {
1068
1227
  "table_patterns": ["PRODUCT", "ITEM", "SKU", "INVENTORY", "CATALOG"],
1069
1228
  "pk_patterns": ["PRODUCT_ID", "ITEM_ID", "SKU", "PRODUCT_KEY"],
1070
- "typical_attributes": ["NAME", "DESCRIPTION", "CATEGORY", "PRICE", "BRAND", "STATUS"],
1071
- "role": "dimension"
1229
+ "typical_attributes": [
1230
+ "NAME",
1231
+ "DESCRIPTION",
1232
+ "CATEGORY",
1233
+ "PRICE",
1234
+ "BRAND",
1235
+ "STATUS",
1236
+ ],
1237
+ "role": "dimension",
1072
1238
  },
1073
1239
  "order": {
1074
1240
  "table_patterns": ["ORDER", "TRANSACTION", "SALE", "PURCHASE"],
1075
- "pk_patterns": ["ORDER_ID", "TRANSACTION_ID", "SALE_ID", "ORDER_NUMBER"],
1241
+ "pk_patterns": [
1242
+ "ORDER_ID",
1243
+ "TRANSACTION_ID",
1244
+ "SALE_ID",
1245
+ "ORDER_NUMBER",
1246
+ ],
1076
1247
  "typical_attributes": ["DATE", "AMOUNT", "STATUS", "QUANTITY", "TOTAL"],
1077
- "role": "fact"
1248
+ "role": "fact",
1078
1249
  },
1079
1250
  "date": {
1080
1251
  "table_patterns": ["DATE", "TIME", "CALENDAR", "DIM_DATE"],
1081
1252
  "pk_patterns": ["DATE_ID", "DATE_KEY", "TIME_ID"],
1082
- "typical_attributes": ["YEAR", "MONTH", "DAY", "QUARTER", "WEEK", "WEEKDAY"],
1083
- "role": "dimension"
1253
+ "typical_attributes": [
1254
+ "YEAR",
1255
+ "MONTH",
1256
+ "DAY",
1257
+ "QUARTER",
1258
+ "WEEK",
1259
+ "WEEKDAY",
1260
+ ],
1261
+ "role": "dimension",
1084
1262
  },
1085
1263
  "location": {
1086
- "table_patterns": ["LOCATION", "GEOGRAPHY", "ADDRESS", "REGION", "TERRITORY"],
1264
+ "table_patterns": [
1265
+ "LOCATION",
1266
+ "GEOGRAPHY",
1267
+ "ADDRESS",
1268
+ "REGION",
1269
+ "TERRITORY",
1270
+ ],
1087
1271
  "pk_patterns": ["LOCATION_ID", "GEO_ID", "ADDRESS_ID", "REGION_ID"],
1088
- "typical_attributes": ["COUNTRY", "STATE", "CITY", "ZIP", "LATITUDE", "LONGITUDE"],
1089
- "role": "dimension"
1272
+ "typical_attributes": [
1273
+ "COUNTRY",
1274
+ "STATE",
1275
+ "CITY",
1276
+ "ZIP",
1277
+ "LATITUDE",
1278
+ "LONGITUDE",
1279
+ ],
1280
+ "role": "dimension",
1090
1281
  },
1091
1282
  "employee": {
1092
1283
  "table_patterns": ["EMPLOYEE", "STAFF", "WORKER", "PERSONNEL"],
1093
1284
  "pk_patterns": ["EMPLOYEE_ID", "STAFF_ID", "EMP_ID"],
1094
- "typical_attributes": ["NAME", "DEPARTMENT", "TITLE", "MANAGER", "HIRE_DATE"],
1095
- "role": "dimension"
1096
- }
1285
+ "typical_attributes": [
1286
+ "NAME",
1287
+ "DEPARTMENT",
1288
+ "TITLE",
1289
+ "MANAGER",
1290
+ "HIRE_DATE",
1291
+ ],
1292
+ "role": "dimension",
1293
+ },
1097
1294
  },
1098
-
1099
1295
  # Common relationship patterns in data warehouses
1100
1296
  "relationship_patterns": {
1101
1297
  "star_schema": {
1102
1298
  "pattern": "fact_to_dimension",
1103
1299
  "confidence_boost": 0.2,
1104
- "description": "Standard star schema fact-to-dimension relationship"
1300
+ "description": "Standard star schema fact-to-dimension relationship",
1105
1301
  },
1106
1302
  "snowflake_schema": {
1107
1303
  "pattern": "dimension_hierarchy",
1108
1304
  "confidence_boost": 0.15,
1109
- "description": "Snowflake schema dimension hierarchy"
1305
+ "description": "Snowflake schema dimension hierarchy",
1110
1306
  },
1111
1307
  "bridge_table": {
1112
1308
  "pattern": "many_to_many_via_bridge",
1113
1309
  "confidence_boost": 0.1,
1114
- "description": "Many-to-many relationship through bridge table"
1310
+ "description": "Many-to-many relationship through bridge table",
1115
1311
  },
1116
1312
  "time_dimension": {
1117
1313
  "pattern": "temporal_relationship",
1118
1314
  "confidence_boost": 0.25,
1119
- "description": "Time-based relationship (very common in warehouses)"
1120
- }
1315
+ "description": "Time-based relationship (very common in warehouses)",
1316
+ },
1121
1317
  },
1122
-
1123
1318
  # Known FK patterns that often appear in real data warehouses
1124
1319
  "common_fk_patterns": {
1125
1320
  "customer_references": [
1126
- "CUSTOMER_ID", "CUST_ID", "CLIENT_ID", "ACCOUNT_ID", "USER_ID"
1321
+ "CUSTOMER_ID",
1322
+ "CUST_ID",
1323
+ "CLIENT_ID",
1324
+ "ACCOUNT_ID",
1325
+ "USER_ID",
1127
1326
  ],
1128
1327
  "product_references": [
1129
- "PRODUCT_ID", "ITEM_ID", "SKU", "PROD_ID", "CATALOG_ID"
1328
+ "PRODUCT_ID",
1329
+ "ITEM_ID",
1330
+ "SKU",
1331
+ "PROD_ID",
1332
+ "CATALOG_ID",
1130
1333
  ],
1131
1334
  "date_references": [
1132
- "DATE_ID", "ORDER_DATE_ID", "SHIP_DATE_ID", "CREATE_DATE_ID",
1133
- "TRANSACTION_DATE_ID", "DATE_KEY"
1335
+ "DATE_ID",
1336
+ "ORDER_DATE_ID",
1337
+ "SHIP_DATE_ID",
1338
+ "CREATE_DATE_ID",
1339
+ "TRANSACTION_DATE_ID",
1340
+ "DATE_KEY",
1134
1341
  ],
1135
1342
  "location_references": [
1136
- "LOCATION_ID", "ADDRESS_ID", "SHIP_TO_ID", "BILL_TO_ID",
1137
- "WAREHOUSE_ID", "STORE_ID"
1138
- ]
1343
+ "LOCATION_ID",
1344
+ "ADDRESS_ID",
1345
+ "SHIP_TO_ID",
1346
+ "BILL_TO_ID",
1347
+ "WAREHOUSE_ID",
1348
+ "STORE_ID",
1349
+ ],
1139
1350
  },
1140
-
1141
1351
  # Table naming conventions that indicate specific patterns
1142
1352
  "naming_conventions": {
1143
1353
  "fact_indicators": [
1144
- "FACT_", "FCT_", "F_", "SALES_", "ORDERS_", "TRANSACTIONS_",
1145
- "REVENUE_", "METRICS_", "EVENTS_", "ACTIVITY_"
1354
+ "FACT_",
1355
+ "FCT_",
1356
+ "F_",
1357
+ "SALES_",
1358
+ "ORDERS_",
1359
+ "TRANSACTIONS_",
1360
+ "REVENUE_",
1361
+ "METRICS_",
1362
+ "EVENTS_",
1363
+ "ACTIVITY_",
1146
1364
  ],
1147
1365
  "dimension_indicators": [
1148
- "DIM_", "D_", "REF_", "LKP_", "LOOKUP_", "MASTER_"
1366
+ "DIM_",
1367
+ "D_",
1368
+ "REF_",
1369
+ "LKP_",
1370
+ "LOOKUP_",
1371
+ "MASTER_",
1149
1372
  ],
1150
1373
  "bridge_indicators": [
1151
- "BRG_", "BRIDGE_", "XREF_", "MAP_", "ASSOC_", "LINK_"
1374
+ "BRG_",
1375
+ "BRIDGE_",
1376
+ "XREF_",
1377
+ "MAP_",
1378
+ "ASSOC_",
1379
+ "LINK_",
1152
1380
  ],
1153
1381
  "staging_indicators": [
1154
- "STG_", "STAGING_", "TMP_", "TEMP_", "RAW_", "LANDING_"
1155
- ]
1156
- }
1382
+ "STG_",
1383
+ "STAGING_",
1384
+ "TMP_",
1385
+ "TEMP_",
1386
+ "RAW_",
1387
+ "LANDING_",
1388
+ ],
1389
+ },
1157
1390
  }
1158
1391
 
1159
1392
 
@@ -1204,18 +1437,26 @@ def _apply_domain_knowledge(
1204
1437
  if entity_pair in common_pairs:
1205
1438
  boost = common_pairs[entity_pair]
1206
1439
  confidence_boost += boost
1207
- enhancement_factors.append(f"Recognized common business pattern: {entity_pair} (+{boost:.2f})")
1440
+ enhancement_factors.append(
1441
+ f"Recognized common business pattern: {entity_pair} (+{boost:.2f})"
1442
+ )
1208
1443
  elif f"{right_entity}-{left_entity}" in common_pairs:
1209
1444
  boost = common_pairs[f"{right_entity}-{left_entity}"]
1210
1445
  confidence_boost += boost
1211
- enhancement_factors.append(f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})")
1446
+ enhancement_factors.append(
1447
+ f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})"
1448
+ )
1212
1449
 
1213
1450
  # Factor 2: Check for standard FK naming patterns
1214
1451
  for left_col, right_col in column_pairs:
1215
- fk_pattern_match = _check_standard_fk_patterns(left_col, right_col, domain_patterns)
1452
+ fk_pattern_match = _check_standard_fk_patterns(
1453
+ left_col, right_col, domain_patterns
1454
+ )
1216
1455
  if fk_pattern_match:
1217
1456
  confidence_boost += 0.15
1218
- enhancement_factors.append(f"Standard FK pattern detected: {fk_pattern_match}")
1457
+ enhancement_factors.append(
1458
+ f"Standard FK pattern detected: {fk_pattern_match}"
1459
+ )
1219
1460
 
1220
1461
  # Factor 3: Table naming convention analysis
1221
1462
  left_convention = _identify_naming_convention(left_table, domain_patterns)
@@ -1223,8 +1464,9 @@ def _apply_domain_knowledge(
1223
1464
 
1224
1465
  if left_convention and right_convention:
1225
1466
  # Boost confidence for expected patterns
1226
- if (left_convention == "fact" and right_convention == "dimension") or \
1227
- (left_convention == "dimension" and right_convention == "fact"):
1467
+ if (left_convention == "fact" and right_convention == "dimension") or (
1468
+ left_convention == "dimension" and right_convention == "fact"
1469
+ ):
1228
1470
  confidence_boost += 0.2
1229
1471
  enhancement_factors.append("Standard fact-dimension naming pattern (+0.20)")
1230
1472
  elif left_convention == "dimension" and right_convention == "dimension":
@@ -1237,12 +1479,20 @@ def _apply_domain_knowledge(
1237
1479
  enhancement_factors.append("Time dimension relationship (very common) (+0.20)")
1238
1480
 
1239
1481
  # Factor 5: Schema pattern recognition (star vs snowflake)
1240
- schema_pattern = _detect_schema_pattern(left_table, right_table, left_meta, right_meta, domain_patterns)
1482
+ schema_pattern = _detect_schema_pattern(
1483
+ left_table, right_table, left_meta, right_meta, domain_patterns
1484
+ )
1241
1485
  if schema_pattern:
1242
- pattern_boost = domain_patterns["relationship_patterns"][schema_pattern]["confidence_boost"]
1486
+ pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
1487
+ "confidence_boost"
1488
+ ]
1243
1489
  confidence_boost += pattern_boost
1244
- pattern_desc = domain_patterns["relationship_patterns"][schema_pattern]["description"]
1245
- enhancement_factors.append(f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})")
1490
+ pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
1491
+ "description"
1492
+ ]
1493
+ enhancement_factors.append(
1494
+ f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})"
1495
+ )
1246
1496
 
1247
1497
  # Apply the boost but cap the final confidence at 1.0
1248
1498
  enhanced_confidence = min(current_confidence + confidence_boost, 1.0)
@@ -1259,7 +1509,9 @@ def _apply_domain_knowledge(
1259
1509
  }
1260
1510
 
1261
1511
 
1262
- def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> Optional[str]:
1512
+ def _identify_business_entity(
1513
+ table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
1514
+ ) -> Optional[str]:
1263
1515
  """Identify what business entity a table represents."""
1264
1516
  table_upper = table_name.upper()
1265
1517
  business_entities = domain_patterns["business_entities"]
@@ -1274,13 +1526,18 @@ def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domai
1274
1526
  pk_candidates = table_meta.get("pk_candidates", {})
1275
1527
  for pk_pattern in entity_info["pk_patterns"]:
1276
1528
  for pk_norm in pk_candidates.keys():
1277
- if pk_pattern.replace("_", "").upper() in pk_norm.replace("_", "").upper():
1529
+ if (
1530
+ pk_pattern.replace("_", "").upper()
1531
+ in pk_norm.replace("_", "").upper()
1532
+ ):
1278
1533
  return entity_type
1279
1534
 
1280
1535
  return None
1281
1536
 
1282
1537
 
1283
- def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
1538
+ def _check_standard_fk_patterns(
1539
+ left_col: str, right_col: str, domain_patterns: Dict[str, Any]
1540
+ ) -> Optional[str]:
1284
1541
  """Check if column pair matches standard FK patterns."""
1285
1542
  common_fks = domain_patterns["common_fk_patterns"]
1286
1543
 
@@ -1295,7 +1552,9 @@ def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns:
1295
1552
  return None
1296
1553
 
1297
1554
 
1298
- def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
1555
+ def _identify_naming_convention(
1556
+ table_name: str, domain_patterns: Dict[str, Any]
1557
+ ) -> Optional[str]:
1299
1558
  """Identify the naming convention used for a table."""
1300
1559
  table_upper = table_name.upper()
1301
1560
  naming_conventions = domain_patterns["naming_conventions"]
@@ -1308,7 +1567,9 @@ def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]
1308
1567
  return None
1309
1568
 
1310
1569
 
1311
- def _is_time_dimension_pattern(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> bool:
1570
+ def _is_time_dimension_pattern(
1571
+ table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
1572
+ ) -> bool:
1312
1573
  """Check if table follows time dimension patterns."""
1313
1574
  table_upper = table_name.upper()
1314
1575
  time_patterns = domain_patterns["business_entities"]["date"]["table_patterns"]
@@ -1344,15 +1605,16 @@ def _detect_schema_pattern(
1344
1605
  right_table: str,
1345
1606
  left_meta: Dict[str, Any],
1346
1607
  right_meta: Dict[str, Any],
1347
- domain_patterns: Dict[str, Any]
1608
+ domain_patterns: Dict[str, Any],
1348
1609
  ) -> Optional[str]:
1349
1610
  """Detect common schema patterns (star, snowflake, etc.)."""
1350
1611
  left_role = _detect_table_role(left_table, left_meta)
1351
1612
  right_role = _detect_table_role(right_table, right_meta)
1352
1613
 
1353
1614
  # Star schema pattern: fact table to dimension
1354
- if (left_role == "fact" and right_role == "dimension") or \
1355
- (left_role == "dimension" and right_role == "fact"):
1615
+ if (left_role == "fact" and right_role == "dimension") or (
1616
+ left_role == "dimension" and right_role == "fact"
1617
+ ):
1356
1618
  return "star_schema"
1357
1619
 
1358
1620
  # Snowflake schema pattern: dimension to dimension
@@ -1360,8 +1622,9 @@ def _detect_schema_pattern(
1360
1622
  return "snowflake_schema"
1361
1623
 
1362
1624
  # Time dimension pattern (very common)
1363
- if _is_time_dimension_pattern(right_table, right_meta, domain_patterns) or \
1364
- _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
1625
+ if _is_time_dimension_pattern(
1626
+ right_table, right_meta, domain_patterns
1627
+ ) or _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
1365
1628
  return "time_dimension"
1366
1629
 
1367
1630
  # Bridge table pattern
@@ -1397,7 +1660,9 @@ def _calculate_adaptive_thresholds(
1397
1660
  # Calculate sample statistics
1398
1661
  sample_sizes = [len(vals) for vals in values_list if vals]
1399
1662
  max_sample_size = max(sample_sizes) if sample_sizes else base_sample_size
1400
- avg_sample_size = sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
1663
+ avg_sample_size = (
1664
+ sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
1665
+ )
1401
1666
 
1402
1667
  # Calculate data distribution characteristics
1403
1668
  total_unique_values = 0
@@ -1425,7 +1690,7 @@ def _calculate_adaptive_thresholds(
1425
1690
  if len(value_counts) > 1:
1426
1691
  max_freq = max(value_counts.values())
1427
1692
  min_freq = min(value_counts.values())
1428
- skew = max_freq / min_freq if min_freq > 0 else float('inf')
1693
+ skew = max_freq / min_freq if min_freq > 0 else float("inf")
1429
1694
  skew_ratios.append(skew)
1430
1695
 
1431
1696
  # Calculate overall uniqueness ratio
@@ -1459,7 +1724,9 @@ def _calculate_adaptive_thresholds(
1459
1724
  min_size_adj *= 1.1
1460
1725
 
1461
1726
  # Scale with base sample size from configuration
1462
- size_scale_factor = min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
1727
+ size_scale_factor = (
1728
+ min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
1729
+ )
1463
1730
  min_size_adj *= size_scale_factor
1464
1731
 
1465
1732
  thresholds["min_sample_size"] = max(int(base_min_size * min_size_adj), 10)
@@ -1594,8 +1861,12 @@ def _infer_cardinality(
1594
1861
  left_non_null = [v for v in left_values if not _is_nullish(v)]
1595
1862
  right_non_null = [v for v in right_values if not _is_nullish(v)]
1596
1863
 
1597
- left_unique_ratio = len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
1598
- right_unique_ratio = len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
1864
+ left_unique_ratio = (
1865
+ len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
1866
+ )
1867
+ right_unique_ratio = (
1868
+ len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
1869
+ )
1599
1870
 
1600
1871
  # Apply adaptive uniqueness threshold
1601
1872
  left_is_unique = left_unique_ratio > uniqueness_threshold
@@ -1691,11 +1962,19 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1691
1962
  Returns:
1692
1963
  str: Table role ('fact', 'dimension', 'bridge', 'staging', 'unknown')
1693
1964
  """
1694
- upper_name = table_name.upper()
1695
1965
  tokens = _identifier_tokens(table_name)
1696
1966
 
1697
1967
  # Rule 1: Explicit prefixes/suffixes
1698
- fact_indicators = {"FACT", "FCT", "TXN", "TRANSACTION", "EVENT", "LOG", "SALES", "ORDER"}
1968
+ fact_indicators = {
1969
+ "FACT",
1970
+ "FCT",
1971
+ "TXN",
1972
+ "TRANSACTION",
1973
+ "EVENT",
1974
+ "LOG",
1975
+ "SALES",
1976
+ "ORDER",
1977
+ }
1699
1978
  dim_indicators = {"DIM", "DIMENSION", "LOOKUP", "REF", "REFERENCE", "MASTER"}
1700
1979
  bridge_indicators = {"BRIDGE", "BRG", "LINK", "JUNCTION", "ASSOC", "ASSOCIATION"}
1701
1980
  staging_indicators = {"STG", "STAGING", "TMP", "TEMP", "WORK", "LANDING", "RAW"}
@@ -1734,9 +2013,22 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1734
2013
  id_count += 1
1735
2014
 
1736
2015
  # Count measure-like columns (amounts, counts, quantities)
1737
- if any(word in col_name for word in ["AMOUNT", "QTY", "QUANTITY", "COUNT", "TOTAL", "SUM", "AVG"]):
2016
+ if any(
2017
+ word in col_name
2018
+ for word in [
2019
+ "AMOUNT",
2020
+ "QTY",
2021
+ "QUANTITY",
2022
+ "COUNT",
2023
+ "TOTAL",
2024
+ "SUM",
2025
+ "AVG",
2026
+ ]
2027
+ ):
1738
2028
  measure_like_count += 1
1739
- elif base_type in MEASURE_DATATYPES and not col_info.get("is_identifier", False):
2029
+ elif base_type in MEASURE_DATATYPES and not col_info.get(
2030
+ "is_identifier", False
2031
+ ):
1740
2032
  measure_like_count += 1
1741
2033
  else:
1742
2034
  dimension_like_count += 1
@@ -1761,7 +2053,9 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
1761
2053
  return "unknown"
1762
2054
 
1763
2055
 
1764
- def _get_business_relationship_context(left_table: str, right_table: str, left_role: str, right_role: str) -> str:
2056
+ def _get_business_relationship_context(
2057
+ left_table: str, right_table: str, left_role: str, right_role: str
2058
+ ) -> str:
1765
2059
  """
1766
2060
  Determine business relationship context between tables based on their roles.
1767
2061
 
@@ -1833,7 +2127,7 @@ def _infer_join_type(
1833
2127
  4. Naming pattern heuristics
1834
2128
  5. Conservative INNER JOIN default
1835
2129
  """
1836
-
2130
+
1837
2131
  # RULE 1: Default to INNER JOIN (most common and safest)
1838
2132
  default_join = semantic_model_pb2.JoinType.inner
1839
2133
 
@@ -1861,9 +2155,17 @@ def _infer_join_type(
1861
2155
  # Apply business rules based on relationship context
1862
2156
  if relationship_context == "fact_to_dimension":
1863
2157
  # Fact → Dimension: usually INNER, but check for optional dimensions
1864
- if any(keyword in right_table.upper() for keyword in [
1865
- "PROMO", "PROMOTION", "DISCOUNT", "COUPON", "OPTIONAL", "SECONDARY"
1866
- ]):
2158
+ if any(
2159
+ keyword in right_table.upper()
2160
+ for keyword in [
2161
+ "PROMO",
2162
+ "PROMOTION",
2163
+ "DISCOUNT",
2164
+ "COUPON",
2165
+ "OPTIONAL",
2166
+ "SECONDARY",
2167
+ ]
2168
+ ):
1867
2169
  logger.debug(
1868
2170
  f"Join type inference for {left_table} -> {right_table}: "
1869
2171
  f"LEFT_OUTER (fact to optional dimension: {right_role})"
@@ -1907,11 +2209,19 @@ def _infer_join_type(
1907
2209
  return semantic_model_pb2.JoinType.left_outer
1908
2210
 
1909
2211
  # RULE 5: Naming pattern heuristics for optional relationships
1910
- left_upper = left_table.upper()
1911
2212
  right_upper = right_table.upper()
1912
2213
  optional_keywords = {
1913
- "OPTIONAL", "ALTERNATE", "SECONDARY", "BACKUP", "FALLBACK",
1914
- "PROMO", "PROMOTION", "DISCOUNT", "COUPON", "TEMP", "TMP"
2214
+ "OPTIONAL",
2215
+ "ALTERNATE",
2216
+ "SECONDARY",
2217
+ "BACKUP",
2218
+ "FALLBACK",
2219
+ "PROMO",
2220
+ "PROMOTION",
2221
+ "DISCOUNT",
2222
+ "COUPON",
2223
+ "TEMP",
2224
+ "TMP",
1915
2225
  }
1916
2226
 
1917
2227
  for keyword in optional_keywords:
@@ -1946,12 +2256,17 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
1946
2256
  """
1947
2257
  fk_upper = fk_column.strip().upper()
1948
2258
  pk_table_variants = _table_variants(pk_table)
1949
-
2259
+
1950
2260
  # Pattern 1: {table_name}_id or {table_name}_key
1951
2261
  for variant in pk_table_variants:
1952
- if fk_upper in {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}:
2262
+ if fk_upper in {
2263
+ f"{variant}_ID",
2264
+ f"{variant}ID",
2265
+ f"{variant}_KEY",
2266
+ f"{variant}KEY",
2267
+ }:
1953
2268
  return True
1954
-
2269
+
1955
2270
  # Pattern 2: Column ends with table name variants
1956
2271
  tokens = _identifier_tokens(fk_column)
1957
2272
  if len(tokens) >= 2:
@@ -1961,21 +2276,23 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
1961
2276
  tail = tokens[-1]
1962
2277
  if tail in {"ID", "KEY"}:
1963
2278
  return True
1964
-
2279
+
1965
2280
  # Pattern 3: Similar to primary key column but with FK table prefix
1966
2281
  # e.g., order_id in order_items table referencing orders.id
1967
2282
  fk_table_variants = _table_variants(fk_table)
1968
2283
  for fk_variant in fk_table_variants:
1969
2284
  if fk_upper.startswith(fk_variant):
1970
- remainder = fk_upper[len(fk_variant):].lstrip("_")
2285
+ remainder = fk_upper[len(fk_variant) :].lstrip("_")
1971
2286
  for pk_variant in pk_table_variants:
1972
2287
  if remainder.startswith(pk_variant):
1973
2288
  return True
1974
-
2289
+
1975
2290
  return False
1976
2291
 
1977
2292
 
1978
- def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.NamedFilter]:
2293
+ def _suggest_filters(
2294
+ raw_table: data_types.Table,
2295
+ ) -> List[semantic_model_pb2.NamedFilter]:
1979
2296
  suggestions: List[semantic_model_pb2.NamedFilter] = []
1980
2297
  for col in raw_table.columns:
1981
2298
  base_type = _base_type_from_type(col.column_type)
@@ -2011,12 +2328,20 @@ def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.Nam
2011
2328
  )
2012
2329
  is_textual = base_type in {"STRING", "TEXT", "VARCHAR", "CHAR", "CHARACTER"}
2013
2330
  is_boolean = base_type in {"BOOLEAN"}
2014
- is_categorical_numeric = base_type in {"INT", "INTEGER", "NUMBER", "SMALLINT", "BIGINT"} and any(
2015
- upper_name.endswith(suffix) for suffix in categorical_suffixes
2016
- )
2017
-
2018
- if not is_identifier_like and (is_textual or is_boolean or is_categorical_numeric):
2019
- formatted = [_format_literal(val, base_type) for val in distinct_values[:5]]
2331
+ is_categorical_numeric = base_type in {
2332
+ "INT",
2333
+ "INTEGER",
2334
+ "NUMBER",
2335
+ "SMALLINT",
2336
+ "BIGINT",
2337
+ } and any(upper_name.endswith(suffix) for suffix in categorical_suffixes)
2338
+
2339
+ if not is_identifier_like and (
2340
+ is_textual or is_boolean or is_categorical_numeric
2341
+ ):
2342
+ formatted = [
2343
+ _format_literal(val, base_type) for val in distinct_values[:5]
2344
+ ]
2020
2345
  expr = f"{col.column_name} IN ({', '.join(formatted)})"
2021
2346
  suggestions.append(
2022
2347
  semantic_model_pb2.NamedFilter(
@@ -2034,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
+
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
 
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
+
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
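A minimal sketch of how a caller is expected to use the new guard-rail arguments introduced in this hunk. The calling code and variable names here are hypothetical; only the parameter names and status keys come from the signature above, and the exact effect of min_confidence on filtering is an assumption:

    status: Dict[str, bool] = {}
    relationships = _infer_relationships(
        raw_tables,
        session=session,
        max_relationships=50,    # stop collecting candidate pairs after 50
        min_confidence=0.3,      # clamped to [0, 1]; presumably drops weak candidates
        timeout_seconds=30.0,    # abandon inference after 30 seconds
        status=status,
    )
    if status["limited_by_timeout"] or status["limited_by_max_relationships"]:
        logger.warning("Relationship discovery was truncated; results may be partial.")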
@@ -2060,7 +2405,9 @@ def _infer_relationships(
2060
2405
  table_prefixes = global_prefixes | _table_prefixes(raw_table.name)
2061
2406
  for column in raw_table.columns:
2062
2407
  base_type = _base_type_from_type(column.column_type)
2063
- normalized = _sanitize_identifier_name(column.column_name, prefixes_to_drop=table_prefixes)
2408
+ normalized = _sanitize_identifier_name(
2409
+ column.column_name, prefixes_to_drop=table_prefixes
2410
+ )
2064
2411
  entry = columns_meta.setdefault(
2065
2412
  normalized,
2066
2413
  {
@@ -2075,7 +2422,9 @@ def _infer_relationships(
2075
2422
  entry["names"].append(column.column_name)
2076
2423
  if column.values:
2077
2424
  entry["values"].extend(column.values)
2078
- entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(column.column_name, base_type)
2425
+ entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
2426
+ column.column_name, base_type
2427
+ )
2079
2428
  is_primary = getattr(column, "is_primary_key", False)
2080
2429
  if is_primary:
2081
2430
  entry["is_primary"] = True
@@ -2093,15 +2442,42 @@ def _infer_relationships(
     pairs: dict[tuple[str, str], List[tuple[str, str]]] = {}
     null_check_cache: Dict[Tuple[str, str, str, str], bool] = {}
 
-    def _record_pair(left_table: str, right_table: str, left_col: str, right_col: str) -> None:
+    def _record_pair(
+        left_table: str, right_table: str, left_col: str, right_col: str
+    ) -> None:
+        nonlocal limit_reached
+        if limit_reached:
+            return
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            limit_reached = True
+            return
+
         key = (left_table, right_table)
         value = (left_col, right_col)
-        if value not in pairs.setdefault(key, []):
-            pairs[key].append(value)
+        bucket = pairs.setdefault(key, [])
+        if value not in bucket:
+            bucket.append(value)
+            if (
+                max_relationships is not None
+                and len(pairs) >= max_relationships
+            ):
+                status_dict["limited_by_max_relationships"] = True
+                limit_reached = True
 
     table_names = list(metadata.keys())
     for i in range(len(table_names)):
+        if limit_reached or status_dict["limited_by_timeout"]:
+            break
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
         for j in range(i + 1, len(table_names)):
+            if limit_reached or status_dict["limited_by_timeout"]:
+                break
+            if _timed_out():
+                status_dict["limited_by_timeout"] = True
+                break
             table_a_name = table_names[i]
             table_b_name = table_names[j]
             table_a = metadata[table_a_name]
@@ -2158,7 +2534,7 @@ def _infer_relationships(
2158
2534
  continue
2159
2535
  if norm_b == pk_norm:
2160
2536
  continue
2161
-
2537
+
2162
2538
  # Direct suffix match
2163
2539
  if norm_b.endswith(pk_norm):
2164
2540
  _record_pair(
@@ -2168,23 +2544,34 @@ def _infer_relationships(
2168
2544
  pk_cols[0],
2169
2545
  )
2170
2546
  continue
2171
-
2547
+
2172
2548
  # Enhanced: Check if column looks like a foreign key to this table
2173
- if _looks_like_foreign_key(table_b_name, table_a_name, meta_b["names"][0]):
2549
+ if _looks_like_foreign_key(
2550
+ table_b_name, table_a_name, meta_b["names"][0]
2551
+ ):
2174
2552
  # Additional check: name similarity with adaptive threshold
2175
2553
  similarity = _name_similarity(norm_b, pk_norm)
2176
2554
  # Calculate adaptive threshold for this relationship
2177
2555
  all_sample_values = []
2178
- for col_values in [pk_meta.get("values", []), meta_b.get("values", [])]:
2556
+ for col_values in [
2557
+ pk_meta.get("values", []),
2558
+ meta_b.get("values", []),
2559
+ ]:
2179
2560
  if col_values:
2180
2561
  all_sample_values.append(col_values)
2181
2562
 
2182
2563
  adaptive_thresholds = _calculate_adaptive_thresholds(
2183
2564
  all_sample_values,
2184
2565
  table_count=len(raw_tables),
2185
- base_sample_size=len(pk_meta.get("values", [])) if pk_meta.get("values") else 10,
2566
+ base_sample_size=(
2567
+ len(pk_meta.get("values", []))
2568
+ if pk_meta.get("values")
2569
+ else 10
2570
+ ),
2571
+ )
2572
+ similarity_threshold = adaptive_thresholds.get(
2573
+ "similarity_threshold", 0.6
2186
2574
  )
2187
- similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
2188
2575
 
2189
2576
  if similarity >= similarity_threshold:
2190
2577
  _record_pair(
@@ -2204,7 +2591,7 @@ def _infer_relationships(
  continue
  if norm_a == pk_norm:
  continue
-
+
  # Direct suffix match
  if norm_a.endswith(pk_norm):
  _record_pair(
@@ -2214,23 +2601,34 @@ def _infer_relationships(
  pk_cols[0],
  )
  continue
-
+
  # Enhanced: Check if column looks like a foreign key to this table
- if _looks_like_foreign_key(table_a_name, table_b_name, meta_a["names"][0]):
+ if _looks_like_foreign_key(
+ table_a_name, table_b_name, meta_a["names"][0]
+ ):
  # Additional check: name similarity with adaptive threshold
  similarity = _name_similarity(norm_a, pk_norm)
  # Calculate adaptive threshold for this relationship
  all_sample_values = []
- for col_values in [pk_meta.get("values", []), meta_a.get("values", [])]:
+ for col_values in [
+ pk_meta.get("values", []),
+ meta_a.get("values", []),
+ ]:
  if col_values:
  all_sample_values.append(col_values)

  adaptive_thresholds = _calculate_adaptive_thresholds(
  all_sample_values,
  table_count=len(raw_tables),
- base_sample_size=len(pk_meta.get("values", [])) if pk_meta.get("values") else 10,
+ base_sample_size=(
+ len(pk_meta.get("values", []))
+ if pk_meta.get("values")
+ else 10
+ ),
+ )
+ similarity_threshold = adaptive_thresholds.get(
+ "similarity_threshold", 0.6
  )
- similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)

  if similarity >= similarity_threshold:
  _record_pair(
@@ -2255,10 +2653,19 @@ def _infer_relationships(

  # Build relationships with inferred cardinality
  for (left_table, right_table), column_pairs in pairs.items():
+ if _timed_out():
+ status_dict["limited_by_timeout"] = True
+ break
+ if (
+ max_relationships is not None
+ and len(relationships) >= max_relationships
+ ):
+ status_dict["limited_by_max_relationships"] = True
+ break
  # Infer cardinality based on available metadata
  left_meta = metadata[left_table]
  right_meta = metadata[right_table]
-
+
  # Determine if tables have primary keys in the relationship
  left_has_pk = any(
  col_name in [pair[0] for pair in column_pairs]
@@ -2270,7 +2677,7 @@ def _infer_relationships(
  for pk_list in right_meta["pk_candidates"].values()
  for col_name in pk_list
  )
-
+
  # Enhanced: Get sample values for all columns in the relationship (for composite key analysis)
  left_values_all = []
  right_values_all = []
@@ -2279,12 +2686,11 @@ def _infer_relationships(

  for left_col, right_col in column_pairs:
  left_col_key = _sanitize_identifier_name(
- left_col,
- prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
+ left_col, prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
  )
  right_col_key = _sanitize_identifier_name(
  right_col,
- prefixes_to_drop=global_prefixes | _table_prefixes(right_table)
+ prefixes_to_drop=global_prefixes | _table_prefixes(right_table),
  )

  left_col_values = []
@@ -2293,7 +2699,9 @@ def _infer_relationships(
  if left_col_key in left_meta["columns"]:
  left_col_values = left_meta["columns"][left_col_key].get("values") or []
  if right_col_key in right_meta["columns"]:
- right_col_values = right_meta["columns"][right_col_key].get("values") or []
+ right_col_values = (
+ right_meta["columns"][right_col_key].get("values") or []
+ )

  left_values_all.append(left_col_values)
  right_values_all.append(right_col_values)
@@ -2322,7 +2730,7 @@ def _infer_relationships(
  right_has_pk,
  adaptive_thresholds=global_adaptive_thresholds,
  )
-
+
  # Determine if SQL null probe should be executed for stricter inference
  strict_fk_detected = False
  if strict_join_inference and session:
@@ -2352,7 +2760,7 @@ def _infer_relationships(
  left_table_meta=left_meta,
  right_table_meta=right_meta,
  )
-
+
  # Calculate confidence and reasoning for this relationship
  confidence_analysis = _calculate_relationship_confidence(
  left_table=left_table,
@@ -2376,45 +2784,54 @@ def _infer_relationships(
  column_pairs=column_pairs,
  left_meta=left_meta,
  right_meta=right_meta,
- current_confidence=confidence_analysis['confidence_score']
+ current_confidence=confidence_analysis["confidence_score"],
  )

  # Update confidence analysis with domain knowledge
- if domain_enhancement['confidence_boost'] > 0:
- confidence_analysis['confidence_score'] = min(1.0,
- confidence_analysis['confidence_score'] + domain_enhancement['confidence_boost'])
+ if domain_enhancement["confidence_boost"] > 0:
+ confidence_analysis["confidence_score"] = min(
+ 1.0,
+ confidence_analysis["confidence_score"]
+ + domain_enhancement["confidence_boost"],
+ )

  # Add domain knowledge factors to reasoning
- for domain_factor in domain_enhancement['domain_factors']:
- confidence_analysis['reasoning_factors'].append(f"Domain knowledge: {domain_factor}")
+ for domain_factor in domain_enhancement["domain_factors"]:
+ confidence_analysis["reasoning_factors"].append(
+ f"Domain knowledge: {domain_factor}"
+ )

  # Update confidence level based on new score
- if confidence_analysis['confidence_score'] >= 0.8:
- confidence_analysis['confidence_level'] = 'very_high'
- confidence_analysis['confidence_description'] = 'Very High Confidence'
- elif confidence_analysis['confidence_score'] >= 0.6:
- confidence_analysis['confidence_level'] = 'high'
- confidence_analysis['confidence_description'] = 'High Confidence'
- elif confidence_analysis['confidence_score'] >= 0.4:
- confidence_analysis['confidence_level'] = 'medium'
- confidence_analysis['confidence_description'] = 'Medium Confidence'
- elif confidence_analysis['confidence_score'] >= 0.2:
- confidence_analysis['confidence_level'] = 'low'
- confidence_analysis['confidence_description'] = 'Low Confidence'
+ if confidence_analysis["confidence_score"] >= 0.8:
+ confidence_analysis["confidence_level"] = "very_high"
+ confidence_analysis["confidence_description"] = "Very High Confidence"
+ elif confidence_analysis["confidence_score"] >= 0.6:
+ confidence_analysis["confidence_level"] = "high"
+ confidence_analysis["confidence_description"] = "High Confidence"
+ elif confidence_analysis["confidence_score"] >= 0.4:
+ confidence_analysis["confidence_level"] = "medium"
+ confidence_analysis["confidence_description"] = "Medium Confidence"
+ elif confidence_analysis["confidence_score"] >= 0.2:
+ confidence_analysis["confidence_level"] = "low"
+ confidence_analysis["confidence_description"] = "Low Confidence"
  else:
- confidence_analysis['confidence_level'] = 'very_low'
- confidence_analysis['confidence_description'] = 'Very Low Confidence'
+ confidence_analysis["confidence_level"] = "very_low"
+ confidence_analysis["confidence_description"] = "Very Low Confidence"

  # Enhanced logging with confidence and reasoning
  sample_info = f"samples: L={len(left_values)}, R={len(right_values)}"
  pk_info = f"PKs: L={left_has_pk}, R={right_has_pk}"
- join_type_name = "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+ join_type_name = (
+ "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+ )
  confidence_info = f"confidence: {confidence_analysis['confidence_score']:.2f} ({confidence_analysis['confidence_level']})"

  # Add domain knowledge info if applied
  domain_info = ""
- if domain_enhancement['confidence_boost'] > 0:
- domain_info = f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
+ if domain_enhancement["confidence_boost"] > 0:
+ domain_info = (
+ f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
+ )

  logger.info(
  f"Relationship inference for {left_table} -> {right_table}: "
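The reformatted branches above apply a clamped domain-knowledge boost and then bucket the score into a fixed level table. The same logic, restated as standalone helpers for readability (a sketch, not the package's actual functions):

from typing import Tuple

def boosted_score(score: float, boost: float) -> float:
    # The domain-knowledge boost is clamped at 1.0, as in the hunk above.
    return min(1.0, score + boost) if boost > 0 else score

def confidence_level(score: float) -> Tuple[str, str]:
    # Same breakpoints as the reformatted branches: 0.8 / 0.6 / 0.4 / 0.2.
    if score >= 0.8:
        return "very_high", "Very High Confidence"
    if score >= 0.6:
        return "high", "High Confidence"
    if score >= 0.4:
        return "medium", "Medium Confidence"
    if score >= 0.2:
        return "low", "Low Confidence"
    return "very_low", "Very Low Confidence"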
@@ -2423,22 +2840,40 @@ def _infer_relationships(
  )

  # Log domain knowledge patterns if detected
- domain_factors = [f for f in confidence_analysis['reasoning_factors'] if f.startswith("Domain knowledge:")]
+ domain_factors = [
+ f
+ for f in confidence_analysis["reasoning_factors"]
+ if f.startswith("Domain knowledge:")
+ ]
  if domain_factors:
- logger.debug(f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}")
+ logger.debug(
+ f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}"
+ )

  # Log detailed reasoning for medium or lower confidence relationships
- if confidence_analysis['confidence_score'] < 0.6:
+ if confidence_analysis["confidence_score"] < 0.6:
  logger.debug(f"Confidence reasoning for {left_table} -> {right_table}:")
- for factor in confidence_analysis['reasoning_factors']:
+ for factor in confidence_analysis["reasoning_factors"]:
  logger.debug(f" - {factor}")

  # Log very high confidence relationships with their evidence
- elif confidence_analysis['confidence_score'] >= 0.8:
- logger.debug(f"High confidence relationship {left_table} -> {right_table} based on:")
- for factor in confidence_analysis['reasoning_factors'][:3]: # Top 3 factors
+ elif confidence_analysis["confidence_score"] >= 0.8:
+ logger.debug(
+ f"High confidence relationship {left_table} -> {right_table} based on:"
+ )
+ for factor in confidence_analysis["reasoning_factors"][:3]: # Top 3 factors
  logger.debug(f" + {factor}")
-
+
+ if confidence_analysis["confidence_score"] < min_confidence:
+ logger.debug(
+ "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+ left_table,
+ right_table,
+ confidence_analysis["confidence_score"],
+ min_confidence,
+ )
+ continue
+
  # Determine relationship type based on cardinality
  if left_card == "1" and right_card == "1":
  rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2449,7 +2884,7 @@ def _infer_relationships(
  else:
  # Default to many_to_one for backward compatibility
  rel_type = semantic_model_pb2.RelationshipType.many_to_one
-
+
  relationship = semantic_model_pb2.Relationship(
  name=f"{left_table}_to_{right_table}",
  left_table=left_table,
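Across this hunk and the previous one, a relationship is first filtered by `min_confidence` and then mapped from its cardinality pair onto a RelationshipType. A hedged sketch of both steps, reproducing only the branches visible in the diff (the elided middle branches are assumed to cover the mixed-cardinality cases):

def relationship_type(left_card: str, right_card: str) -> str:
    # Visible branches only: "1"/"1" maps to one_to_one, default is many_to_one.
    if left_card == "1" and right_card == "1":
        return "one_to_one"
    return "many_to_one"

def keep_relationship(confidence_score: float, min_confidence: float) -> bool:
    # Relationships under the threshold are logged and skipped, per the previous hunk.
    return confidence_score >= min_confidence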
@@ -2466,15 +2901,30 @@ def _infer_relationships(
  relationships.append(relationship)

  # Phase 2: Detect many-to-many relationships through bridge table analysis
- many_to_many_relationships = _detect_many_to_many_relationships(
- raw_tables, metadata, relationships
- )
+ many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+ if not status_dict["limited_by_timeout"] and (
+ max_relationships is None or len(relationships) < max_relationships
+ ):
+ many_to_many_relationships = _detect_many_to_many_relationships(
+ raw_tables, metadata, relationships
+ )

- if many_to_many_relationships:
- relationships.extend(many_to_many_relationships)
- logger.info(f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables")
+ if many_to_many_relationships and max_relationships is not None:
+ remaining = max_relationships - len(relationships)
+ if remaining <= 0:
+ many_to_many_relationships = []
+ else:
+ many_to_many_relationships = many_to_many_relationships[:remaining]

- logger.info(f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables")
+ if many_to_many_relationships:
+ relationships.extend(many_to_many_relationships)
+ logger.info(
+ f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+ )
+
+ logger.info(
+ f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
+ )
  return relationships
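The bridge-table phase above runs only while budget remains, and its output is trimmed so the combined total never exceeds `max_relationships`. The trimming step as a standalone sketch (the helper name `cap_to_budget` is illustrative):

from typing import List, Optional, TypeVar

T = TypeVar("T")

def cap_to_budget(
    existing_count: int,
    extra: List[T],
    max_relationships: Optional[int],
) -> List[T]:
    # No cap configured: keep everything; otherwise keep only the remaining budget.
    if max_relationships is None:
        return extra
    remaining = max_relationships - existing_count
    return extra[:remaining] if remaining > 0 else []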
@@ -2512,7 +2962,14 @@ def _raw_table_to_semantic_context_table(
  base_type = _base_type_from_type(col.column_type)
  if _is_time_like_column(col):
  time_data_type = col.column_type
- if time_data_type.split("(")[0].upper() in {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}:
+ if time_data_type.split("(")[0].upper() in {
+ "STRING",
+ "VARCHAR",
+ "TEXT",
+ "CHAR",
+ "CHARACTER",
+ "NVARCHAR",
+ }:
  time_data_type = "TIMESTAMP_NTZ"
  time_dimension_name = _safe_semantic_identifier(
  col.column_name,
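The reformatted membership test above coerces string-typed time-like columns to TIMESTAMP_NTZ while leaving native temporal types untouched. A minimal restatement of that rule as a standalone helper (a sketch, not the module's own function):

_STRINGY_TYPES = {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}

def time_dimension_data_type(column_type: str) -> str:
    # "VARCHAR(32)" -> "TIMESTAMP_NTZ"; non-string types are returned unchanged.
    base = column_type.split("(")[0].upper()
    return "TIMESTAMP_NTZ" if base in _STRINGY_TYPES else column_type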
@@ -2564,7 +3021,9 @@ def _raw_table_to_semantic_context_table(
  data_type=col.column_type,
  sample_values=col.values,
  synonyms=[_PLACEHOLDER_COMMENT],
- description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
+ description=(
+ col.comment if col.comment else _PLACEHOLDER_COMMENT
+ ),
  )
  )
  continue
@@ -2685,7 +3144,9 @@ def raw_schema_to_semantic_context(
  unique_database_schema.append(fqn_databse_schema)

  logger.info(f"Pulling column information from {fqn_table}")
- _notify(f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}...")
+ _notify(
+ f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}..."
+ )
  valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
  session=conn,
  workspace=fqn_table.database,
@@ -2751,7 +3212,9 @@ def raw_schema_to_semantic_context(
  semantic_model_name,
  actual_model,
  )
- _notify("Running DashScope enrichment to enhance descriptions and metrics...")
+ _notify(
+ "Running DashScope enrichment to enhance descriptions and metrics..."
+ )

  # Create progress tracker for enrichment
  def enrichment_progress_callback(update):
@@ -2760,14 +3223,16 @@ def raw_schema_to_semantic_context(
  EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
  EnrichmentStage.MODEL_METRICS: "Generating model-level metrics",
  EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
- EnrichmentStage.COMPLETE: "Enrichment complete"
+ EnrichmentStage.COMPLETE: "Enrichment complete",
  }

  base_message = stage_messages.get(update.stage, "Processing")
  if update.table_name:
  message = f"{base_message} - {update.table_name} ({update.current_step}/{update.total_steps})"
  elif update.total_steps > 1:
- message = f"{base_message} ({update.current_step}/{update.total_steps})"
+ message = (
+ f"{base_message} ({update.current_step}/{update.total_steps})"
+ )
  else:
  message = base_message
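The enrichment progress callback above assembles its message from a per-stage base string plus optional table and step counters. Extracted into a standalone helper (the name `format_progress` is hypothetical), the same branching is:

from typing import Optional

def format_progress(
    base_message: str,
    table_name: Optional[str],
    current_step: int,
    total_steps: int,
) -> str:
    # Per-table updates show the table name and step counter; multi-step stages
    # show just the counter; single-step stages show only the base message.
    if table_name:
        return f"{base_message} - {table_name} ({current_step}/{total_steps})"
    if total_steps > 1:
        return f"{base_message} ({current_step}/{total_steps})"
    return base_message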
@@ -2801,7 +3266,9 @@ def raw_schema_to_semantic_context(
  )
  _notify("DashScope enrichment complete.")
  else:
- logger.warning("LLM enrichment was requested but DashScope is not configured; skipping enrichment.")
+ logger.warning(
+ "LLM enrichment was requested but DashScope is not configured; skipping enrichment."
+ )
  _notify("DashScope configuration missing; skipped enrichment.")
  return context
@@ -2938,6 +3405,7 @@ def generate_model_str_from_clickzetta(
  Returns:
  str: The raw string of the semantic context.
  """
+
  def _notify(message: str) -> None:
  if progress_callback:
  try:
@@ -2946,7 +3414,11 @@ def generate_model_str_from_clickzetta(
  logger.debug("Progress callback failed for message: {}", message)

  table_list = ", ".join(base_tables)
- logger.info("Generating semantic model '{}' from tables: {}", semantic_model_name, table_list)
+ logger.info(
+ "Generating semantic model '{}' from tables: {}",
+ semantic_model_name,
+ table_list,
+ )
  _notify("Collecting metadata from ClickZetta tables...")

  context = raw_schema_to_semantic_context(
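Both generator entry points wrap the optional `progress_callback` in a `_notify` helper that swallows callback failures so progress reporting can never abort generation. A hedged sketch of that pattern (the exact exception handling and callback invocation are assumed, since only part of the body appears in the diff):

from typing import Callable, Optional
from loguru import logger

def make_notify(progress_callback: Optional[Callable[[str], None]]) -> Callable[[str], None]:
    def _notify(message: str) -> None:
        if progress_callback:
            try:
                progress_callback(message)
            except Exception:
                # A failing callback should never break model generation; log and continue.
                logger.debug("Progress callback failed for message: {}", message)
    return _notify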