clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
- {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +22 -19
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
- semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
- semantic_model_generator/data_processing/cte_utils.py +1 -1
- semantic_model_generator/generate_model.py +588 -224
- semantic_model_generator/llm/dashscope_client.py +4 -2
- semantic_model_generator/llm/enrichment.py +144 -57
- semantic_model_generator/llm/progress_tracker.py +16 -15
- semantic_model_generator/relationships/__init__.py +15 -0
- semantic_model_generator/relationships/discovery.py +202 -0
- semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
- semantic_model_generator/tests/cte_utils_test.py +1 -1
- semantic_model_generator/tests/generate_model_classification_test.py +12 -2
- semantic_model_generator/tests/llm_enrichment_test.py +152 -46
- semantic_model_generator/tests/relationship_discovery_test.py +114 -0
- semantic_model_generator/tests/relationships_filters_test.py +166 -30
- semantic_model_generator/tests/utils_test.py +1 -1
- semantic_model_generator/validate/keywords.py +453 -53
- semantic_model_generator/validate/schema.py +4 -2
- {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
semantic_model_generator/generate_model.py

@@ -1,6 +1,6 @@
+import math
 import os
 import re
-import math
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -8,8 +8,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from clickzetta.zettapark.session import Session
 from loguru import logger
 
-from semantic_model_generator.data_processing import data_types, proto_utils
-from semantic_model_generator.protos import semantic_model_pb2
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     AUTOGEN_TOKEN,
     DIMENSION_DATATYPES,
@@ -20,14 +18,19 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_valid_schemas_tables_columns_df,
 )
 from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
-from semantic_model_generator.
+from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
     DashscopeSettings,
     enrich_semantic_model,
     get_dashscope_settings,
 )
-from semantic_model_generator.llm.progress_tracker import
+from semantic_model_generator.llm.progress_tracker import (
+    EnrichmentProgressTracker,
+    EnrichmentStage,
+)
+from semantic_model_generator.protos import semantic_model_pb2
+from semantic_model_generator.validate.context_length import validate_context_length
 from semantic_model_generator.validate.keywords import CZ_RESERVED_WORDS
 
 _PLACEHOLDER_COMMENT = " "
@@ -39,6 +42,7 @@ _AUTOGEN_COMMENT_TOKEN = (
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
 
+
 def _singularize(token: str) -> str:
     if token.endswith("IES") and len(token) > 3:
         return token[:-3] + "Y"
@@ -68,7 +72,9 @@ def _base_type_from_type(column_type: str) -> str:
     return token.split("(")[0]
 
 
-def _identifier_tokens(
+def _identifier_tokens(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> List[str]:
     name = name.replace("-", "_")
     raw_tokens = re.split(r"[^0-9A-Za-z]+", name)
     tokens: List[str] = []
@@ -84,7 +90,9 @@ def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -
     return tokens
 
 
-def _sanitize_identifier_name(
+def _sanitize_identifier_name(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> str:
     if not name:
         return ""
 
@@ -271,7 +279,9 @@ def _looks_like_primary_key(table_name: str, column_name: str) -> bool:
         "PRIMARY_KEY",
     }
     for variant in variants:
-        direct_matches.update(
+        direct_matches.update(
+            {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
+        )
     if upper_name in direct_matches:
         return True
 
@@ -368,7 +378,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
         return _levenshtein_distance(s2, s1)
     if len(s2) == 0:
         return len(s1)
-
+
     previous_row = range(len(s2) + 1)
     for i, c1 in enumerate(s1):
         current_row = [i + 1]
@@ -378,7 +388,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
             substitutions = previous_row[j] + (c1 != c2)
             current_row.append(min(insertions, deletions, substitutions))
         previous_row = current_row
-
+
     return previous_row[-1]
 
 
@@ -389,26 +399,26 @@ def _name_similarity(name1: str, name2: str) -> float:
     """
     if not name1 or not name2:
         return 0.0
-
+
     # Exact match
     if name1.upper() == name2.upper():
         return 1.0
-
+
     # Normalize names for comparison
     norm1 = name1.upper().replace("_", "").replace("-", "")
     norm2 = name2.upper().replace("_", "").replace("-", "")
-
+
     if norm1 == norm2:
         return 0.95
-
+
     # Calculate Levenshtein-based similarity
     max_len = max(len(norm1), len(norm2))
     if max_len == 0:
         return 0.0
-
+
     distance = _levenshtein_distance(norm1, norm2)
     similarity = 1.0 - (distance / max_len)
-
+
     return max(0.0, similarity)
 
 
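For reference, a minimal self-contained sketch of the name-similarity heuristic shown in the hunk above (same normalization, 0.95 short-circuit for punctuation-only differences, and Levenshtein-based ratio; the flat function name and the sample column names are illustrative, not part of the package API):

def name_similarity(name1: str, name2: str) -> float:
    # Normalize: uppercase, drop underscores/hyphens, then compare edit distance.
    if not name1 or not name2:
        return 0.0
    if name1.upper() == name2.upper():
        return 1.0
    n1 = name1.upper().replace("_", "").replace("-", "")
    n2 = name2.upper().replace("_", "").replace("-", "")
    if n1 == n2:
        return 0.95
    # Classic dynamic-programming Levenshtein distance.
    prev = list(range(len(n2) + 1))
    for i, c1 in enumerate(n1):
        cur = [i + 1]
        for j, c2 in enumerate(n2):
            cur.append(min(prev[j + 1] + 1, cur[j] + 1, prev[j] + (c1 != c2)))
        prev = cur
    return max(0.0, 1.0 - prev[-1] / max(len(n1), len(n2)))

print(name_similarity("CUSTOMER_ID", "CUST_ID"))  # 0.6 with this heuristic
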
@@ -427,17 +437,24 @@ def _analyze_composite_key_patterns(
         Dict with composite key analysis results
     """
     pk_candidates = table_meta.get("pk_candidates", {})
-    columns_meta = table_meta.get("columns", {})
 
     # Check if all relationship columns form a composite key
-    relationship_cols = [
+    relationship_cols = [
+        pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs
+    ]
 
     # Normalize column names for comparison
     global_prefixes = set()  # This should come from context but we'll handle it locally
-    table_prefixes = _table_prefixes(
+    table_prefixes = _table_prefixes(
+        list(table_meta.get("columns", {}).keys())[0]
+        if table_meta.get("columns")
+        else ""
+    )
 
     normalized_rel_cols = [
-        _sanitize_identifier_name(
+        _sanitize_identifier_name(
+            col, prefixes_to_drop=global_prefixes | table_prefixes
+        )
         for col in relationship_cols
     ]
 
@@ -448,7 +465,9 @@ def _analyze_composite_key_patterns(
     analysis = {
         "is_composite_pk": pk_col_count > 1 and pk_col_count == total_pk_candidates,
         "partial_pk": pk_col_count > 0 and pk_col_count < total_pk_candidates,
-        "pk_coverage_ratio":
+        "pk_coverage_ratio": (
+            pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0
+        ),
         "relationship_column_count": len(relationship_cols),
         "pk_column_count": pk_col_count,
     }
@@ -457,7 +476,10 @@ def _analyze_composite_key_patterns(
     if len(relationship_cols) > 1:
         sequential_patterns = []
         for col in relationship_cols:
-            if any(
+            if any(
+                pattern in col.upper()
+                for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]
+            ):
                 sequential_patterns.append(col)
 
         analysis["sequential_id_pattern"] = len(sequential_patterns) >= 2
@@ -504,9 +526,12 @@ def _infer_composite_cardinality(
     # Rule 3: Composite key uniqueness analysis (if we have sufficient samples)
     MIN_SAMPLE_SIZE = 20  # Lower threshold for composite keys
 
-    if (
-
-
+    if (
+        left_values_all
+        and right_values_all
+        and len(left_values_all) >= MIN_SAMPLE_SIZE
+        and len(right_values_all) >= MIN_SAMPLE_SIZE
+    ):
 
         # Create composite keys by concatenating values
         left_composite_keys = []
@@ -515,10 +540,12 @@ def _infer_composite_cardinality(
         sample_size = min(len(left_values_all), len(right_values_all))
 
         for i in range(sample_size):
-            left_key = "|".join(
-
-
-
+            left_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in left_values_all
+            )
+            right_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in right_values_all
+            )
 
             if left_key and not _is_nullish(left_key):
                 left_composite_keys.append(left_key)
@@ -527,7 +554,9 @@ def _infer_composite_cardinality(
 
         if left_composite_keys and right_composite_keys:
             left_unique_ratio = len(set(left_composite_keys)) / len(left_composite_keys)
-            right_unique_ratio = len(set(right_composite_keys)) / len(
+            right_unique_ratio = len(set(right_composite_keys)) / len(
+                right_composite_keys
+            )
 
             # Lower threshold for composite key uniqueness
             if right_unique_ratio > 0.9:
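As an illustration of the composite-key uniqueness check in the hunks above, a small hedged sketch (the column names and sample values are invented for the example; the real code also filters nullish keys and uses adaptive thresholds):

# Build composite keys by joining the i-th value of each key column with "|",
# then treat a >0.9 unique ratio as evidence that the composite key is unique.
left_values_all = [["1001", "1001", "1002"], ["A", "B", "A"]]  # two key columns

composite_keys = [
    "|".join(str(vals[i]) if i < len(vals) else "" for vals in left_values_all)
    for i in range(min(len(v) for v in left_values_all))
]
unique_ratio = len(set(composite_keys)) / len(composite_keys)
print(composite_keys, unique_ratio)  # ['1001|A', '1001|B', '1002|A'] 1.0
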
@@ -561,6 +590,7 @@ def _infer_composite_cardinality(
         adaptive_thresholds=adaptive_thresholds,
     )
 
+
 def _detect_bridge_table_pattern(
     table_meta: Dict[str, Any],
     all_tables_meta: Dict[str, Dict[str, Any]],
@@ -606,7 +636,9 @@ def _detect_bridge_table_pattern(
         base_type = col_info.get("base_type", "")
 
         # Check if column looks like an ID/foreign key
-        if any(
+        if any(
+            pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]
+        ):
             id_columns.append(original_name)
 
             # Check if this could be a foreign key to another table
@@ -615,11 +647,13 @@ def _detect_bridge_table_pattern(
                     continue
 
                 if _looks_like_foreign_key(table_name, other_table_name, original_name):
-                    fk_like_columns.append(
-
-
-
-
+                    fk_like_columns.append(
+                        {
+                            "column": original_name,
+                            "references_table": other_table_name,
+                            "confidence": 0.8,
+                        }
+                    )
                     break
 
                 # Check if column name contains the other table name
@@ -628,11 +662,13 @@ def _detect_bridge_table_pattern(
 
                 for variant in other_variants:
                     if variant in col_tokens:
-                        fk_like_columns.append(
-
-
-
-
+                        fk_like_columns.append(
+                            {
+                                "column": original_name,
+                                "references_table": other_table_name,
+                                "confidence": 0.6,
+                            }
+                        )
                         break
         else:
             # Count descriptive/non-ID columns
@@ -680,8 +716,18 @@ def _detect_bridge_table_pattern(
     # Name-based heuristics
     table_upper = table_name.upper()
     bridge_keywords = {
-        "BRIDGE",
-        "
+        "BRIDGE",
+        "JUNCTION",
+        "LINK",
+        "ASSOC",
+        "ASSOCIATION",
+        "REL",
+        "RELATIONSHIP",
+        "MAP",
+        "MAPPING",
+        "XREF",
+        "CROSS_REF",
+        "CONNECTOR",
     }
 
     for keyword in bridge_keywords:
@@ -708,7 +754,9 @@ def _detect_bridge_table_pattern(
 
     is_bridge = confidence >= 0.6  # Threshold for bridge table classification
 
-    connected_tables = [
+    connected_tables = [
+        fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5
+    ]
 
     return {
         "is_bridge": is_bridge,
@@ -718,14 +766,14 @@ def _detect_bridge_table_pattern(
         "fk_ratio": fk_ratio,
         "id_ratio": id_ratio,
         "total_columns": total_columns,
-        "descriptive_columns": descriptive_columns
+        "descriptive_columns": descriptive_columns,
     }
 
 
 def _detect_many_to_many_relationships(
     raw_tables: List[tuple[data_types.FQNParts, data_types.Table]],
     metadata: Dict[str, Dict[str, Any]],
-    existing_relationships: List[semantic_model_pb2.Relationship]
+    existing_relationships: List[semantic_model_pb2.Relationship],
 ) -> List[semantic_model_pb2.Relationship]:
     """
     Detect many-to-many relationships through bridge table analysis.
@@ -746,7 +794,10 @@ def _detect_many_to_many_relationships(
     for table_name, table_meta in metadata.items():
         bridge_analysis = _detect_bridge_table_pattern(table_meta, metadata)
 
-        if
+        if (
+            bridge_analysis["is_bridge"]
+            and len(bridge_analysis["connected_tables"]) >= 2
+        ):
             bridge_tables[table_name] = bridge_analysis
 
     logger.debug(
@@ -780,9 +831,15 @@ def _detect_many_to_many_relationships(
         right_fk_cols = []
 
         for fk_info in bridge_info["fk_like_columns"]:
-            if
+            if (
+                fk_info["references_table"] == left_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 left_fk_cols.append(fk_info["column"])
-            elif
+            elif (
+                fk_info["references_table"] == right_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 right_fk_cols.append(fk_info["column"])
 
         if not left_fk_cols or not right_fk_cols:
@@ -806,8 +863,12 @@ def _detect_many_to_many_relationships(
         # Use the first detected FK columns as a representative
         relationship.relationship_columns.append(
             semantic_model_pb2.RelationKey(
-                left_column=left_fk_cols[
-
+                left_column=left_fk_cols[
+                    0
+                ],  # This is actually in the bridge table
+                right_column=right_fk_cols[
+                    0
+                ],  # This is also in the bridge table
             )
         )
 
@@ -863,13 +924,19 @@ def _calculate_relationship_confidence(
         pk_confidence = 0.4
         confidence_score += pk_confidence
         if left_has_pk and right_has_pk:
-            reasoning_factors.append(
+            reasoning_factors.append(
+                "Both sides have primary key metadata (very strong evidence)"
+            )
             evidence_details["pk_evidence"] = "both_pk"
         elif right_has_pk:
-            reasoning_factors.append(
+            reasoning_factors.append(
+                "Right side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "right_pk"
         elif left_has_pk:
-            reasoning_factors.append(
+            reasoning_factors.append(
+                "Left side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "left_pk"
 
     # Factor 2: Name similarity and pattern matching
@@ -884,19 +951,29 @@ def _calculate_relationship_confidence(
 
         if avg_name_similarity >= 0.9:
             name_confidence = 0.25
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Very high column name similarity ({avg_name_similarity:.2f})"
+            )
         elif avg_name_similarity >= 0.7:
             name_confidence = 0.2
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"High column name similarity ({avg_name_similarity:.2f})"
+            )
         elif avg_name_similarity >= 0.5:
             name_confidence = 0.15
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Moderate column name similarity ({avg_name_similarity:.2f})"
+            )
         elif avg_name_similarity >= 0.3:
             name_confidence = 0.1
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Low column name similarity ({avg_name_similarity:.2f})"
+            )
         else:
             name_confidence = 0.05
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Very low column name similarity ({avg_name_similarity:.2f})"
+            )
 
         confidence_score += name_confidence
 
@@ -905,7 +982,9 @@ def _calculate_relationship_confidence(
     for left_col, right_col in column_pairs:
         if _looks_like_foreign_key(left_table, right_table, left_col):
             fk_pattern_confidence += 0.1
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Column '{left_col}' follows FK naming pattern"
+            )
 
     confidence_score += min(fk_pattern_confidence, 0.2)
 
@@ -927,29 +1006,45 @@ def _calculate_relationship_confidence(
 
         # Check if uniqueness pattern matches inferred cardinality
         left_card, right_card = cardinality_result
-        uniqueness_threshold =
+        uniqueness_threshold = (
+            adaptive_thresholds.get("uniqueness_threshold", 0.95)
+            if adaptive_thresholds
+            else 0.95
+        )
 
         cardinality_consistency = False
         if left_card == "1" and left_unique_ratio > uniqueness_threshold:
             cardinality_consistency = True
-        elif
+        elif (
+            left_card in ("*", "+")
+            and left_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = True
 
         if right_card == "1" and right_unique_ratio > uniqueness_threshold:
             cardinality_consistency = cardinality_consistency and True
-        elif
+        elif (
+            right_card in ("*", "+")
+            and right_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = cardinality_consistency and True
 
         if cardinality_consistency:
             uniqueness_confidence = 0.2
-            reasoning_factors.append(
+            reasoning_factors.append(
+                "Sample uniqueness patterns support inferred cardinality"
+            )
         else:
             uniqueness_confidence = 0.1
-            reasoning_factors.append(
+            reasoning_factors.append(
+                "Sample uniqueness patterns partially support cardinality"
+            )
 
         confidence_score += uniqueness_confidence
     else:
-        reasoning_factors.append(
+        reasoning_factors.append(
+            f"Limited sample size ({sample_size}) reduces confidence"
+        )
 
     # Factor 4: Data type compatibility
     if column_pairs and left_meta and right_meta:
@@ -992,15 +1087,21 @@ def _calculate_relationship_confidence(
         evidence_details["left_table_role"] = left_role
         evidence_details["right_table_role"] = right_role
 
-        relationship_context = _get_business_relationship_context(
+        relationship_context = _get_business_relationship_context(
+            left_table, right_table, left_role, right_role
+        )
         evidence_details["relationship_context"] = relationship_context
 
         if relationship_context in ["fact_to_dimension", "dimension_to_fact"]:
             role_confidence = 0.15
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Strong business relationship pattern: {relationship_context}"
+            )
         elif relationship_context in ["dimension_hierarchy", "bridge_relationship"]:
             role_confidence = 0.1
-            reasoning_factors.append(
+            reasoning_factors.append(
+                f"Valid business relationship pattern: {relationship_context}"
+            )
         elif relationship_context == "fact_to_fact":
             role_confidence = 0.05
             reasoning_factors.append("Unusual but possible fact-to-fact relationship")
@@ -1013,7 +1114,9 @@ def _calculate_relationship_confidence(
     # Factor 6: Multiple column relationships (composite keys)
     if len(column_pairs) > 1:
         composite_confidence = 0.1
-        reasoning_factors.append(
+        reasoning_factors.append(
+            f"Multi-column relationship ({len(column_pairs)} columns) increases confidence"
+        )
         confidence_score += composite_confidence
 
     # Normalize confidence score to 0-1 range
@@ -1043,7 +1146,9 @@ def _calculate_relationship_confidence(
         "reasoning_factors": reasoning_factors,
         "evidence_details": evidence_details,
         "inferred_cardinality": f"{cardinality_result[0]}:{cardinality_result[1]}",
-        "join_type":
+        "join_type": (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        ),
         "column_count": len(column_pairs),
     }
 
@@ -1059,101 +1164,196 @@ def _get_domain_knowledge_patterns() -> Dict[str, Any]:
         # Common business entity patterns
         "business_entities": {
             "customer": {
-                "table_patterns": [
-
-
-
+                "table_patterns": [
+                    "CUSTOMER",
+                    "CUST",
+                    "CLIENT",
+                    "ACCOUNT_HOLDER",
+                    "USER",
+                    "MEMBER",
+                ],
+                "pk_patterns": [
+                    "CUSTOMER_ID",
+                    "CUST_ID",
+                    "CLIENT_ID",
+                    "USER_ID",
+                    "MEMBER_ID",
+                ],
+                "typical_attributes": [
+                    "NAME",
+                    "EMAIL",
+                    "PHONE",
+                    "ADDRESS",
+                    "STATUS",
+                    "TYPE",
+                    "SEGMENT",
+                ],
+                "role": "dimension",
             },
             "product": {
                 "table_patterns": ["PRODUCT", "ITEM", "SKU", "INVENTORY", "CATALOG"],
                 "pk_patterns": ["PRODUCT_ID", "ITEM_ID", "SKU", "PRODUCT_KEY"],
-                "typical_attributes": [
-
+                "typical_attributes": [
+                    "NAME",
+                    "DESCRIPTION",
+                    "CATEGORY",
+                    "PRICE",
+                    "BRAND",
+                    "STATUS",
+                ],
+                "role": "dimension",
             },
             "order": {
                 "table_patterns": ["ORDER", "TRANSACTION", "SALE", "PURCHASE"],
-                "pk_patterns": [
+                "pk_patterns": [
+                    "ORDER_ID",
+                    "TRANSACTION_ID",
+                    "SALE_ID",
+                    "ORDER_NUMBER",
+                ],
                 "typical_attributes": ["DATE", "AMOUNT", "STATUS", "QUANTITY", "TOTAL"],
-                "role": "fact"
+                "role": "fact",
             },
             "date": {
                 "table_patterns": ["DATE", "TIME", "CALENDAR", "DIM_DATE"],
                 "pk_patterns": ["DATE_ID", "DATE_KEY", "TIME_ID"],
-                "typical_attributes": [
-
+                "typical_attributes": [
+                    "YEAR",
+                    "MONTH",
+                    "DAY",
+                    "QUARTER",
+                    "WEEK",
+                    "WEEKDAY",
+                ],
+                "role": "dimension",
             },
             "location": {
-                "table_patterns": [
+                "table_patterns": [
+                    "LOCATION",
+                    "GEOGRAPHY",
+                    "ADDRESS",
+                    "REGION",
+                    "TERRITORY",
+                ],
                 "pk_patterns": ["LOCATION_ID", "GEO_ID", "ADDRESS_ID", "REGION_ID"],
-                "typical_attributes": [
-
+                "typical_attributes": [
+                    "COUNTRY",
+                    "STATE",
+                    "CITY",
+                    "ZIP",
+                    "LATITUDE",
+                    "LONGITUDE",
+                ],
+                "role": "dimension",
             },
             "employee": {
                 "table_patterns": ["EMPLOYEE", "STAFF", "WORKER", "PERSONNEL"],
                 "pk_patterns": ["EMPLOYEE_ID", "STAFF_ID", "EMP_ID"],
-                "typical_attributes": [
-
-
+                "typical_attributes": [
+                    "NAME",
+                    "DEPARTMENT",
+                    "TITLE",
+                    "MANAGER",
+                    "HIRE_DATE",
+                ],
+                "role": "dimension",
+            },
         },
-
         # Common relationship patterns in data warehouses
         "relationship_patterns": {
             "star_schema": {
                 "pattern": "fact_to_dimension",
                 "confidence_boost": 0.2,
-                "description": "Standard star schema fact-to-dimension relationship"
+                "description": "Standard star schema fact-to-dimension relationship",
             },
             "snowflake_schema": {
                 "pattern": "dimension_hierarchy",
                 "confidence_boost": 0.15,
-                "description": "Snowflake schema dimension hierarchy"
+                "description": "Snowflake schema dimension hierarchy",
            },
             "bridge_table": {
                 "pattern": "many_to_many_via_bridge",
                 "confidence_boost": 0.1,
-                "description": "Many-to-many relationship through bridge table"
+                "description": "Many-to-many relationship through bridge table",
             },
             "time_dimension": {
                 "pattern": "temporal_relationship",
                 "confidence_boost": 0.25,
-                "description": "Time-based relationship (very common in warehouses)"
-            }
+                "description": "Time-based relationship (very common in warehouses)",
+            },
         },
-
         # Known FK patterns that often appear in real data warehouses
         "common_fk_patterns": {
             "customer_references": [
-                "CUSTOMER_ID",
+                "CUSTOMER_ID",
+                "CUST_ID",
+                "CLIENT_ID",
+                "ACCOUNT_ID",
+                "USER_ID",
             ],
             "product_references": [
-                "PRODUCT_ID",
+                "PRODUCT_ID",
+                "ITEM_ID",
+                "SKU",
+                "PROD_ID",
+                "CATALOG_ID",
             ],
             "date_references": [
-                "DATE_ID",
-                "
+                "DATE_ID",
+                "ORDER_DATE_ID",
+                "SHIP_DATE_ID",
+                "CREATE_DATE_ID",
+                "TRANSACTION_DATE_ID",
+                "DATE_KEY",
             ],
             "location_references": [
-                "LOCATION_ID",
-                "
-
+                "LOCATION_ID",
+                "ADDRESS_ID",
+                "SHIP_TO_ID",
+                "BILL_TO_ID",
+                "WAREHOUSE_ID",
+                "STORE_ID",
+            ],
         },
-
         # Table naming conventions that indicate specific patterns
         "naming_conventions": {
             "fact_indicators": [
-                "FACT_",
-                "
+                "FACT_",
+                "FCT_",
+                "F_",
+                "SALES_",
+                "ORDERS_",
+                "TRANSACTIONS_",
+                "REVENUE_",
+                "METRICS_",
+                "EVENTS_",
+                "ACTIVITY_",
             ],
             "dimension_indicators": [
-                "DIM_",
+                "DIM_",
+                "D_",
+                "REF_",
+                "LKP_",
+                "LOOKUP_",
+                "MASTER_",
             ],
             "bridge_indicators": [
-                "BRG_",
+                "BRG_",
+                "BRIDGE_",
+                "XREF_",
+                "MAP_",
+                "ASSOC_",
+                "LINK_",
             ],
             "staging_indicators": [
-                "STG_",
-
-
+                "STG_",
+                "STAGING_",
+                "TMP_",
+                "TEMP_",
+                "RAW_",
+                "LANDING_",
+            ],
+        },
     }
 
 
@@ -1204,18 +1404,26 @@ def _apply_domain_knowledge(
         if entity_pair in common_pairs:
             boost = common_pairs[entity_pair]
             confidence_boost += boost
-            enhancement_factors.append(
+            enhancement_factors.append(
+                f"Recognized common business pattern: {entity_pair} (+{boost:.2f})"
+            )
         elif f"{right_entity}-{left_entity}" in common_pairs:
             boost = common_pairs[f"{right_entity}-{left_entity}"]
             confidence_boost += boost
-            enhancement_factors.append(
+            enhancement_factors.append(
+                f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})"
+            )
 
     # Factor 2: Check for standard FK naming patterns
     for left_col, right_col in column_pairs:
-        fk_pattern_match = _check_standard_fk_patterns(
+        fk_pattern_match = _check_standard_fk_patterns(
+            left_col, right_col, domain_patterns
+        )
         if fk_pattern_match:
             confidence_boost += 0.15
-            enhancement_factors.append(
+            enhancement_factors.append(
+                f"Standard FK pattern detected: {fk_pattern_match}"
+            )
 
     # Factor 3: Table naming convention analysis
     left_convention = _identify_naming_convention(left_table, domain_patterns)
@@ -1223,8 +1431,9 @@ def _apply_domain_knowledge(
 
     if left_convention and right_convention:
         # Boost confidence for expected patterns
-        if (left_convention == "fact" and right_convention == "dimension") or
-
+        if (left_convention == "fact" and right_convention == "dimension") or (
+            left_convention == "dimension" and right_convention == "fact"
+        ):
             confidence_boost += 0.2
             enhancement_factors.append("Standard fact-dimension naming pattern (+0.20)")
         elif left_convention == "dimension" and right_convention == "dimension":
@@ -1237,12 +1446,20 @@ def _apply_domain_knowledge(
         enhancement_factors.append("Time dimension relationship (very common) (+0.20)")
 
     # Factor 5: Schema pattern recognition (star vs snowflake)
-    schema_pattern = _detect_schema_pattern(
+    schema_pattern = _detect_schema_pattern(
+        left_table, right_table, left_meta, right_meta, domain_patterns
+    )
     if schema_pattern:
-        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
+        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
+            "confidence_boost"
+        ]
         confidence_boost += pattern_boost
-        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
-
+        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
+            "description"
+        ]
+        enhancement_factors.append(
+            f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})"
+        )
 
     # Apply the boost but cap the final confidence at 1.0
     enhanced_confidence = min(current_confidence + confidence_boost, 1.0)
@@ -1259,7 +1476,9 @@ def _apply_domain_knowledge(
     }
 
 
-def _identify_business_entity(
+def _identify_business_entity(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify what business entity a table represents."""
     table_upper = table_name.upper()
     business_entities = domain_patterns["business_entities"]
@@ -1274,13 +1493,18 @@ def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domai
         pk_candidates = table_meta.get("pk_candidates", {})
         for pk_pattern in entity_info["pk_patterns"]:
             for pk_norm in pk_candidates.keys():
-                if
+                if (
+                    pk_pattern.replace("_", "").upper()
+                    in pk_norm.replace("_", "").upper()
+                ):
                     return entity_type
 
     return None
 
 
-def _check_standard_fk_patterns(
+def _check_standard_fk_patterns(
+    left_col: str, right_col: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Check if column pair matches standard FK patterns."""
     common_fks = domain_patterns["common_fk_patterns"]
 
@@ -1295,7 +1519,9 @@ def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns:
     return None
 
 
-def _identify_naming_convention(
+def _identify_naming_convention(
+    table_name: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify the naming convention used for a table."""
     table_upper = table_name.upper()
     naming_conventions = domain_patterns["naming_conventions"]
@@ -1308,7 +1534,9 @@ def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]
     return None
 
 
-def _is_time_dimension_pattern(
+def _is_time_dimension_pattern(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> bool:
     """Check if table follows time dimension patterns."""
     table_upper = table_name.upper()
     time_patterns = domain_patterns["business_entities"]["date"]["table_patterns"]
@@ -1344,15 +1572,16 @@ def _detect_schema_pattern(
     right_table: str,
     left_meta: Dict[str, Any],
     right_meta: Dict[str, Any],
-    domain_patterns: Dict[str, Any]
+    domain_patterns: Dict[str, Any],
 ) -> Optional[str]:
     """Detect common schema patterns (star, snowflake, etc.)."""
     left_role = _detect_table_role(left_table, left_meta)
     right_role = _detect_table_role(right_table, right_meta)
 
     # Star schema pattern: fact table to dimension
-    if (left_role == "fact" and right_role == "dimension") or
-
+    if (left_role == "fact" and right_role == "dimension") or (
+        left_role == "dimension" and right_role == "fact"
+    ):
         return "star_schema"
 
     # Snowflake schema pattern: dimension to dimension
@@ -1360,8 +1589,9 @@ def _detect_schema_pattern(
         return "snowflake_schema"
 
     # Time dimension pattern (very common)
-    if _is_time_dimension_pattern(
-
+    if _is_time_dimension_pattern(
+        right_table, right_meta, domain_patterns
+    ) or _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
         return "time_dimension"
 
     # Bridge table pattern
@@ -1397,7 +1627,9 @@ def _calculate_adaptive_thresholds(
     # Calculate sample statistics
     sample_sizes = [len(vals) for vals in values_list if vals]
     max_sample_size = max(sample_sizes) if sample_sizes else base_sample_size
-    avg_sample_size =
+    avg_sample_size = (
+        sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
+    )
 
     # Calculate data distribution characteristics
     total_unique_values = 0
@@ -1425,7 +1657,7 @@ def _calculate_adaptive_thresholds(
         if len(value_counts) > 1:
             max_freq = max(value_counts.values())
             min_freq = min(value_counts.values())
-            skew = max_freq / min_freq if min_freq > 0 else float(
+            skew = max_freq / min_freq if min_freq > 0 else float("inf")
             skew_ratios.append(skew)
 
     # Calculate overall uniqueness ratio
@@ -1459,7 +1691,9 @@ def _calculate_adaptive_thresholds(
         min_size_adj *= 1.1
 
     # Scale with base sample size from configuration
-    size_scale_factor =
+    size_scale_factor = (
+        min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
+    )
     min_size_adj *= size_scale_factor
 
     thresholds["min_sample_size"] = max(int(base_min_size * min_size_adj), 10)
@@ -1594,8 +1828,12 @@ def _infer_cardinality(
     left_non_null = [v for v in left_values if not _is_nullish(v)]
     right_non_null = [v for v in right_values if not _is_nullish(v)]
 
-    left_unique_ratio =
-
+    left_unique_ratio = (
+        len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
+    )
+    right_unique_ratio = (
+        len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
+    )
 
     # Apply adaptive uniqueness threshold
     left_is_unique = left_unique_ratio > uniqueness_threshold
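To make the cardinality rule in the hunk above concrete: each side's non-null sample values are deduplicated, and a unique ratio above the (adaptive) uniqueness threshold marks that side as the "1" side. A simplified, self-contained sketch follows; the function name, the nullish check, and the fixed 0.95 default are illustrative only, since _infer_cardinality also weighs PK metadata and adaptive thresholds:

def side_cardinality(values, uniqueness_threshold=0.95):
    # Non-null unique ratio; above the threshold -> "1" side, otherwise "many" ("*").
    non_null = [v for v in values if v not in (None, "", "NULL")]
    if not non_null:
        return "*"
    unique_ratio = len(set(non_null)) / len(non_null)
    return "1" if unique_ratio > uniqueness_threshold else "*"

print(side_cardinality(["1", "2", "3", "4"]))       # "1"
print(side_cardinality(["1", "1", "2", "2", "3"]))  # "*"
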
@@ -1691,11 +1929,19 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     Returns:
         str: Table role ('fact', 'dimension', 'bridge', 'staging', 'unknown')
     """
-    upper_name = table_name.upper()
     tokens = _identifier_tokens(table_name)
 
     # Rule 1: Explicit prefixes/suffixes
-    fact_indicators = {
+    fact_indicators = {
+        "FACT",
+        "FCT",
+        "TXN",
+        "TRANSACTION",
+        "EVENT",
+        "LOG",
+        "SALES",
+        "ORDER",
+    }
     dim_indicators = {"DIM", "DIMENSION", "LOOKUP", "REF", "REFERENCE", "MASTER"}
     bridge_indicators = {"BRIDGE", "BRG", "LINK", "JUNCTION", "ASSOC", "ASSOCIATION"}
     staging_indicators = {"STG", "STAGING", "TMP", "TEMP", "WORK", "LANDING", "RAW"}
@@ -1734,9 +1980,22 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
             id_count += 1
 
         # Count measure-like columns (amounts, counts, quantities)
-        if any(
+        if any(
+            word in col_name
+            for word in [
+                "AMOUNT",
+                "QTY",
+                "QUANTITY",
+                "COUNT",
+                "TOTAL",
+                "SUM",
+                "AVG",
+            ]
+        ):
             measure_like_count += 1
-        elif base_type in MEASURE_DATATYPES and not col_info.get(
+        elif base_type in MEASURE_DATATYPES and not col_info.get(
+            "is_identifier", False
+        ):
             measure_like_count += 1
         else:
             dimension_like_count += 1
@@ -1761,7 +2020,9 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     return "unknown"
 
 
-def _get_business_relationship_context(
+def _get_business_relationship_context(
+    left_table: str, right_table: str, left_role: str, right_role: str
+) -> str:
     """
     Determine business relationship context between tables based on their roles.
 
@@ -1833,7 +2094,7 @@ def _infer_join_type(
     4. Naming pattern heuristics
     5. Conservative INNER JOIN default
     """
-
+
     # RULE 1: Default to INNER JOIN (most common and safest)
     default_join = semantic_model_pb2.JoinType.inner
 
@@ -1861,9 +2122,17 @@ def _infer_join_type(
     # Apply business rules based on relationship context
     if relationship_context == "fact_to_dimension":
         # Fact → Dimension: usually INNER, but check for optional dimensions
-        if any(
-
-
+        if any(
+            keyword in right_table.upper()
+            for keyword in [
+                "PROMO",
+                "PROMOTION",
+                "DISCOUNT",
+                "COUPON",
+                "OPTIONAL",
+                "SECONDARY",
+            ]
+        ):
             logger.debug(
                 f"Join type inference for {left_table} -> {right_table}: "
                 f"LEFT_OUTER (fact to optional dimension: {right_role})"
@@ -1907,11 +2176,19 @@ def _infer_join_type(
         return semantic_model_pb2.JoinType.left_outer
 
     # RULE 5: Naming pattern heuristics for optional relationships
-    left_upper = left_table.upper()
     right_upper = right_table.upper()
     optional_keywords = {
-        "OPTIONAL",
-        "
+        "OPTIONAL",
+        "ALTERNATE",
+        "SECONDARY",
+        "BACKUP",
+        "FALLBACK",
+        "PROMO",
+        "PROMOTION",
+        "DISCOUNT",
+        "COUPON",
+        "TEMP",
+        "TMP",
    }
 
     for keyword in optional_keywords:
@@ -1946,12 +2223,17 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
     """
     fk_upper = fk_column.strip().upper()
     pk_table_variants = _table_variants(pk_table)
-
+
     # Pattern 1: {table_name}_id or {table_name}_key
     for variant in pk_table_variants:
-        if fk_upper in {
+        if fk_upper in {
+            f"{variant}_ID",
+            f"{variant}ID",
+            f"{variant}_KEY",
+            f"{variant}KEY",
+        }:
             return True
-
+
     # Pattern 2: Column ends with table name variants
     tokens = _identifier_tokens(fk_column)
     if len(tokens) >= 2:
@@ -1961,21 +2243,23 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
         tail = tokens[-1]
         if tail in {"ID", "KEY"}:
             return True
-
+
     # Pattern 3: Similar to primary key column but with FK table prefix
     # e.g., order_id in order_items table referencing orders.id
     fk_table_variants = _table_variants(fk_table)
     for fk_variant in fk_table_variants:
         if fk_upper.startswith(fk_variant):
-            remainder = fk_upper[len(fk_variant):].lstrip("_")
+            remainder = fk_upper[len(fk_variant) :].lstrip("_")
             for pk_variant in pk_table_variants:
                 if remainder.startswith(pk_variant):
                     return True
-
+
     return False
 
 
-def _suggest_filters(
+def _suggest_filters(
+    raw_table: data_types.Table,
+) -> List[semantic_model_pb2.NamedFilter]:
     suggestions: List[semantic_model_pb2.NamedFilter] = []
     for col in raw_table.columns:
         base_type = _base_type_from_type(col.column_type)
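Pattern 1 of _looks_like_foreign_key, shown above, boils down to checking a column name against <table>_ID / <table>ID / <table>_KEY / <table>KEY variants of the referenced table. A reduced sketch under simplifying assumptions (the crude rstrip("S") singularization and the flat function name are illustrative; the real helper builds variants via _table_variants and also applies token and prefix patterns):

def looks_like_foreign_key(pk_table: str, fk_column: str) -> bool:
    # A column named <table>_ID / <table>ID / <table>_KEY / <table>KEY
    # is treated as a likely foreign key into pk_table.
    variant = pk_table.strip().upper().rstrip("S")  # crude singularization
    fk = fk_column.strip().upper()
    return fk in {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}

print(looks_like_foreign_key("ORDERS", "ORDER_ID"))  # True
print(looks_like_foreign_key("ORDERS", "STATUS"))    # False
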
@@ -2011,12 +2295,20 @@ def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.Nam
         )
         is_textual = base_type in {"STRING", "TEXT", "VARCHAR", "CHAR", "CHARACTER"}
         is_boolean = base_type in {"BOOLEAN"}
-        is_categorical_numeric = base_type in {
-
-
-
-
-
+        is_categorical_numeric = base_type in {
+            "INT",
+            "INTEGER",
+            "NUMBER",
+            "SMALLINT",
+            "BIGINT",
+        } and any(upper_name.endswith(suffix) for suffix in categorical_suffixes)
+
+        if not is_identifier_like and (
+            is_textual or is_boolean or is_categorical_numeric
+        ):
+            formatted = [
+                _format_literal(val, base_type) for val in distinct_values[:5]
+            ]
             expr = f"{col.column_name} IN ({', '.join(formatted)})"
             suggestions.append(
                 semantic_model_pb2.NamedFilter(
@@ -2060,7 +2352,9 @@ def _infer_relationships(
         table_prefixes = global_prefixes | _table_prefixes(raw_table.name)
         for column in raw_table.columns:
             base_type = _base_type_from_type(column.column_type)
-            normalized = _sanitize_identifier_name(
+            normalized = _sanitize_identifier_name(
+                column.column_name, prefixes_to_drop=table_prefixes
+            )
             entry = columns_meta.setdefault(
                 normalized,
                 {
@@ -2075,7 +2369,9 @@ def _infer_relationships(
             entry["names"].append(column.column_name)
             if column.values:
                 entry["values"].extend(column.values)
-            entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
+            entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
+                column.column_name, base_type
+            )
             is_primary = getattr(column, "is_primary_key", False)
             if is_primary:
                 entry["is_primary"] = True
@@ -2093,7 +2389,9 @@ def _infer_relationships(
     pairs: dict[tuple[str, str], List[tuple[str, str]]] = {}
     null_check_cache: Dict[Tuple[str, str, str, str], bool] = {}
 
-    def _record_pair(
+    def _record_pair(
+        left_table: str, right_table: str, left_col: str, right_col: str
+    ) -> None:
         key = (left_table, right_table)
         value = (left_col, right_col)
         if value not in pairs.setdefault(key, []):
@@ -2158,7 +2456,7 @@ def _infer_relationships(
                 continue
             if norm_b == pk_norm:
                 continue
-
+
             # Direct suffix match
             if norm_b.endswith(pk_norm):
                 _record_pair(
@@ -2168,23 +2466,34 @@ def _infer_relationships(
                     pk_cols[0],
                 )
                 continue
-
+
             # Enhanced: Check if column looks like a foreign key to this table
-            if _looks_like_foreign_key(
+            if _looks_like_foreign_key(
+                table_b_name, table_a_name, meta_b["names"][0]
+            ):
                 # Additional check: name similarity with adaptive threshold
                 similarity = _name_similarity(norm_b, pk_norm)
                 # Calculate adaptive threshold for this relationship
                 all_sample_values = []
-                for col_values in [
+                for col_values in [
+                    pk_meta.get("values", []),
+                    meta_b.get("values", []),
+                ]:
                     if col_values:
                         all_sample_values.append(col_values)
 
                 adaptive_thresholds = _calculate_adaptive_thresholds(
                     all_sample_values,
                     table_count=len(raw_tables),
-                    base_sample_size=
+                    base_sample_size=(
+                        len(pk_meta.get("values", []))
+                        if pk_meta.get("values")
+                        else 10
+                    ),
+                )
+                similarity_threshold = adaptive_thresholds.get(
+                    "similarity_threshold", 0.6
                 )
-                similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
 
                 if similarity >= similarity_threshold:
                     _record_pair(
@@ -2204,7 +2513,7 @@ def _infer_relationships(
                 continue
             if norm_a == pk_norm:
                 continue
-
+
             # Direct suffix match
             if norm_a.endswith(pk_norm):
                 _record_pair(
@@ -2214,23 +2523,34 @@ def _infer_relationships(
                     pk_cols[0],
                 )
                 continue
-
+
             # Enhanced: Check if column looks like a foreign key to this table
-            if _looks_like_foreign_key(
+            if _looks_like_foreign_key(
+                table_a_name, table_b_name, meta_a["names"][0]
+            ):
                 # Additional check: name similarity with adaptive threshold
                 similarity = _name_similarity(norm_a, pk_norm)
                 # Calculate adaptive threshold for this relationship
                 all_sample_values = []
-                for col_values in [
+                for col_values in [
+                    pk_meta.get("values", []),
+                    meta_a.get("values", []),
+                ]:
                     if col_values:
                         all_sample_values.append(col_values)
 
                 adaptive_thresholds = _calculate_adaptive_thresholds(
                     all_sample_values,
                     table_count=len(raw_tables),
-                    base_sample_size=
+                    base_sample_size=(
+                        len(pk_meta.get("values", []))
+                        if pk_meta.get("values")
+                        else 10
+                    ),
+                )
+                similarity_threshold = adaptive_thresholds.get(
+                    "similarity_threshold", 0.6
                 )
-                similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
 
                 if similarity >= similarity_threshold:
                     _record_pair(
@@ -2258,7 +2578,7 @@ def _infer_relationships(
         # Infer cardinality based on available metadata
         left_meta = metadata[left_table]
         right_meta = metadata[right_table]
-
+
         # Determine if tables have primary keys in the relationship
         left_has_pk = any(
             col_name in [pair[0] for pair in column_pairs]
@@ -2270,7 +2590,7 @@ def _infer_relationships(
             for pk_list in right_meta["pk_candidates"].values()
             for col_name in pk_list
         )
-
+
         # Enhanced: Get sample values for all columns in the relationship (for composite key analysis)
         left_values_all = []
         right_values_all = []
@@ -2279,12 +2599,11 @@ def _infer_relationships(
 
         for left_col, right_col in column_pairs:
             left_col_key = _sanitize_identifier_name(
-                left_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
+                left_col, prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
             )
             right_col_key = _sanitize_identifier_name(
                 right_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(right_table)
+                prefixes_to_drop=global_prefixes | _table_prefixes(right_table),
             )
 
             left_col_values = []
@@ -2293,7 +2612,9 @@ def _infer_relationships(
             if left_col_key in left_meta["columns"]:
                 left_col_values = left_meta["columns"][left_col_key].get("values") or []
             if right_col_key in right_meta["columns"]:
-                right_col_values =
+                right_col_values = (
+                    right_meta["columns"][right_col_key].get("values") or []
+                )
 
             left_values_all.append(left_col_values)
             right_values_all.append(right_col_values)
@@ -2322,7 +2643,7 @@ def _infer_relationships(
             right_has_pk,
             adaptive_thresholds=global_adaptive_thresholds,
         )
-
+
         # Determine if SQL null probe should be executed for stricter inference
        strict_fk_detected = False
        if strict_join_inference and session:
@@ -2352,7 +2673,7 @@ def _infer_relationships(
             left_table_meta=left_meta,
             right_table_meta=right_meta,
         )
-
+
         # Calculate confidence and reasoning for this relationship
        confidence_analysis = _calculate_relationship_confidence(
            left_table=left_table,
@@ -2376,45 +2697,54 @@ def _infer_relationships(
             column_pairs=column_pairs,
             left_meta=left_meta,
             right_meta=right_meta,
-            current_confidence=confidence_analysis[
+            current_confidence=confidence_analysis["confidence_score"],
         )
 
         # Update confidence analysis with domain knowledge
-        if domain_enhancement[
-            confidence_analysis[
-
+        if domain_enhancement["confidence_boost"] > 0:
+            confidence_analysis["confidence_score"] = min(
+                1.0,
+                confidence_analysis["confidence_score"]
+                + domain_enhancement["confidence_boost"],
+            )
 
             # Add domain knowledge factors to reasoning
-            for domain_factor in domain_enhancement[
-                confidence_analysis[
+            for domain_factor in domain_enhancement["domain_factors"]:
+                confidence_analysis["reasoning_factors"].append(
+                    f"Domain knowledge: {domain_factor}"
+                )
 
             # Update confidence level based on new score
-            if confidence_analysis[
-                confidence_analysis[
-                confidence_analysis[
-            elif confidence_analysis[
-                confidence_analysis[
-                confidence_analysis[
-            elif confidence_analysis[
-                confidence_analysis[
-                confidence_analysis[
-            elif confidence_analysis[
-                confidence_analysis[
-                confidence_analysis[
+            if confidence_analysis["confidence_score"] >= 0.8:
+                confidence_analysis["confidence_level"] = "very_high"
+                confidence_analysis["confidence_description"] = "Very High Confidence"
+            elif confidence_analysis["confidence_score"] >= 0.6:
+                confidence_analysis["confidence_level"] = "high"
+                confidence_analysis["confidence_description"] = "High Confidence"
+            elif confidence_analysis["confidence_score"] >= 0.4:
+                confidence_analysis["confidence_level"] = "medium"
+                confidence_analysis["confidence_description"] = "Medium Confidence"
+            elif confidence_analysis["confidence_score"] >= 0.2:
+                confidence_analysis["confidence_level"] = "low"
+                confidence_analysis["confidence_description"] = "Low Confidence"
             else:
-                confidence_analysis[
-                confidence_analysis[
+                confidence_analysis["confidence_level"] = "very_low"
+                confidence_analysis["confidence_description"] = "Very Low Confidence"
 
         # Enhanced logging with confidence and reasoning
         sample_info = f"samples: L={len(left_values)}, R={len(right_values)}"
         pk_info = f"PKs: L={left_has_pk}, R={right_has_pk}"
-        join_type_name =
+        join_type_name = (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        )
         confidence_info = f"confidence: {confidence_analysis['confidence_score']:.2f} ({confidence_analysis['confidence_level']})"
 
         # Add domain knowledge info if applied
         domain_info = ""
-        if domain_enhancement[
-            domain_info =
+        if domain_enhancement["confidence_boost"] > 0:
+            domain_info = (
+                f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
+            )
 
         logger.info(
             f"Relationship inference for {left_table} -> {right_table}: "
@@ -2423,22 +2753,30 @@ def _infer_relationships(
         )
 
         # Log domain knowledge patterns if detected
-        domain_factors = [
+        domain_factors = [
+            f
+            for f in confidence_analysis["reasoning_factors"]
+            if f.startswith("Domain knowledge:")
+        ]
         if domain_factors:
-            logger.debug(
+            logger.debug(
+                f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}"
+            )
 
         # Log detailed reasoning for medium or lower confidence relationships
-        if confidence_analysis[
+        if confidence_analysis["confidence_score"] < 0.6:
             logger.debug(f"Confidence reasoning for {left_table} -> {right_table}:")
-            for factor in confidence_analysis[
+            for factor in confidence_analysis["reasoning_factors"]:
                 logger.debug(f"  - {factor}")
 
         # Log very high confidence relationships with their evidence
-        elif confidence_analysis[
-            logger.debug(
-
+        elif confidence_analysis["confidence_score"] >= 0.8:
+            logger.debug(
+                f"High confidence relationship {left_table} -> {right_table} based on:"
+            )
+            for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
                 logger.debug(f"  + {factor}")
-
+
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
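For orientation, the score-to-level banding used in the hunks above maps the numeric confidence onto labels before logging. A minimal sketch of that mapping (the standalone function name is illustrative, not part of the package API):

def confidence_level(score: float) -> str:
    # Same bands as the diff: >=0.8 very_high, >=0.6 high, >=0.4 medium,
    # >=0.2 low, else very_low.
    if score >= 0.8:
        return "very_high"
    if score >= 0.6:
        return "high"
    if score >= 0.4:
        return "medium"
    if score >= 0.2:
        return "low"
    return "very_low"

print(confidence_level(0.73))  # high
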
@@ -2449,7 +2787,7 @@ def _infer_relationships(
         else:
             # Default to many_to_one for backward compatibility
             rel_type = semantic_model_pb2.RelationshipType.many_to_one
-
+
         relationship = semantic_model_pb2.Relationship(
             name=f"{left_table}_to_{right_table}",
             left_table=left_table,
@@ -2472,9 +2810,13 @@ def _infer_relationships(
 
     if many_to_many_relationships:
         relationships.extend(many_to_many_relationships)
-        logger.info(
+        logger.info(
+            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+        )
 
-    logger.info(
+    logger.info(
+        f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
+    )
     return relationships
 
 
@@ -2512,7 +2854,14 @@ def _raw_table_to_semantic_context_table(
         base_type = _base_type_from_type(col.column_type)
         if _is_time_like_column(col):
             time_data_type = col.column_type
-            if time_data_type.split("(")[0].upper() in {
+            if time_data_type.split("(")[0].upper() in {
+                "STRING",
+                "VARCHAR",
+                "TEXT",
+                "CHAR",
+                "CHARACTER",
+                "NVARCHAR",
+            }:
                 time_data_type = "TIMESTAMP_NTZ"
             time_dimension_name = _safe_semantic_identifier(
                 col.column_name,
@@ -2564,7 +2913,9 @@ def _raw_table_to_semantic_context_table(
                     data_type=col.column_type,
                     sample_values=col.values,
                     synonyms=[_PLACEHOLDER_COMMENT],
-                    description=
+                    description=(
+                        col.comment if col.comment else _PLACEHOLDER_COMMENT
+                    ),
                 )
             )
             continue
@@ -2685,7 +3036,9 @@ def raw_schema_to_semantic_context(
             unique_database_schema.append(fqn_databse_schema)
 
         logger.info(f"Pulling column information from {fqn_table}")
-        _notify(
+        _notify(
+            f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}..."
+        )
         valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
             session=conn,
             workspace=fqn_table.database,
@@ -2751,7 +3104,9 @@ def raw_schema_to_semantic_context(
             semantic_model_name,
             actual_model,
         )
-        _notify(
+        _notify(
+            "Running DashScope enrichment to enhance descriptions and metrics..."
+        )
 
         # Create progress tracker for enrichment
         def enrichment_progress_callback(update):
@@ -2760,14 +3115,16 @@ def raw_schema_to_semantic_context(
                 EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
                 EnrichmentStage.MODEL_METRICS: "Generating model-level metrics",
                 EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
-                EnrichmentStage.COMPLETE: "Enrichment complete"
+                EnrichmentStage.COMPLETE: "Enrichment complete",
             }
 
             base_message = stage_messages.get(update.stage, "Processing")
             if update.table_name:
                 message = f"{base_message} - {update.table_name} ({update.current_step}/{update.total_steps})"
             elif update.total_steps > 1:
-                message =
+                message = (
+                    f"{base_message} ({update.current_step}/{update.total_steps})"
+                )
             else:
                 message = base_message
 
@@ -2801,7 +3158,9 @@ def raw_schema_to_semantic_context(
             )
             _notify("DashScope enrichment complete.")
         else:
-            logger.warning(
+            logger.warning(
+                "LLM enrichment was requested but DashScope is not configured; skipping enrichment."
+            )
             _notify("DashScope configuration missing; skipped enrichment.")
     return context
 
@@ -2938,6 +3297,7 @@ def generate_model_str_from_clickzetta(
     Returns:
         str: The raw string of the semantic context.
     """
+
     def _notify(message: str) -> None:
         if progress_callback:
             try:
@@ -2946,7 +3306,11 @@ def generate_model_str_from_clickzetta(
                 logger.debug("Progress callback failed for message: {}", message)
 
     table_list = ", ".join(base_tables)
-    logger.info(
+    logger.info(
+        "Generating semantic model '{}' from tables: {}",
+        semantic_model_name,
+        table_list,
+    )
     _notify("Collecting metadata from ClickZetta tables...")
 
     context = raw_schema_to_semantic_context(