clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/METADATA +5 -5
- clickzetta_semantic_model_generator-1.0.4.dist-info/RECORD +38 -0
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py +100 -48
- semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
- semantic_model_generator/clickzetta_utils/utils.py +44 -2
- semantic_model_generator/data_processing/cte_utils.py +44 -14
- semantic_model_generator/generate_model.py +711 -239
- semantic_model_generator/llm/dashscope_client.py +4 -2
- semantic_model_generator/llm/enrichment.py +144 -57
- semantic_model_generator/llm/progress_tracker.py +16 -15
- semantic_model_generator/relationships/__init__.py +2 -0
- semantic_model_generator/relationships/discovery.py +181 -16
- semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
- semantic_model_generator/tests/cte_utils_test.py +15 -14
- semantic_model_generator/tests/generate_model_classification_test.py +12 -2
- semantic_model_generator/tests/llm_enrichment_test.py +152 -46
- semantic_model_generator/tests/relationship_discovery_test.py +70 -3
- semantic_model_generator/tests/relationships_filters_test.py +166 -30
- semantic_model_generator/tests/utils_test.py +1 -1
- semantic_model_generator/validate/keywords.py +453 -53
- semantic_model_generator/validate/schema.py +4 -2
- clickzetta_semantic_model_generator-1.0.2.dist-info/RECORD +0 -38
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/WHEEL +0 -0
semantic_model_generator/generate_model.py
@@ -1,6 +1,7 @@
+import math
 import os
 import re
-import
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -8,8 +9,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from clickzetta.zettapark.session import Session
 from loguru import logger
 
-from semantic_model_generator.data_processing import data_types, proto_utils
-from semantic_model_generator.protos import semantic_model_pb2
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     AUTOGEN_TOKEN,
     DIMENSION_DATATYPES,
@@ -19,15 +18,25 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import
-
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
+from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
     DashscopeSettings,
     enrich_semantic_model,
     get_dashscope_settings,
 )
-from semantic_model_generator.llm.progress_tracker import
+from semantic_model_generator.llm.progress_tracker import (
+    EnrichmentProgressTracker,
+    EnrichmentStage,
+)
+from semantic_model_generator.protos import semantic_model_pb2
+from semantic_model_generator.validate.context_length import validate_context_length
 from semantic_model_generator.validate.keywords import CZ_RESERVED_WORDS
 
 _PLACEHOLDER_COMMENT = " "
@@ -38,6 +47,15 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
+
 
 def _singularize(token: str) -> str:
     if token.endswith("IES") and len(token) > 3:
@@ -68,7 +86,9 @@ def _base_type_from_type(column_type: str) -> str:
     return token.split("(")[0]
 
 
-def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -> List[str]:
+def _identifier_tokens(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> List[str]:
     name = name.replace("-", "_")
     raw_tokens = re.split(r"[^0-9A-Za-z]+", name)
     tokens: List[str] = []
@@ -84,7 +104,17 @@ def _identifier_tokens(name: str, prefixes_to_drop: Optional[set[str]] = None) -
     return tokens
 
 
-def _sanitize_identifier_name(name: str, prefixes_to_drop: Optional[set[str]] = None) -> str:
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
+
+
+def _sanitize_identifier_name(
+    name: str, prefixes_to_drop: Optional[set[str]] = None
+) -> str:
     if not name:
         return ""
 
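The newly added `_is_generic_identifier` guard (used further down to penalize low-information join keys) returns True when every token in a name comes from `_GENERIC_IDENTIFIER_TOKENS`. A minimal standalone sketch of the same idea — the trivial tokenizer here is an assumption; the real `_identifier_tokens` also singularizes tokens and can drop table prefixes:

```python
import re

GENERIC_TOKENS = {"ID", "NAME", "CODE", "KEY", "VALUE", "NUMBER"}


def is_generic_identifier(name: str) -> bool:
    # Split on any non-alphanumeric run, as _identifier_tokens does.
    tokens = [t for t in re.split(r"[^0-9A-Za-z]+", name) if t]
    if not tokens:
        return True  # empty names carry no information
    return {t.upper() for t in tokens}.issubset(GENERIC_TOKENS)


assert is_generic_identifier("ID")               # single generic token
assert is_generic_identifier("key_value")        # every token is generic
assert not is_generic_identifier("CUSTOMER_ID")  # CUSTOMER is specific
```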
@@ -271,7 +301,9 @@ def _looks_like_primary_key(table_name: str, column_name: str) -> bool:
         "PRIMARY_KEY",
     }
     for variant in variants:
-        direct_matches.update({f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"})
+        direct_matches.update(
+            {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}
+        )
     if upper_name in direct_matches:
         return True
 
@@ -344,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    if not name:
-        return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 
 
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table)
-    return
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 
 
 def _levenshtein_distance(s1: str, s2: str) -> int:
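Both helpers now delegate to the shared utilities imported from `clickzetta_utils/utils.py` (which also changed in this release). A rough sketch of the assumed behavior — backtick-quoted parts joined with dots; the real `quote_identifier` and `join_quoted_identifiers` may handle escaping and casing differently:

```python
def quote_identifier(name: str) -> str:
    # Assumed behavior: strip stray backticks, then wrap in backticks.
    return "`" + str(name).replace("`", "").strip() + "`"


def join_quoted_identifiers(*parts: str) -> str:
    # Assumed behavior: quote each part and join with dots.
    return ".".join(quote_identifier(part) for part in parts)


print(join_quoted_identifiers("analytics", "public", "orders"))
# prints: `analytics`.`public`.`orders`
```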
@@ -368,7 +398,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
         return _levenshtein_distance(s2, s1)
     if len(s2) == 0:
         return len(s1)
-
+
     previous_row = range(len(s2) + 1)
     for i, c1 in enumerate(s1):
         current_row = [i + 1]
@@ -378,7 +408,7 @@ def _levenshtein_distance(s1: str, s2: str) -> int:
             substitutions = previous_row[j] + (c1 != c2)
             current_row.append(min(insertions, deletions, substitutions))
         previous_row = current_row
-
+
     return previous_row[-1]
 
 
@@ -389,26 +419,26 @@ def _name_similarity(name1: str, name2: str) -> float:
     """
     if not name1 or not name2:
         return 0.0
-
+
     # Exact match
     if name1.upper() == name2.upper():
         return 1.0
-
+
     # Normalize names for comparison
     norm1 = name1.upper().replace("_", "").replace("-", "")
     norm2 = name2.upper().replace("_", "").replace("-", "")
-
+
     if norm1 == norm2:
         return 0.95
-
+
     # Calculate Levenshtein-based similarity
     max_len = max(len(norm1), len(norm2))
     if max_len == 0:
         return 0.0
-
+
     distance = _levenshtein_distance(norm1, norm2)
     similarity = 1.0 - (distance / max_len)
-
+
     return max(0.0, similarity)
 
 
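A worked example of the score `_name_similarity` produces when neither the exact-match (1.0) nor the normalized-match (0.95) shortcut fires. For "CUSTOMER_ID" vs "CUST_ID" the normalized forms are CUSTOMERID and CUSTID, so the edit distance is 4 and the score is 1 - 4/10 = 0.6:

```python
def levenshtein(s1: str, s2: str) -> int:
    # Same dynamic-programming recurrence as _levenshtein_distance above.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


norm1, norm2 = "CUSTOMERID", "CUSTID"  # "CUSTOMER_ID" vs "CUST_ID", normalized
similarity = 1.0 - levenshtein(norm1, norm2) / max(len(norm1), len(norm2))
assert abs(similarity - 0.6) < 1e-9  # distance 4 (drop O, M, E, R) over length 10
```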
@@ -427,17 +457,24 @@ def _analyze_composite_key_patterns(
         Dict with composite key analysis results
     """
     pk_candidates = table_meta.get("pk_candidates", {})
-    columns_meta = table_meta.get("columns", {})
 
     # Check if all relationship columns form a composite key
-    relationship_cols = [pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs]
+    relationship_cols = [
+        pair[0] if isinstance(pair, tuple) else pair for pair in column_pairs
+    ]
 
     # Normalize column names for comparison
     global_prefixes = set()  # This should come from context but we'll handle it locally
-    table_prefixes = _table_prefixes(list(table_meta.get("columns", {}).keys())[0] if table_meta.get("columns") else "")
+    table_prefixes = _table_prefixes(
+        list(table_meta.get("columns", {}).keys())[0]
+        if table_meta.get("columns")
+        else ""
+    )
 
     normalized_rel_cols = [
-        _sanitize_identifier_name(col, prefixes_to_drop=global_prefixes | table_prefixes)
+        _sanitize_identifier_name(
+            col, prefixes_to_drop=global_prefixes | table_prefixes
+        )
         for col in relationship_cols
     ]
 
@@ -448,7 +485,9 @@ def _analyze_composite_key_patterns(
     analysis = {
         "is_composite_pk": pk_col_count > 1 and pk_col_count == total_pk_candidates,
         "partial_pk": pk_col_count > 0 and pk_col_count < total_pk_candidates,
-        "pk_coverage_ratio": pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0,
+        "pk_coverage_ratio": (
+            pk_col_count / total_pk_candidates if total_pk_candidates > 0 else 0
+        ),
         "relationship_column_count": len(relationship_cols),
         "pk_column_count": pk_col_count,
     }
@@ -457,7 +496,10 @@ def _analyze_composite_key_patterns(
     if len(relationship_cols) > 1:
         sequential_patterns = []
         for col in relationship_cols:
-            if any(
+            if any(
+                pattern in col.upper()
+                for pattern in ["_ID", "ID", "_KEY", "KEY", "_NUM", "NUM"]
+            ):
                 sequential_patterns.append(col)
 
     analysis["sequential_id_pattern"] = len(sequential_patterns) >= 2
@@ -504,9 +546,12 @@ def _infer_composite_cardinality(
     # Rule 3: Composite key uniqueness analysis (if we have sufficient samples)
     MIN_SAMPLE_SIZE = 20  # Lower threshold for composite keys
 
-    if (
-
-
+    if (
+        left_values_all
+        and right_values_all
+        and len(left_values_all) >= MIN_SAMPLE_SIZE
+        and len(right_values_all) >= MIN_SAMPLE_SIZE
+    ):
 
         # Create composite keys by concatenating values
         left_composite_keys = []
@@ -515,10 +560,12 @@ def _infer_composite_cardinality(
         sample_size = min(len(left_values_all), len(right_values_all))
 
         for i in range(sample_size):
-            left_key = "|".join(
-
-
-
+            left_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in left_values_all
+            )
+            right_key = "|".join(
+                str(vals[i]) if i < len(vals) else "" for vals in right_values_all
+            )
 
             if left_key and not _is_nullish(left_key):
                 left_composite_keys.append(left_key)
@@ -527,7 +574,9 @@ def _infer_composite_cardinality(
 
         if left_composite_keys and right_composite_keys:
             left_unique_ratio = len(set(left_composite_keys)) / len(left_composite_keys)
-            right_unique_ratio = len(set(right_composite_keys)) / len(right_composite_keys)
+            right_unique_ratio = len(set(right_composite_keys)) / len(
+                right_composite_keys
+            )
 
             # Lower threshold for composite key uniqueness
             if right_unique_ratio > 0.9:
@@ -561,6 +610,7 @@ def _infer_composite_cardinality(
         adaptive_thresholds=adaptive_thresholds,
     )
 
+
 def _detect_bridge_table_pattern(
     table_meta: Dict[str, Any],
     all_tables_meta: Dict[str, Dict[str, Any]],
@@ -606,7 +656,9 @@ def _detect_bridge_table_pattern(
         base_type = col_info.get("base_type", "")
 
         # Check if column looks like an ID/foreign key
-        if any(pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]):
+        if any(
+            pattern in original_name.upper() for pattern in ["_ID", "ID", "_KEY", "KEY"]
+        ):
             id_columns.append(original_name)
 
         # Check if this could be a foreign key to another table
@@ -615,11 +667,13 @@ def _detect_bridge_table_pattern(
                 continue
 
             if _looks_like_foreign_key(table_name, other_table_name, original_name):
-                fk_like_columns.append(
-
-
-
-
+                fk_like_columns.append(
+                    {
+                        "column": original_name,
+                        "references_table": other_table_name,
+                        "confidence": 0.8,
+                    }
+                )
                 break
 
         # Check if column name contains the other table name
@@ -628,11 +682,13 @@ def _detect_bridge_table_pattern(
 
         for variant in other_variants:
             if variant in col_tokens:
-                fk_like_columns.append(
-
-
-
-
+                fk_like_columns.append(
+                    {
+                        "column": original_name,
+                        "references_table": other_table_name,
+                        "confidence": 0.6,
+                    }
+                )
                 break
         else:
             # Count descriptive/non-ID columns
@@ -680,8 +736,18 @@ def _detect_bridge_table_pattern(
     # Name-based heuristics
     table_upper = table_name.upper()
    bridge_keywords = {
-        "BRIDGE",
-        "
+        "BRIDGE",
+        "JUNCTION",
+        "LINK",
+        "ASSOC",
+        "ASSOCIATION",
+        "REL",
+        "RELATIONSHIP",
+        "MAP",
+        "MAPPING",
+        "XREF",
+        "CROSS_REF",
+        "CONNECTOR",
     }
 
     for keyword in bridge_keywords:
@@ -708,7 +774,9 @@ def _detect_bridge_table_pattern(
 
     is_bridge = confidence >= 0.6  # Threshold for bridge table classification
 
-    connected_tables = [
+    connected_tables = [
+        fk["references_table"] for fk in fk_like_columns if fk["confidence"] >= 0.5
+    ]
 
     return {
         "is_bridge": is_bridge,
@@ -718,14 +786,14 @@ def _detect_bridge_table_pattern(
         "fk_ratio": fk_ratio,
         "id_ratio": id_ratio,
         "total_columns": total_columns,
-        "descriptive_columns": descriptive_columns
+        "descriptive_columns": descriptive_columns,
     }
 
 
 def _detect_many_to_many_relationships(
     raw_tables: List[tuple[data_types.FQNParts, data_types.Table]],
     metadata: Dict[str, Dict[str, Any]],
-    existing_relationships: List[semantic_model_pb2.Relationship]
+    existing_relationships: List[semantic_model_pb2.Relationship],
 ) -> List[semantic_model_pb2.Relationship]:
     """
     Detect many-to-many relationships through bridge table analysis.
@@ -746,7 +814,10 @@ def _detect_many_to_many_relationships(
     for table_name, table_meta in metadata.items():
         bridge_analysis = _detect_bridge_table_pattern(table_meta, metadata)
 
-        if bridge_analysis["is_bridge"] and len(bridge_analysis["connected_tables"]) >= 2:
+        if (
+            bridge_analysis["is_bridge"]
+            and len(bridge_analysis["connected_tables"]) >= 2
+        ):
            bridge_tables[table_name] = bridge_analysis
 
    logger.debug(
@@ -780,9 +851,15 @@ def _detect_many_to_many_relationships(
         right_fk_cols = []
 
         for fk_info in bridge_info["fk_like_columns"]:
-            if fk_info["references_table"] == left_table and fk_info["confidence"] >= 0.5:
+            if (
+                fk_info["references_table"] == left_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 left_fk_cols.append(fk_info["column"])
-            elif fk_info["references_table"] == right_table and fk_info["confidence"] >= 0.5:
+            elif (
+                fk_info["references_table"] == right_table
+                and fk_info["confidence"] >= 0.5
+            ):
                 right_fk_cols.append(fk_info["column"])
 
         if not left_fk_cols or not right_fk_cols:
@@ -806,8 +883,12 @@ def _detect_many_to_many_relationships(
         # Use the first detected FK columns as a representative
         relationship.relationship_columns.append(
             semantic_model_pb2.RelationKey(
-                left_column=left_fk_cols[0],  # This is actually in the bridge table
-                right_column=right_fk_cols[0],  # This is also in the bridge table
+                left_column=left_fk_cols[
+                    0
+                ],  # This is actually in the bridge table
+                right_column=right_fk_cols[
+                    0
+                ],  # This is also in the bridge table
             )
         )
 
@@ -863,13 +944,19 @@ def _calculate_relationship_confidence(
         pk_confidence = 0.4
         confidence_score += pk_confidence
         if left_has_pk and right_has_pk:
-            reasoning_factors.append("Both sides have primary key metadata (very strong evidence)")
+            reasoning_factors.append(
+                "Both sides have primary key metadata (very strong evidence)"
+            )
             evidence_details["pk_evidence"] = "both_pk"
         elif right_has_pk:
-            reasoning_factors.append("Right side has primary key metadata (strong evidence)")
+            reasoning_factors.append(
+                "Right side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "right_pk"
         elif left_has_pk:
-            reasoning_factors.append("Left side has primary key metadata (strong evidence)")
+            reasoning_factors.append(
+                "Left side has primary key metadata (strong evidence)"
+            )
             evidence_details["pk_evidence"] = "left_pk"
 
     # Factor 2: Name similarity and pattern matching
@@ -884,28 +971,53 @@ def _calculate_relationship_confidence(
 
     if avg_name_similarity >= 0.9:
         name_confidence = 0.25
-        reasoning_factors.append(f"Very high column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Very high column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.7:
         name_confidence = 0.2
-        reasoning_factors.append(f"High column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"High column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.5:
         name_confidence = 0.15
-        reasoning_factors.append(f"Moderate column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Moderate column name similarity ({avg_name_similarity:.2f})"
+        )
     elif avg_name_similarity >= 0.3:
         name_confidence = 0.1
-        reasoning_factors.append(f"Low column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Low column name similarity ({avg_name_similarity:.2f})"
+        )
     else:
         name_confidence = 0.05
-        reasoning_factors.append(f"Very low column name similarity ({avg_name_similarity:.2f})")
+        reasoning_factors.append(
+            f"Very low column name similarity ({avg_name_similarity:.2f})"
+        )
 
     confidence_score += name_confidence
 
+    generic_pair_count = sum(
+        1
+        for left_col, right_col in column_pairs
+        if _is_generic_identifier(left_col)
+        and _is_generic_identifier(right_col)
+    )
+    if generic_pair_count:
+        penalty = min(0.15 * generic_pair_count, 0.3)
+        confidence_score = max(confidence_score - penalty, 0.0)
+        reasoning_factors.append(
+            f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
+        )
+
     # Check for foreign key naming patterns
     fk_pattern_confidence = 0.0
     for left_col, right_col in column_pairs:
         if _looks_like_foreign_key(left_table, right_table, left_col):
             fk_pattern_confidence += 0.1
-            reasoning_factors.append(f"Column '{left_col}' follows FK naming pattern")
+            reasoning_factors.append(
+                f"Column '{left_col}' follows FK naming pattern"
+            )
 
     confidence_score += min(fk_pattern_confidence, 0.2)
 
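The new penalty is easy to trace by hand: each column pair whose names are generic on both sides subtracts 0.15, the total deduction is capped at 0.3, and the score never drops below 0. For example, a raw score of 0.55 with two such pairs lands at 0.25:

```python
confidence_score = 0.55
generic_pair_count = 2  # e.g. hypothetical pairs ("ID", "ID") and ("CODE", "KEY")
penalty = min(0.15 * generic_pair_count, 0.3)  # 0.30 -- the cap kicks in
confidence_score = max(confidence_score - penalty, 0.0)
assert abs(confidence_score - 0.25) < 1e-9
```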
@@ -927,29 +1039,45 @@ def _calculate_relationship_confidence(
 
         # Check if uniqueness pattern matches inferred cardinality
         left_card, right_card = cardinality_result
-        uniqueness_threshold = adaptive_thresholds.get("uniqueness_threshold", 0.95) if adaptive_thresholds else 0.95
+        uniqueness_threshold = (
+            adaptive_thresholds.get("uniqueness_threshold", 0.95)
+            if adaptive_thresholds
+            else 0.95
+        )
 
         cardinality_consistency = False
         if left_card == "1" and left_unique_ratio > uniqueness_threshold:
             cardinality_consistency = True
-        elif left_card in ("*", "+") and left_unique_ratio <= uniqueness_threshold:
+        elif (
+            left_card in ("*", "+")
+            and left_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = True
 
         if right_card == "1" and right_unique_ratio > uniqueness_threshold:
             cardinality_consistency = cardinality_consistency and True
-        elif right_card in ("*", "+") and right_unique_ratio <= uniqueness_threshold:
+        elif (
+            right_card in ("*", "+")
+            and right_unique_ratio <= uniqueness_threshold
+        ):
             cardinality_consistency = cardinality_consistency and True
 
         if cardinality_consistency:
             uniqueness_confidence = 0.2
-            reasoning_factors.append("Sample uniqueness patterns support inferred cardinality")
+            reasoning_factors.append(
+                "Sample uniqueness patterns support inferred cardinality"
+            )
         else:
             uniqueness_confidence = 0.1
-            reasoning_factors.append("Sample uniqueness patterns partially support cardinality")
+            reasoning_factors.append(
+                "Sample uniqueness patterns partially support cardinality"
+            )
 
         confidence_score += uniqueness_confidence
     else:
-        reasoning_factors.append(f"Limited sample size ({sample_size}) reduces confidence")
+        reasoning_factors.append(
+            f"Limited sample size ({sample_size}) reduces confidence"
+        )
 
     # Factor 4: Data type compatibility
     if column_pairs and left_meta and right_meta:
@@ -992,15 +1120,21 @@ def _calculate_relationship_confidence(
     evidence_details["left_table_role"] = left_role
     evidence_details["right_table_role"] = right_role
 
-    relationship_context = _get_business_relationship_context(left_table, right_table, left_role, right_role)
+    relationship_context = _get_business_relationship_context(
+        left_table, right_table, left_role, right_role
+    )
     evidence_details["relationship_context"] = relationship_context
 
     if relationship_context in ["fact_to_dimension", "dimension_to_fact"]:
         role_confidence = 0.15
-        reasoning_factors.append(f"Strong business relationship pattern: {relationship_context}")
+        reasoning_factors.append(
+            f"Strong business relationship pattern: {relationship_context}"
+        )
     elif relationship_context in ["dimension_hierarchy", "bridge_relationship"]:
         role_confidence = 0.1
-        reasoning_factors.append(f"Valid business relationship pattern: {relationship_context}")
+        reasoning_factors.append(
+            f"Valid business relationship pattern: {relationship_context}"
+        )
     elif relationship_context == "fact_to_fact":
         role_confidence = 0.05
         reasoning_factors.append("Unusual but possible fact-to-fact relationship")
@@ -1013,7 +1147,9 @@ def _calculate_relationship_confidence(
     # Factor 6: Multiple column relationships (composite keys)
     if len(column_pairs) > 1:
         composite_confidence = 0.1
-        reasoning_factors.append(f"Multi-column relationship ({len(column_pairs)} columns) increases confidence")
+        reasoning_factors.append(
+            f"Multi-column relationship ({len(column_pairs)} columns) increases confidence"
+        )
         confidence_score += composite_confidence
 
     # Normalize confidence score to 0-1 range
@@ -1043,7 +1179,9 @@ def _calculate_relationship_confidence(
         "reasoning_factors": reasoning_factors,
         "evidence_details": evidence_details,
         "inferred_cardinality": f"{cardinality_result[0]}:{cardinality_result[1]}",
-        "join_type": "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER",
+        "join_type": (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        ),
         "column_count": len(column_pairs),
     }
@@ -1059,101 +1197,196 @@ def _get_domain_knowledge_patterns() -> Dict[str, Any]:
     # Common business entity patterns
     "business_entities": {
         "customer": {
-            "table_patterns": [
-
-
-
+            "table_patterns": [
+                "CUSTOMER",
+                "CUST",
+                "CLIENT",
+                "ACCOUNT_HOLDER",
+                "USER",
+                "MEMBER",
+            ],
+            "pk_patterns": [
+                "CUSTOMER_ID",
+                "CUST_ID",
+                "CLIENT_ID",
+                "USER_ID",
+                "MEMBER_ID",
+            ],
+            "typical_attributes": [
+                "NAME",
+                "EMAIL",
+                "PHONE",
+                "ADDRESS",
+                "STATUS",
+                "TYPE",
+                "SEGMENT",
+            ],
+            "role": "dimension",
         },
         "product": {
             "table_patterns": ["PRODUCT", "ITEM", "SKU", "INVENTORY", "CATALOG"],
             "pk_patterns": ["PRODUCT_ID", "ITEM_ID", "SKU", "PRODUCT_KEY"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "NAME",
+                "DESCRIPTION",
+                "CATEGORY",
+                "PRICE",
+                "BRAND",
+                "STATUS",
+            ],
+            "role": "dimension",
         },
         "order": {
             "table_patterns": ["ORDER", "TRANSACTION", "SALE", "PURCHASE"],
-            "pk_patterns": [
+            "pk_patterns": [
+                "ORDER_ID",
+                "TRANSACTION_ID",
+                "SALE_ID",
+                "ORDER_NUMBER",
+            ],
             "typical_attributes": ["DATE", "AMOUNT", "STATUS", "QUANTITY", "TOTAL"],
-            "role": "fact"
+            "role": "fact",
         },
         "date": {
             "table_patterns": ["DATE", "TIME", "CALENDAR", "DIM_DATE"],
             "pk_patterns": ["DATE_ID", "DATE_KEY", "TIME_ID"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "YEAR",
+                "MONTH",
+                "DAY",
+                "QUARTER",
+                "WEEK",
+                "WEEKDAY",
+            ],
+            "role": "dimension",
         },
         "location": {
-            "table_patterns": [
+            "table_patterns": [
+                "LOCATION",
+                "GEOGRAPHY",
+                "ADDRESS",
+                "REGION",
+                "TERRITORY",
+            ],
             "pk_patterns": ["LOCATION_ID", "GEO_ID", "ADDRESS_ID", "REGION_ID"],
-            "typical_attributes": [
-
+            "typical_attributes": [
+                "COUNTRY",
+                "STATE",
+                "CITY",
+                "ZIP",
+                "LATITUDE",
+                "LONGITUDE",
+            ],
+            "role": "dimension",
         },
         "employee": {
             "table_patterns": ["EMPLOYEE", "STAFF", "WORKER", "PERSONNEL"],
             "pk_patterns": ["EMPLOYEE_ID", "STAFF_ID", "EMP_ID"],
-            "typical_attributes": [
-
-
+            "typical_attributes": [
+                "NAME",
+                "DEPARTMENT",
+                "TITLE",
+                "MANAGER",
+                "HIRE_DATE",
+            ],
+            "role": "dimension",
+        },
     },
-
     # Common relationship patterns in data warehouses
     "relationship_patterns": {
         "star_schema": {
             "pattern": "fact_to_dimension",
             "confidence_boost": 0.2,
-            "description": "Standard star schema fact-to-dimension relationship"
+            "description": "Standard star schema fact-to-dimension relationship",
         },
         "snowflake_schema": {
             "pattern": "dimension_hierarchy",
             "confidence_boost": 0.15,
-            "description": "Snowflake schema dimension hierarchy"
+            "description": "Snowflake schema dimension hierarchy",
         },
         "bridge_table": {
             "pattern": "many_to_many_via_bridge",
             "confidence_boost": 0.1,
-            "description": "Many-to-many relationship through bridge table"
+            "description": "Many-to-many relationship through bridge table",
         },
         "time_dimension": {
             "pattern": "temporal_relationship",
             "confidence_boost": 0.25,
-            "description": "Time-based relationship (very common in warehouses)"
-        }
+            "description": "Time-based relationship (very common in warehouses)",
+        },
     },
-
     # Known FK patterns that often appear in real data warehouses
     "common_fk_patterns": {
         "customer_references": [
-            "CUSTOMER_ID",
+            "CUSTOMER_ID",
+            "CUST_ID",
+            "CLIENT_ID",
+            "ACCOUNT_ID",
+            "USER_ID",
         ],
         "product_references": [
-            "PRODUCT_ID",
+            "PRODUCT_ID",
+            "ITEM_ID",
+            "SKU",
+            "PROD_ID",
+            "CATALOG_ID",
         ],
         "date_references": [
-            "DATE_ID",
-            "
+            "DATE_ID",
+            "ORDER_DATE_ID",
+            "SHIP_DATE_ID",
+            "CREATE_DATE_ID",
+            "TRANSACTION_DATE_ID",
+            "DATE_KEY",
        ],
        "location_references": [
-            "LOCATION_ID",
-            "
-
+            "LOCATION_ID",
+            "ADDRESS_ID",
+            "SHIP_TO_ID",
+            "BILL_TO_ID",
+            "WAREHOUSE_ID",
+            "STORE_ID",
+        ],
    },
-
    # Table naming conventions that indicate specific patterns
    "naming_conventions": {
        "fact_indicators": [
-            "FACT_",
-            "
+            "FACT_",
+            "FCT_",
+            "F_",
+            "SALES_",
+            "ORDERS_",
+            "TRANSACTIONS_",
+            "REVENUE_",
+            "METRICS_",
+            "EVENTS_",
+            "ACTIVITY_",
        ],
        "dimension_indicators": [
-            "DIM_",
+            "DIM_",
+            "D_",
+            "REF_",
+            "LKP_",
+            "LOOKUP_",
+            "MASTER_",
        ],
        "bridge_indicators": [
-            "BRG_",
+            "BRG_",
+            "BRIDGE_",
+            "XREF_",
+            "MAP_",
+            "ASSOC_",
+            "LINK_",
        ],
        "staging_indicators": [
-            "STG_",
-
-
+            "STG_",
+            "STAGING_",
+            "TMP_",
+            "TEMP_",
+            "RAW_",
+            "LANDING_",
+        ],
+    },
    }
 
 
@@ -1204,18 +1437,26 @@ def _apply_domain_knowledge(
         if entity_pair in common_pairs:
             boost = common_pairs[entity_pair]
             confidence_boost += boost
-            enhancement_factors.append(f"Recognized common business pattern: {entity_pair} (+{boost:.2f})")
+            enhancement_factors.append(
+                f"Recognized common business pattern: {entity_pair} (+{boost:.2f})"
+            )
         elif f"{right_entity}-{left_entity}" in common_pairs:
             boost = common_pairs[f"{right_entity}-{left_entity}"]
             confidence_boost += boost
-            enhancement_factors.append(
+            enhancement_factors.append(
+                f"Recognized common business pattern: {right_entity}-{left_entity} (+{boost:.2f})"
+            )
 
     # Factor 2: Check for standard FK naming patterns
     for left_col, right_col in column_pairs:
-        fk_pattern_match = _check_standard_fk_patterns(left_col, right_col, domain_patterns)
+        fk_pattern_match = _check_standard_fk_patterns(
+            left_col, right_col, domain_patterns
+        )
         if fk_pattern_match:
             confidence_boost += 0.15
-            enhancement_factors.append(f"Standard FK pattern detected: {fk_pattern_match}")
+            enhancement_factors.append(
+                f"Standard FK pattern detected: {fk_pattern_match}"
+            )
 
     # Factor 3: Table naming convention analysis
     left_convention = _identify_naming_convention(left_table, domain_patterns)
@@ -1223,8 +1464,9 @@ def _apply_domain_knowledge(
 
     if left_convention and right_convention:
         # Boost confidence for expected patterns
-        if (left_convention == "fact" and right_convention == "dimension") or
-
+        if (left_convention == "fact" and right_convention == "dimension") or (
+            left_convention == "dimension" and right_convention == "fact"
+        ):
             confidence_boost += 0.2
             enhancement_factors.append("Standard fact-dimension naming pattern (+0.20)")
         elif left_convention == "dimension" and right_convention == "dimension":
@@ -1237,12 +1479,20 @@ def _apply_domain_knowledge(
         enhancement_factors.append("Time dimension relationship (very common) (+0.20)")
 
     # Factor 5: Schema pattern recognition (star vs snowflake)
-    schema_pattern = _detect_schema_pattern(left_table, right_table, left_meta, right_meta, domain_patterns)
+    schema_pattern = _detect_schema_pattern(
+        left_table, right_table, left_meta, right_meta, domain_patterns
+    )
     if schema_pattern:
-        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern]["confidence_boost"]
+        pattern_boost = domain_patterns["relationship_patterns"][schema_pattern][
+            "confidence_boost"
+        ]
         confidence_boost += pattern_boost
-        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
-
+        pattern_desc = domain_patterns["relationship_patterns"][schema_pattern][
+            "description"
+        ]
+        enhancement_factors.append(
+            f"Schema pattern: {pattern_desc} (+{pattern_boost:.2f})"
+        )
 
     # Apply the boost but cap the final confidence at 1.0
     enhanced_confidence = min(current_confidence + confidence_boost, 1.0)
@@ -1259,7 +1509,9 @@ def _apply_domain_knowledge(
     }
 
 
-def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _identify_business_entity(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify what business entity a table represents."""
     table_upper = table_name.upper()
     business_entities = domain_patterns["business_entities"]
@@ -1274,13 +1526,18 @@ def _identify_business_entity(table_name: str, table_meta: Dict[str, Any], domai
         pk_candidates = table_meta.get("pk_candidates", {})
         for pk_pattern in entity_info["pk_patterns"]:
             for pk_norm in pk_candidates.keys():
-                if
+                if (
+                    pk_pattern.replace("_", "").upper()
+                    in pk_norm.replace("_", "").upper()
+                ):
                     return entity_type
 
     return None
 
 
-def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _check_standard_fk_patterns(
+    left_col: str, right_col: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Check if column pair matches standard FK patterns."""
     common_fks = domain_patterns["common_fk_patterns"]
 
@@ -1295,7 +1552,9 @@ def _check_standard_fk_patterns(left_col: str, right_col: str, domain_patterns:
     return None
 
 
-def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]) -> Optional[str]:
+def _identify_naming_convention(
+    table_name: str, domain_patterns: Dict[str, Any]
+) -> Optional[str]:
     """Identify the naming convention used for a table."""
     table_upper = table_name.upper()
     naming_conventions = domain_patterns["naming_conventions"]
@@ -1308,7 +1567,9 @@ def _identify_naming_convention(table_name: str, domain_patterns: Dict[str, Any]
     return None
 
 
-def _is_time_dimension_pattern(table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]) -> bool:
+def _is_time_dimension_pattern(
+    table_name: str, table_meta: Dict[str, Any], domain_patterns: Dict[str, Any]
+) -> bool:
     """Check if table follows time dimension patterns."""
     table_upper = table_name.upper()
     time_patterns = domain_patterns["business_entities"]["date"]["table_patterns"]
@@ -1344,15 +1605,16 @@ def _detect_schema_pattern(
     right_table: str,
     left_meta: Dict[str, Any],
     right_meta: Dict[str, Any],
-    domain_patterns: Dict[str, Any]
+    domain_patterns: Dict[str, Any],
 ) -> Optional[str]:
     """Detect common schema patterns (star, snowflake, etc.)."""
     left_role = _detect_table_role(left_table, left_meta)
     right_role = _detect_table_role(right_table, right_meta)
 
     # Star schema pattern: fact table to dimension
-    if (left_role == "fact" and right_role == "dimension") or
-
+    if (left_role == "fact" and right_role == "dimension") or (
+        left_role == "dimension" and right_role == "fact"
+    ):
         return "star_schema"
 
     # Snowflake schema pattern: dimension to dimension
@@ -1360,8 +1622,9 @@ def _detect_schema_pattern(
         return "snowflake_schema"
 
     # Time dimension pattern (very common)
-    if _is_time_dimension_pattern(
-
+    if _is_time_dimension_pattern(
+        right_table, right_meta, domain_patterns
+    ) or _is_time_dimension_pattern(left_table, left_meta, domain_patterns):
         return "time_dimension"
 
     # Bridge table pattern
@@ -1397,7 +1660,9 @@ def _calculate_adaptive_thresholds(
     # Calculate sample statistics
     sample_sizes = [len(vals) for vals in values_list if vals]
     max_sample_size = max(sample_sizes) if sample_sizes else base_sample_size
-    avg_sample_size = sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
+    avg_sample_size = (
+        sum(sample_sizes) / len(sample_sizes) if sample_sizes else base_sample_size
+    )
 
     # Calculate data distribution characteristics
     total_unique_values = 0
@@ -1425,7 +1690,7 @@ def _calculate_adaptive_thresholds(
         if len(value_counts) > 1:
             max_freq = max(value_counts.values())
             min_freq = min(value_counts.values())
-            skew = max_freq / min_freq if min_freq > 0 else float(
+            skew = max_freq / min_freq if min_freq > 0 else float("inf")
             skew_ratios.append(skew)
 
     # Calculate overall uniqueness ratio
@@ -1459,7 +1724,9 @@ def _calculate_adaptive_thresholds(
         min_size_adj *= 1.1
 
     # Scale with base sample size from configuration
-    size_scale_factor = min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
+    size_scale_factor = (
+        min(max_sample_size / base_sample_size, 3.0) if base_sample_size > 0 else 1.0
+    )
     min_size_adj *= size_scale_factor
 
     thresholds["min_sample_size"] = max(int(base_min_size * min_size_adj), 10)
@@ -1594,8 +1861,12 @@ def _infer_cardinality(
     left_non_null = [v for v in left_values if not _is_nullish(v)]
     right_non_null = [v for v in right_values if not _is_nullish(v)]
 
-    left_unique_ratio = len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
-    right_unique_ratio = len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
+    left_unique_ratio = (
+        len(set(left_non_null)) / len(left_non_null) if left_non_null else 0
+    )
+    right_unique_ratio = (
+        len(set(right_non_null)) / len(right_non_null) if right_non_null else 0
+    )
 
     # Apply adaptive uniqueness threshold
     left_is_unique = left_unique_ratio > uniqueness_threshold
@@ -1691,11 +1962,19 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     Returns:
         str: Table role ('fact', 'dimension', 'bridge', 'staging', 'unknown')
     """
-    upper_name = table_name.upper()
     tokens = _identifier_tokens(table_name)
 
     # Rule 1: Explicit prefixes/suffixes
-    fact_indicators = {
+    fact_indicators = {
+        "FACT",
+        "FCT",
+        "TXN",
+        "TRANSACTION",
+        "EVENT",
+        "LOG",
+        "SALES",
+        "ORDER",
+    }
     dim_indicators = {"DIM", "DIMENSION", "LOOKUP", "REF", "REFERENCE", "MASTER"}
     bridge_indicators = {"BRIDGE", "BRG", "LINK", "JUNCTION", "ASSOC", "ASSOCIATION"}
     staging_indicators = {"STG", "STAGING", "TMP", "TEMP", "WORK", "LANDING", "RAW"}
@@ -1734,9 +2013,22 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
             id_count += 1
 
         # Count measure-like columns (amounts, counts, quantities)
-        if any(word in col_name for word in ["AMOUNT", "QTY", "QUANTITY", "COUNT", "TOTAL", "SUM", "AVG"]):
+        if any(
+            word in col_name
+            for word in [
+                "AMOUNT",
+                "QTY",
+                "QUANTITY",
+                "COUNT",
+                "TOTAL",
+                "SUM",
+                "AVG",
+            ]
+        ):
             measure_like_count += 1
-        elif base_type in MEASURE_DATATYPES and not col_info.get("is_identifier", False):
+        elif base_type in MEASURE_DATATYPES and not col_info.get(
+            "is_identifier", False
+        ):
             measure_like_count += 1
         else:
             dimension_like_count += 1
@@ -1761,7 +2053,9 @@ def _detect_table_role(table_name: str, columns_info: Dict[str, Any]) -> str:
     return "unknown"
 
 
-def _get_business_relationship_context(left_table: str, right_table: str, left_role: str, right_role: str) -> str:
+def _get_business_relationship_context(
+    left_table: str, right_table: str, left_role: str, right_role: str
+) -> str:
     """
     Determine business relationship context between tables based on their roles.
 
@@ -1833,7 +2127,7 @@ def _infer_join_type(
     4. Naming pattern heuristics
     5. Conservative INNER JOIN default
     """
-
+
     # RULE 1: Default to INNER JOIN (most common and safest)
     default_join = semantic_model_pb2.JoinType.inner
 
@@ -1861,9 +2155,17 @@ def _infer_join_type(
     # Apply business rules based on relationship context
     if relationship_context == "fact_to_dimension":
         # Fact → Dimension: usually INNER, but check for optional dimensions
-        if any(
-
-
+        if any(
+            keyword in right_table.upper()
+            for keyword in [
+                "PROMO",
+                "PROMOTION",
+                "DISCOUNT",
+                "COUPON",
+                "OPTIONAL",
+                "SECONDARY",
+            ]
+        ):
             logger.debug(
                 f"Join type inference for {left_table} -> {right_table}: "
                 f"LEFT_OUTER (fact to optional dimension: {right_role})"
@@ -1907,11 +2209,19 @@ def _infer_join_type(
         return semantic_model_pb2.JoinType.left_outer
 
     # RULE 5: Naming pattern heuristics for optional relationships
-    left_upper = left_table.upper()
     right_upper = right_table.upper()
     optional_keywords = {
-        "OPTIONAL",
-        "
+        "OPTIONAL",
+        "ALTERNATE",
+        "SECONDARY",
+        "BACKUP",
+        "FALLBACK",
+        "PROMO",
+        "PROMOTION",
+        "DISCOUNT",
+        "COUPON",
+        "TEMP",
+        "TMP",
    }
 
    for keyword in optional_keywords:
@@ -1946,12 +2256,17 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
     """
     fk_upper = fk_column.strip().upper()
     pk_table_variants = _table_variants(pk_table)
-
+
     # Pattern 1: {table_name}_id or {table_name}_key
     for variant in pk_table_variants:
-        if fk_upper in {f"{variant}_ID", f"{variant}ID", f"{variant}_KEY", f"{variant}KEY"}:
+        if fk_upper in {
+            f"{variant}_ID",
+            f"{variant}ID",
+            f"{variant}_KEY",
+            f"{variant}KEY",
+        }:
             return True
-
+
     # Pattern 2: Column ends with table name variants
     tokens = _identifier_tokens(fk_column)
     if len(tokens) >= 2:
@@ -1961,21 +2276,23 @@ def _looks_like_foreign_key(fk_table: str, pk_table: str, fk_column: str) -> boo
         tail = tokens[-1]
         if tail in {"ID", "KEY"}:
             return True
-
+
     # Pattern 3: Similar to primary key column but with FK table prefix
     # e.g., order_id in order_items table referencing orders.id
     fk_table_variants = _table_variants(fk_table)
     for fk_variant in fk_table_variants:
         if fk_upper.startswith(fk_variant):
-            remainder = fk_upper[len(fk_variant):].lstrip("_")
+            remainder = fk_upper[len(fk_variant) :].lstrip("_")
             for pk_variant in pk_table_variants:
                 if remainder.startswith(pk_variant):
                     return True
-
+
     return False
 
 
-def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.NamedFilter]:
+def _suggest_filters(
+    raw_table: data_types.Table,
+) -> List[semantic_model_pb2.NamedFilter]:
     suggestions: List[semantic_model_pb2.NamedFilter] = []
     for col in raw_table.columns:
         base_type = _base_type_from_type(col.column_type)
@@ -2011,12 +2328,20 @@ def _suggest_filters(raw_table: data_types.Table) -> List[semantic_model_pb2.Nam
         )
         is_textual = base_type in {"STRING", "TEXT", "VARCHAR", "CHAR", "CHARACTER"}
         is_boolean = base_type in {"BOOLEAN"}
-        is_categorical_numeric = base_type in {
-
-
-
-
-
+        is_categorical_numeric = base_type in {
+            "INT",
+            "INTEGER",
+            "NUMBER",
+            "SMALLINT",
+            "BIGINT",
+        } and any(upper_name.endswith(suffix) for suffix in categorical_suffixes)
+
+        if not is_identifier_like and (
+            is_textual or is_boolean or is_categorical_numeric
+        ):
+            formatted = [
+                _format_literal(val, base_type) for val in distinct_values[:5]
+            ]
             expr = f"{col.column_name} IN ({', '.join(formatted)})"
             suggestions.append(
                 semantic_model_pb2.NamedFilter(
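The reshaped condition still yields the same kind of suggestion: a non-identifier column that is textual, boolean, or a categorical-looking small integer gets an `IN` filter built from up to five sampled distinct values. A sketch with hypothetical inputs (`_format_literal` is assumed to quote string literals):

```python
# Hypothetical column: STATUS with three sampled distinct values.
column_name = "STATUS"
distinct_values = ["ACTIVE", "INACTIVE", "PENDING"]
formatted = ["'" + v + "'" for v in distinct_values[:5]]
expr = f"{column_name} IN ({', '.join(formatted)})"
print(expr)  # STATUS IN ('ACTIVE', 'INACTIVE', 'PENDING')
```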
@@ -2034,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
+
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
 
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
+
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
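A sketch of how a caller might drive the new keyword arguments; the call site is hypothetical, `raw_tables` stands for the usual `(FQNParts, Table)` pairs, and the semantics are read off the diff (the `status` dict is populated with `limited_by_*` flags when inference is cut short):

```python
status: Dict[str, bool] = {}
relationships = _infer_relationships(
    raw_tables,
    strict_join_inference=False,
    status=status,            # filled with limited_by_* flags
    max_relationships=50,     # stop after 50 candidate table pairs
    min_confidence=0.2,       # clamped into [0.0, 1.0] internally
    timeout_seconds=30.0,     # wall-clock budget via time.perf_counter()
)
if status["limited_by_timeout"] or status["limited_by_max_relationships"]:
    logger.warning("Relationship discovery was truncated; results may be partial.")
```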
@@ -2060,7 +2405,9 @@ def _infer_relationships(
|
|
2060
2405
|
table_prefixes = global_prefixes | _table_prefixes(raw_table.name)
|
2061
2406
|
for column in raw_table.columns:
|
2062
2407
|
base_type = _base_type_from_type(column.column_type)
|
2063
|
-
normalized = _sanitize_identifier_name(
|
2408
|
+
normalized = _sanitize_identifier_name(
|
2409
|
+
column.column_name, prefixes_to_drop=table_prefixes
|
2410
|
+
)
|
2064
2411
|
entry = columns_meta.setdefault(
|
2065
2412
|
normalized,
|
2066
2413
|
{
|
@@ -2075,7 +2422,9 @@ def _infer_relationships(
|
|
2075
2422
|
entry["names"].append(column.column_name)
|
2076
2423
|
if column.values:
|
2077
2424
|
entry["values"].extend(column.values)
|
2078
|
-
entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
|
2425
|
+
entry["is_identifier"] = entry["is_identifier"] or _is_identifier_like(
|
2426
|
+
column.column_name, base_type
|
2427
|
+
)
|
2079
2428
|
is_primary = getattr(column, "is_primary_key", False)
|
2080
2429
|
if is_primary:
|
2081
2430
|
entry["is_primary"] = True
|
@@ -2093,15 +2442,42 @@ def _infer_relationships(
|
|
2093
2442
|
pairs: dict[tuple[str, str], List[tuple[str, str]]] = {}
|
2094
2443
|
null_check_cache: Dict[Tuple[str, str, str, str], bool] = {}
|
2095
2444
|
|
2096
|
-
def _record_pair(
|
2445
|
+
def _record_pair(
|
2446
|
+
left_table: str, right_table: str, left_col: str, right_col: str
|
2447
|
+
) -> None:
|
2448
|
+
nonlocal limit_reached
|
2449
|
+
if limit_reached:
|
2450
|
+
return
|
2451
|
+
if _timed_out():
|
2452
|
+
status_dict["limited_by_timeout"] = True
|
2453
|
+
limit_reached = True
|
2454
|
+
return
|
2455
|
+
|
2097
2456
|
key = (left_table, right_table)
|
2098
2457
|
value = (left_col, right_col)
|
2099
|
-
|
2100
|
-
|
2458
|
+
bucket = pairs.setdefault(key, [])
|
2459
|
+
if value not in bucket:
|
2460
|
+
bucket.append(value)
|
2461
|
+
if (
|
2462
|
+
max_relationships is not None
|
2463
|
+
and len(pairs) >= max_relationships
|
2464
|
+
):
|
2465
|
+
status_dict["limited_by_max_relationships"] = True
|
2466
|
+
limit_reached = True
|
2101
2467
|
|
2102
2468
|
table_names = list(metadata.keys())
|
2103
2469
|
for i in range(len(table_names)):
|
2470
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2471
|
+
break
|
2472
|
+
if _timed_out():
|
2473
|
+
status_dict["limited_by_timeout"] = True
|
2474
|
+
break
|
2104
2475
|
for j in range(i + 1, len(table_names)):
|
2476
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2477
|
+
break
|
2478
|
+
if _timed_out():
|
2479
|
+
status_dict["limited_by_timeout"] = True
|
2480
|
+
break
|
2105
2481
|
table_a_name = table_names[i]
|
2106
2482
|
table_b_name = table_names[j]
|
2107
2483
|
table_a = metadata[table_a_name]
|
@@ -2158,7 +2534,7 @@ def _infer_relationships(
|
|
2158
2534
|
continue
|
2159
2535
|
if norm_b == pk_norm:
|
2160
2536
|
continue
|
2161
|
-
|
2537
|
+
|
2162
2538
|
# Direct suffix match
|
2163
2539
|
if norm_b.endswith(pk_norm):
|
2164
2540
|
_record_pair(
|
@@ -2168,23 +2544,34 @@ def _infer_relationships(
|
|
2168
2544
|
pk_cols[0],
|
2169
2545
|
)
|
2170
2546
|
continue
|
2171
|
-
|
2547
|
+
|
2172
2548
|
# Enhanced: Check if column looks like a foreign key to this table
|
2173
|
-
if _looks_like_foreign_key(
|
2549
|
+
if _looks_like_foreign_key(
|
2550
|
+
table_b_name, table_a_name, meta_b["names"][0]
|
2551
|
+
):
|
2174
2552
|
# Additional check: name similarity with adaptive threshold
|
2175
2553
|
similarity = _name_similarity(norm_b, pk_norm)
|
2176
2554
|
# Calculate adaptive threshold for this relationship
|
2177
2555
|
all_sample_values = []
|
2178
|
-
for col_values in [
|
2556
|
+
for col_values in [
|
2557
|
+
pk_meta.get("values", []),
|
2558
|
+
meta_b.get("values", []),
|
2559
|
+
]:
|
2179
2560
|
if col_values:
|
2180
2561
|
all_sample_values.append(col_values)
|
2181
2562
|
|
2182
2563
|
adaptive_thresholds = _calculate_adaptive_thresholds(
|
2183
2564
|
all_sample_values,
|
2184
2565
|
table_count=len(raw_tables),
|
2185
|
-
base_sample_size=
|
2566
|
+
base_sample_size=(
|
2567
|
+
len(pk_meta.get("values", []))
|
2568
|
+
if pk_meta.get("values")
|
2569
|
+
else 10
|
2570
|
+
),
|
2571
|
+
)
|
2572
|
+
similarity_threshold = adaptive_thresholds.get(
|
2573
|
+
"similarity_threshold", 0.6
|
2186
2574
|
)
|
2187
|
-
similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
|
2188
2575
|
|
2189
2576
|
if similarity >= similarity_threshold:
|
2190
2577
|
_record_pair(
|
@@ -2204,7 +2591,7 @@ def _infer_relationships(
|
|
2204
2591
|
continue
|
2205
2592
|
if norm_a == pk_norm:
|
2206
2593
|
continue
|
2207
|
-
|
2594
|
+
|
2208
2595
|
# Direct suffix match
|
2209
2596
|
if norm_a.endswith(pk_norm):
|
2210
2597
|
_record_pair(
|
@@ -2214,23 +2601,34 @@ def _infer_relationships(
|
|
2214
2601
|
pk_cols[0],
|
2215
2602
|
)
|
2216
2603
|
continue
|
2217
|
-
|
2604
|
+
|
2218
2605
|
# Enhanced: Check if column looks like a foreign key to this table
|
2219
|
-
if _looks_like_foreign_key(
|
2606
|
+
if _looks_like_foreign_key(
|
2607
|
+
table_a_name, table_b_name, meta_a["names"][0]
|
2608
|
+
):
|
2220
2609
|
# Additional check: name similarity with adaptive threshold
|
2221
2610
|
similarity = _name_similarity(norm_a, pk_norm)
|
2222
2611
|
# Calculate adaptive threshold for this relationship
|
2223
2612
|
all_sample_values = []
|
2224
|
-
for col_values in [
|
2613
|
+
for col_values in [
|
2614
|
+
pk_meta.get("values", []),
|
2615
|
+
meta_a.get("values", []),
|
2616
|
+
]:
|
2225
2617
|
if col_values:
|
2226
2618
|
all_sample_values.append(col_values)
|
2227
2619
|
|
2228
2620
|
adaptive_thresholds = _calculate_adaptive_thresholds(
|
2229
2621
|
all_sample_values,
|
2230
2622
|
table_count=len(raw_tables),
|
2231
|
-
base_sample_size=
|
2623
|
+
base_sample_size=(
|
2624
|
+
len(pk_meta.get("values", []))
|
2625
|
+
if pk_meta.get("values")
|
2626
|
+
else 10
|
2627
|
+
),
|
2628
|
+
)
|
2629
|
+
similarity_threshold = adaptive_thresholds.get(
|
2630
|
+
"similarity_threshold", 0.6
|
2232
2631
|
)
|
2233
|
-
similarity_threshold = adaptive_thresholds.get("similarity_threshold", 0.6)
|
2234
2632
|
|
2235
2633
|
if similarity >= similarity_threshold:
|
2236
2634
|
_record_pair(
|
@@ -2255,10 +2653,19 @@ def _infer_relationships(
|
|
2255
2653
|
|
2256
2654
|
# Build relationships with inferred cardinality
|
2257
2655
|
for (left_table, right_table), column_pairs in pairs.items():
|
2656
|
+
if _timed_out():
|
2657
|
+
status_dict["limited_by_timeout"] = True
|
2658
|
+
break
|
2659
|
+
if (
|
2660
|
+
max_relationships is not None
|
2661
|
+
and len(relationships) >= max_relationships
|
2662
|
+
):
|
2663
|
+
status_dict["limited_by_max_relationships"] = True
|
2664
|
+
break
|
2258
2665
|
# Infer cardinality based on available metadata
|
2259
2666
|
left_meta = metadata[left_table]
|
2260
2667
|
right_meta = metadata[right_table]
|
2261
|
-
|
2668
|
+
|
2262
2669
|
# Determine if tables have primary keys in the relationship
|
2263
2670
|
left_has_pk = any(
|
2264
2671
|
col_name in [pair[0] for pair in column_pairs]
|
@@ -2270,7 +2677,7 @@ def _infer_relationships(
|
|
2270
2677
|
for pk_list in right_meta["pk_candidates"].values()
|
2271
2678
|
for col_name in pk_list
|
2272
2679
|
)
|
2273
|
-
|
2680
|
+
|
2274
2681
|
# Enhanced: Get sample values for all columns in the relationship (for composite key analysis)
|
2275
2682
|
left_values_all = []
|
2276
2683
|
right_values_all = []
|
@@ -2279,12 +2686,11 @@ def _infer_relationships(
 
         for left_col, right_col in column_pairs:
             left_col_key = _sanitize_identifier_name(
-                left_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
+                left_col, prefixes_to_drop=global_prefixes | _table_prefixes(left_table)
             )
             right_col_key = _sanitize_identifier_name(
                 right_col,
-                prefixes_to_drop=global_prefixes | _table_prefixes(right_table)
+                prefixes_to_drop=global_prefixes | _table_prefixes(right_table),
             )
 
             left_col_values = []
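Both lookups key column metadata by _sanitize_identifier_name, which, per the surrounding code, normalizes the identifier and strips global and per-table prefixes so that, say, ORD_CUSTOMER_ID on one table and customer_id on another land on the same key. The helper's body is not part of this diff; a rough, hypothetical sketch of such a normalization:

    # Illustrative normalization only; the package's actual rules may differ.
    from typing import Set

    def sanitize_identifier_name(name: str, prefixes_to_drop: Set[str]) -> str:
        key = name.strip().strip('"').lower()
        # Drop the longest matching prefix, if any ("ord_" from "ord_customer_id").
        for prefix in sorted(prefixes_to_drop, key=len, reverse=True):
            if key.startswith(prefix + "_"):
                key = key[len(prefix) + 1 :]
                break
        return key

    assert sanitize_identifier_name("ORD_CUSTOMER_ID", {"ord"}) == "customer_id"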
@@ -2293,7 +2699,9 @@ def _infer_relationships(
             if left_col_key in left_meta["columns"]:
                 left_col_values = left_meta["columns"][left_col_key].get("values") or []
             if right_col_key in right_meta["columns"]:
-                right_col_values =
+                right_col_values = (
+                    right_meta["columns"][right_col_key].get("values") or []
+                )
 
             left_values_all.append(left_col_values)
             right_values_all.append(right_col_values)
@@ -2322,7 +2730,7 @@ def _infer_relationships(
             right_has_pk,
             adaptive_thresholds=global_adaptive_thresholds,
         )
-
+
         # Determine if SQL null probe should be executed for stricter inference
         strict_fk_detected = False
         if strict_join_inference and session:
@@ -2352,7 +2760,7 @@ def _infer_relationships(
                 left_table_meta=left_meta,
                 right_table_meta=right_meta,
             )
-
+
         # Calculate confidence and reasoning for this relationship
         confidence_analysis = _calculate_relationship_confidence(
             left_table=left_table,
@@ -2376,45 +2784,54 @@ def _infer_relationships(
             column_pairs=column_pairs,
             left_meta=left_meta,
             right_meta=right_meta,
-            current_confidence=confidence_analysis[
+            current_confidence=confidence_analysis["confidence_score"],
         )
 
         # Update confidence analysis with domain knowledge
-        if domain_enhancement[
-            confidence_analysis[
-
+        if domain_enhancement["confidence_boost"] > 0:
+            confidence_analysis["confidence_score"] = min(
+                1.0,
+                confidence_analysis["confidence_score"]
+                + domain_enhancement["confidence_boost"],
+            )
 
         # Add domain knowledge factors to reasoning
-        for domain_factor in domain_enhancement[
-            confidence_analysis[
+        for domain_factor in domain_enhancement["domain_factors"]:
+            confidence_analysis["reasoning_factors"].append(
+                f"Domain knowledge: {domain_factor}"
+            )
 
         # Update confidence level based on new score
-        if confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
-        elif confidence_analysis[
-            confidence_analysis[
-            confidence_analysis[
+        if confidence_analysis["confidence_score"] >= 0.8:
+            confidence_analysis["confidence_level"] = "very_high"
+            confidence_analysis["confidence_description"] = "Very High Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.6:
+            confidence_analysis["confidence_level"] = "high"
+            confidence_analysis["confidence_description"] = "High Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.4:
+            confidence_analysis["confidence_level"] = "medium"
+            confidence_analysis["confidence_description"] = "Medium Confidence"
+        elif confidence_analysis["confidence_score"] >= 0.2:
+            confidence_analysis["confidence_level"] = "low"
+            confidence_analysis["confidence_description"] = "Low Confidence"
         else:
-            confidence_analysis[
-            confidence_analysis[
+            confidence_analysis["confidence_level"] = "very_low"
+            confidence_analysis["confidence_description"] = "Very Low Confidence"
 
         # Enhanced logging with confidence and reasoning
         sample_info = f"samples: L={len(left_values)}, R={len(right_values)}"
         pk_info = f"PKs: L={left_has_pk}, R={right_has_pk}"
-        join_type_name =
+        join_type_name = (
+            "INNER" if join_type == semantic_model_pb2.JoinType.inner else "LEFT_OUTER"
+        )
         confidence_info = f"confidence: {confidence_analysis['confidence_score']:.2f} ({confidence_analysis['confidence_level']})"
 
         # Add domain knowledge info if applied
         domain_info = ""
-        if domain_enhancement[
-            domain_info =
+        if domain_enhancement["confidence_boost"] > 0:
+            domain_info = (
+                f", domain boost: +{domain_enhancement['confidence_boost']:.2f}"
+            )
 
         logger.info(
             f"Relationship inference for {left_table} -> {right_table}: "
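Most of this hunk is the previously intraline-truncated logic re-rendered by a formatter, but it makes the scoring scheme explicit: a domain-knowledge boost is added to the confidence score (clamped at 1.0), and the score is then bucketed into five levels at the 0.8/0.6/0.4/0.2 boundaries. The thresholds below are copied from the hunk; the standalone helpers themselves are a sketch, not functions in the package:

    # Bucketing with the exact thresholds from the diff; names are illustrative.
    from typing import Tuple

    def apply_domain_boost(score: float, boost: float) -> float:
        # Mirrors the hunk: boosts are additive but the score is capped at 1.0.
        return min(1.0, score + boost) if boost > 0 else score

    def bucket_confidence(score: float) -> Tuple[str, str]:
        if score >= 0.8:
            return "very_high", "Very High Confidence"
        elif score >= 0.6:
            return "high", "High Confidence"
        elif score >= 0.4:
            return "medium", "Medium Confidence"
        elif score >= 0.2:
            return "low", "Low Confidence"
        return "very_low", "Very Low Confidence"

    print(bucket_confidence(apply_domain_boost(0.55, 0.10)))  # ('high', 'High Confidence')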
@@ -2423,22 +2840,40 @@ def _infer_relationships(
         )
 
         # Log domain knowledge patterns if detected
-        domain_factors = [
+        domain_factors = [
+            f
+            for f in confidence_analysis["reasoning_factors"]
+            if f.startswith("Domain knowledge:")
+        ]
         if domain_factors:
-            logger.debug(
+            logger.debug(
+                f"Domain patterns detected for {left_table} -> {right_table}: {domain_factors}"
+            )
 
         # Log detailed reasoning for medium or lower confidence relationships
-        if confidence_analysis[
+        if confidence_analysis["confidence_score"] < 0.6:
             logger.debug(f"Confidence reasoning for {left_table} -> {right_table}:")
-            for factor in confidence_analysis[
+            for factor in confidence_analysis["reasoning_factors"]:
                 logger.debug(f"  - {factor}")
 
         # Log very high confidence relationships with their evidence
-        elif confidence_analysis[
-            logger.debug(
-
+        elif confidence_analysis["confidence_score"] >= 0.8:
+            logger.debug(
+                f"High confidence relationship {left_table} -> {right_table} based on:"
+            )
+            for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
                 logger.debug(f"  + {factor}")
-
+
+        if confidence_analysis["confidence_score"] < min_confidence:
+            logger.debug(
+                "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+                left_table,
+                right_table,
+                confidence_analysis["confidence_score"],
+                min_confidence,
+            )
+            continue
+
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2449,7 +2884,7 @@ def _infer_relationships(
         else:
             # Default to many_to_one for backward compatibility
             rel_type = semantic_model_pb2.RelationshipType.many_to_one
-
+
         relationship = semantic_model_pb2.Relationship(
             name=f"{left_table}_to_{right_table}",
             left_table=left_table,
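Only the first and last arms of the cardinality mapping are visible across these two hunks: "1"/"1" yields one_to_one, and the else arm defaults to many_to_one for backward compatibility. Restated compactly, with the elided middle arms filled in as assumptions (they are not shown in the diff):

    # The "1"/"1" and default arms are from the diff; the middle arms are assumed.
    def relationship_type(left_card: str, right_card: str) -> str:
        if left_card == "1" and right_card == "1":
            return "one_to_one"
        if left_card == "1" and right_card == "N":  # assumed branch
            return "one_to_many"
        if left_card == "N" and right_card == "1":  # assumed branch
            return "many_to_one"
        # Default to many_to_one for backward compatibility (as in the diff).
        return "many_to_one"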
@@ -2466,15 +2901,30 @@ def _infer_relationships(
         relationships.append(relationship)
 
     # Phase 2: Detect many-to-many relationships through bridge table analysis
-    many_to_many_relationships =
-
-
+    many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+    if not status_dict["limited_by_timeout"] and (
+        max_relationships is None or len(relationships) < max_relationships
+    ):
+        many_to_many_relationships = _detect_many_to_many_relationships(
+            raw_tables, metadata, relationships
+        )
 
-
-
-
+    if many_to_many_relationships and max_relationships is not None:
+        remaining = max_relationships - len(relationships)
+        if remaining <= 0:
+            many_to_many_relationships = []
+        else:
+            many_to_many_relationships = many_to_many_relationships[:remaining]
 
-
+    if many_to_many_relationships:
+        relationships.extend(many_to_many_relationships)
+        logger.info(
+            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+        )
+
+    logger.info(
+        f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
+    )
     return relationships
 
 
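Phase 2 now runs only when neither the timeout nor the relationship cap has already fired, and its output is trimmed to whatever budget remains before being merged. _detect_many_to_many_relationships is internal to the module; conceptually, a bridge (junction) table is one that carries many-to-one links into at least two other tables. A toy illustration of that shape, under heavily simplified assumptions:

    # Toy bridge-table detection: any table with many-to-one links into two or
    # more distinct tables is treated as evidence of an M:N pair between them.
    from collections import defaultdict
    from typing import Dict, List, Tuple

    def detect_many_to_many(
        relationships: List[Tuple[str, str]],  # (child_table, parent_table)
    ) -> List[Tuple[str, str]]:
        parents: Dict[str, set] = defaultdict(set)
        for child, parent in relationships:
            parents[child].add(parent)
        found = []
        for bridge, linked in parents.items():
            if len(linked) >= 2:
                a, b = sorted(linked)[:2]
                found.append((a, b))  # the bridge table implies an M:N pair
        return found

    print(detect_many_to_many([("order_items", "orders"), ("order_items", "products")]))
    # [('orders', 'products')]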
@@ -2512,7 +2962,14 @@ def _raw_table_to_semantic_context_table(
         base_type = _base_type_from_type(col.column_type)
         if _is_time_like_column(col):
             time_data_type = col.column_type
-            if time_data_type.split("(")[0].upper() in {
+            if time_data_type.split("(")[0].upper() in {
+                "STRING",
+                "VARCHAR",
+                "TEXT",
+                "CHAR",
+                "CHARACTER",
+                "NVARCHAR",
+            }:
                 time_data_type = "TIMESTAMP_NTZ"
             time_dimension_name = _safe_semantic_identifier(
                 col.column_name,
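The reflowed set literal makes the coercion rule readable: when a time-like column is declared with a textual type, the semantic model records it as TIMESTAMP_NTZ instead, with the base type taken as everything before the first parenthesis. That extraction step in isolation (the function name here is illustrative):

    # Base-type extraction as used in the hunk: "VARCHAR(255)" -> "VARCHAR".
    STRINGY_TYPES = {"STRING", "VARCHAR", "TEXT", "CHAR", "CHARACTER", "NVARCHAR"}

    def coerce_time_type(column_type: str) -> str:
        base = column_type.split("(")[0].upper()
        return "TIMESTAMP_NTZ" if base in STRINGY_TYPES else column_type

    assert coerce_time_type("varchar(64)") == "TIMESTAMP_NTZ"
    assert coerce_time_type("DATE") == "DATE"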
@@ -2564,7 +3021,9 @@ def _raw_table_to_semantic_context_table(
                     data_type=col.column_type,
                     sample_values=col.values,
                     synonyms=[_PLACEHOLDER_COMMENT],
-                    description=
+                    description=(
+                        col.comment if col.comment else _PLACEHOLDER_COMMENT
+                    ),
                 )
             )
             continue
@@ -2685,7 +3144,9 @@ def raw_schema_to_semantic_context(
         unique_database_schema.append(fqn_databse_schema)
 
         logger.info(f"Pulling column information from {fqn_table}")
-        _notify(
+        _notify(
+            f"Fetching metadata for {fqn_table.database}.{fqn_table.schema_name}.{fqn_table.table}..."
+        )
         valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
             session=conn,
             workspace=fqn_table.database,
@@ -2751,7 +3212,9 @@ def raw_schema_to_semantic_context(
             semantic_model_name,
             actual_model,
         )
-        _notify(
+        _notify(
+            "Running DashScope enrichment to enhance descriptions and metrics..."
+        )
 
         # Create progress tracker for enrichment
         def enrichment_progress_callback(update):
@@ -2760,14 +3223,16 @@ def raw_schema_to_semantic_context(
                 EnrichmentStage.MODEL_DESCRIPTION: "Generating model description",
                 EnrichmentStage.MODEL_METRICS: "Generating model-level metrics",
                 EnrichmentStage.VERIFIED_QUERIES: "Generating verified queries",
-                EnrichmentStage.COMPLETE: "Enrichment complete"
+                EnrichmentStage.COMPLETE: "Enrichment complete",
             }
 
             base_message = stage_messages.get(update.stage, "Processing")
             if update.table_name:
                 message = f"{base_message} - {update.table_name} ({update.current_step}/{update.total_steps})"
             elif update.total_steps > 1:
-                message =
+                message = (
+                    f"{base_message} ({update.current_step}/{update.total_steps})"
+                )
             else:
                 message = base_message
 
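The only functional change in this hunk is the trailing comma after the COMPLETE entry; the rest is line wrapping. The message logic itself is simple enough to restate standalone, with a simplified dataclass standing in for the package's progress-update type:

    # Simplified stand-in for the enrichment progress update object.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Update:
        stage: str
        table_name: Optional[str]
        current_step: int
        total_steps: int

    STAGE_MESSAGES = {"tables": "Enriching table metadata", "complete": "Enrichment complete"}

    def format_progress(update: Update) -> str:
        base = STAGE_MESSAGES.get(update.stage, "Processing")
        if update.table_name:
            return f"{base} - {update.table_name} ({update.current_step}/{update.total_steps})"
        if update.total_steps > 1:
            return f"{base} ({update.current_step}/{update.total_steps})"
        return base

    print(format_progress(Update("tables", "ORDERS", 2, 5)))
    # Enriching table metadata - ORDERS (2/5)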
@@ -2801,7 +3266,9 @@ def raw_schema_to_semantic_context(
         )
         _notify("DashScope enrichment complete.")
     else:
-        logger.warning(
+        logger.warning(
+            "LLM enrichment was requested but DashScope is not configured; skipping enrichment."
+        )
         _notify("DashScope configuration missing; skipped enrichment.")
     return context
 
@@ -2938,6 +3405,7 @@ def generate_model_str_from_clickzetta(
     Returns:
         str: The raw string of the semantic context.
     """
+
     def _notify(message: str) -> None:
         if progress_callback:
             try:
@@ -2946,7 +3414,11 @@ def generate_model_str_from_clickzetta(
                 logger.debug("Progress callback failed for message: {}", message)
 
     table_list = ", ".join(base_tables)
-    logger.info(
+    logger.info(
+        "Generating semantic model '{}' from tables: {}",
+        semantic_model_name,
+        table_list,
+    )
     _notify("Collecting metadata from ClickZetta tables...")
 
     context = raw_schema_to_semantic_context(
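Two small patterns recur in these last hunks: _notify swallows exceptions from the caller-supplied progress callback so a faulty UI hook cannot abort generation, and logger.info("... {} ...", a, b) uses loguru's deferred {} formatting instead of an f-string. A combined sketch of both, assuming loguru is available as it is in the package:

    # Guarded progress callback plus loguru-style deferred formatting.
    from typing import Callable, Optional

    from loguru import logger

    def make_notify(progress_callback: Optional[Callable[[str], None]]):
        def _notify(message: str) -> None:
            if progress_callback:
                try:
                    progress_callback(message)
                except Exception:
                    # Never let a UI callback break model generation.
                    logger.debug("Progress callback failed for message: {}", message)

        return _notify

    notify = make_notify(print)
    logger.info("Generating semantic model '{}' from tables: {}", "demo", "orders, users")
    notify("Collecting metadata from ClickZetta tables...")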