clickzetta-semantic-model-generator 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/PKG-INFO +1 -1
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/pyproject.toml +1 -1
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/relationships/__init__.py +15 -0
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/relationships/discovery.py +207 -0
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/tests/relationship_discovery_test.py +111 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/README.md +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/clickzetta_utils/env_vars.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/clickzetta_utils/utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/data_processing/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/data_processing/cte_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/data_processing/data_types.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/data_processing/proto_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/generate_model.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/llm/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/llm/dashscope_client.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/llm/enrichment.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/llm/progress_tracker.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/output_models/.keep +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/protos/semantic_model.proto +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/clickzetta_connector_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/generate_model_classification_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/llm_enrichment_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/relationships_filters_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/validate_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/validate/context_length.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/validate/keywords.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/validate/schema.py +0 -0
- {clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/semantic_model_generator/validate_model.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "clickzetta-semantic-model-generator"
|
3
|
-
version = "1.0.1"
|
3
|
+
version = "1.0.2"
|
4
4
|
description = "Curate a Semantic Model for ClickZetta Lakehouse"
|
5
5
|
authors = ["qililiang <qililiang@clickzetta.com>"]
|
6
6
|
license = "Apache Software License; BSD License"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""Public APIs for relationship discovery."""
|
2
|
+
|
3
|
+
from .discovery import (
|
4
|
+
RelationshipDiscoveryResult,
|
5
|
+
RelationshipSummary,
|
6
|
+
discover_relationships_from_schema,
|
7
|
+
discover_relationships_from_tables,
|
8
|
+
)
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"RelationshipDiscoveryResult",
|
12
|
+
"RelationshipSummary",
|
13
|
+
"discover_relationships_from_schema",
|
14
|
+
"discover_relationships_from_tables",
|
15
|
+
]
|
@@ -0,0 +1,207 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import time
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
from loguru import logger
|
9
|
+
|
10
|
+
from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
|
11
|
+
_TABLE_NAME_COL,
|
12
|
+
_TABLE_SCHEMA_COL,
|
13
|
+
get_table_representation,
|
14
|
+
get_valid_schemas_tables_columns_df,
|
15
|
+
)
|
16
|
+
from semantic_model_generator.data_processing import data_types
|
17
|
+
from semantic_model_generator.data_processing.data_types import FQNParts, Table
|
18
|
+
from semantic_model_generator.generate_model import (
|
19
|
+
_DEFAULT_N_SAMPLE_VALUES_PER_COL,
|
20
|
+
_infer_relationships,
|
21
|
+
)
|
22
|
+
from semantic_model_generator.protos import semantic_model_pb2
|
23
|
+
|
24
|
+
try: # pragma: no cover - optional dependency for type checking
|
25
|
+
from clickzetta.zettapark.session import Session
|
26
|
+
except Exception: # pragma: no cover
|
27
|
+
Session = Any # type: ignore
|
28
|
+
|
29
|
+
DEFAULT_MAX_WORKERS = 4
|
30
|
+
|
31
|
+
|
32
|
+
@dataclass
class RelationshipSummary:
    """Aggregate counters describing a single relationship-discovery run."""

    # Number of tables handed to relationship inference.
    total_tables: int
    # Sum of the column counts of those tables.
    total_columns: int
    # Number of relationships returned by inference.
    total_relationships_found: int
    # Elapsed inference time in whole milliseconds (0 when no metadata was found).
    processing_time_ms: int
|
38
|
+
|
39
|
+
|
40
|
+
@dataclass
class RelationshipDiscoveryResult:
    """Bundle returned by the discovery entry points."""

    # Relationship protos produced by inference.
    relationships: List[semantic_model_pb2.Relationship]
    # Table metadata objects that inference ran over (without their FQN parts).
    tables: List[Table]
    # Counters and timing for the run.
    summary: RelationshipSummary
|
45
|
+
|
46
|
+
|
47
|
+
def _normalize_table_names(table_names: Optional[Iterable[str]]) -> Optional[List[str]]:
|
48
|
+
if table_names is None:
|
49
|
+
return None
|
50
|
+
return [name.upper() for name in table_names]
|
51
|
+
|
52
|
+
|
53
|
+
def _build_tables_from_dataframe(
    session: Session,
    workspace: str,
    schema: str,
    columns_df: pd.DataFrame,
    sample_values_per_column: int,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> List[Tuple[FQNParts, Table]]:
    """Build one table representation per distinct table in ``columns_df``.

    Args:
        session: Session forwarded to ``get_table_representation``.
        workspace: Workspace name; used as the ``database`` part of each FQN.
        schema: Schema name shared by every table in ``columns_df``.
        columns_df: Column metadata with one row per column; must contain the
            ``_TABLE_NAME_COL`` column.
        sample_values_per_column: Forwarded as ``ndv_per_column``.
        max_workers: Upper bound on workers per table; capped at the number of
            metadata rows of that table.

    Returns:
        ``(FQNParts, Table)`` pairs in order of first appearance of each table
        name in ``columns_df``. Empty when ``columns_df`` is empty.

    Raises:
        KeyError: If ``columns_df`` has no ``_TABLE_NAME_COL`` column.
    """
    if columns_df.empty:
        return []

    if _TABLE_NAME_COL not in columns_df.columns:
        raise KeyError(
            f"Expected '{_TABLE_NAME_COL}' column in metadata dataframe. "
            "Ensure information_schema query returned table names."
        )

    # Rows without a table name cannot be attributed to any table.
    named_df = columns_df[columns_df[_TABLE_NAME_COL].notna()]

    # Normalise once and use the SAME series for ordering and for row
    # selection. Comparing the raw column against an upper-cased name would
    # silently drop every table whose metadata name is not already upper case.
    normalized_names = named_df[_TABLE_NAME_COL].astype(str).str.upper()
    table_order = normalized_names.drop_duplicates().tolist()

    tables: List[Tuple[FQNParts, Table]] = []
    for idx, table_name in enumerate(table_order):
        table_columns_df = named_df[normalized_names == table_name]

        # Never ask for more workers than there are column rows (minimum 1).
        max_workers_for_table = min(max_workers, len(table_columns_df.index) or 1)
        table_proto = get_table_representation(
            session=session,
            workspace=workspace,
            schema_name=schema,
            table_name=table_name,
            table_index=idx,
            ndv_per_column=sample_values_per_column,
            columns_df=table_columns_df,
            max_workers=max_workers_for_table,
        )
        tables.append(
            (
                FQNParts(database=workspace, schema_name=schema, table=table_name),
                table_proto,
            )
        )

    return tables
|
103
|
+
|
104
|
+
|
105
|
+
def _discover_relationships(
|
106
|
+
raw_tables: List[Tuple[FQNParts, Table]],
|
107
|
+
strict_join_inference: bool,
|
108
|
+
session: Optional[Session],
|
109
|
+
) -> List[semantic_model_pb2.Relationship]:
|
110
|
+
if not raw_tables:
|
111
|
+
return []
|
112
|
+
|
113
|
+
relationships = _infer_relationships(
|
114
|
+
raw_tables,
|
115
|
+
session=session if strict_join_inference else None,
|
116
|
+
strict_join_inference=strict_join_inference,
|
117
|
+
)
|
118
|
+
return relationships
|
119
|
+
|
120
|
+
|
121
|
+
def discover_relationships_from_tables(
    tables: Sequence[Tuple[FQNParts, Table]],
    *,
    strict_join_inference: bool = False,
    session: Optional[Session] = None,
) -> RelationshipDiscoveryResult:
    """Run relationship inference using pre-constructed table metadata.

    Args:
        tables: ``(FQNParts, Table)`` pairs to analyse.
        strict_join_inference: Forwarded to the inference step; the session is
            only used when this is true.
        session: Optional session for strict inference.

    Returns:
        The inferred relationships, the ``Table`` objects that were analysed
        and a summary with counts and the elapsed inference time.
    """
    start = time.perf_counter()
    # Materialise once so that inference, the counters and the returned table
    # list all see the same items, even if a one-shot iterable is passed.
    table_list = list(tables)
    relationships = _discover_relationships(
        table_list,
        strict_join_inference=strict_join_inference,
        session=session,
    )
    end = time.perf_counter()

    table_protos = [table for _, table in table_list]
    summary = RelationshipSummary(
        total_tables=len(table_list),
        total_columns=sum(len(table.columns) for table in table_protos),
        total_relationships_found=len(relationships),
        processing_time_ms=int((end - start) * 1000),
    )

    return RelationshipDiscoveryResult(
        relationships=relationships,
        tables=table_protos,
        summary=summary,
    )
|
151
|
+
|
152
|
+
|
153
|
+
def discover_relationships_from_schema(
    session: Session,
    workspace: str,
    schema: str,
    *,
    table_names: Optional[Sequence[str]] = None,
    sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
    strict_join_inference: bool = False,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> RelationshipDiscoveryResult:
    """Discover table relationships for all tables in a ClickZetta schema.

    Args:
        session: Session used for the metadata lookup, table construction and
            (when ``strict_join_inference`` is true) inference.
        workspace: Workspace to inspect.
        schema: Schema to inspect.
        table_names: Optional subset of tables; names are upper-cased before
            the metadata lookup. ``None`` applies no table filter.
        sample_values_per_column: Number of sample values requested per column.
        strict_join_inference: Forwarded to relationship inference.
        max_workers: Upper bound on workers used while building each table.

    Returns:
        The discovery result. When no column metadata is found, a warning is
        logged and an empty result with all-zero summary is returned.
    """
    normalized_tables = _normalize_table_names(table_names)

    metadata_df = get_valid_schemas_tables_columns_df(
        session=session,
        workspace=workspace,
        table_schema=schema,
        table_names=normalized_tables,
    )
    # Upper-case the column labels so later lookups by label are case-insensitive.
    metadata_df.columns = [str(col).upper() for col in metadata_df.columns]

    if metadata_df.empty:
        # loguru interpolates with str.format-style braces; "%s" placeholders
        # would be emitted literally and the arguments dropped.
        logger.warning(
            "No column metadata found for workspace={} schema={} tables={}",
            workspace,
            schema,
            table_names,
        )
        return RelationshipDiscoveryResult(
            relationships=[],
            tables=[],
            summary=RelationshipSummary(
                total_tables=0,
                total_columns=0,
                total_relationships_found=0,
                processing_time_ms=0,
            ),
        )

    raw_tables = _build_tables_from_dataframe(
        session=session,
        workspace=workspace,
        schema=schema,
        columns_df=metadata_df,
        sample_values_per_column=sample_values_per_column,
        max_workers=max_workers,
    )

    return discover_relationships_from_tables(
        raw_tables,
        strict_join_inference=strict_join_inference,
        session=session,
    )
|
@@ -0,0 +1,111 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Any, Dict, List
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
from semantic_model_generator.relationships.discovery import (
|
8
|
+
discover_relationships_from_schema,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
class _FakeResult:
|
13
|
+
def __init__(self, df: pd.DataFrame):
|
14
|
+
self._df = df
|
15
|
+
|
16
|
+
def to_pandas(self) -> pd.DataFrame:
|
17
|
+
return self._df.copy()
|
18
|
+
|
19
|
+
|
20
|
+
class _FakeSession:
|
21
|
+
def __init__(self, tables: List[str], columns_df: pd.DataFrame):
|
22
|
+
self.tables = tables
|
23
|
+
self.columns_df = columns_df
|
24
|
+
|
25
|
+
def sql(self, query: str):
|
26
|
+
normalized = query.upper()
|
27
|
+
if "SHOW CATALOGS" in normalized:
|
28
|
+
return _FakeResult(
|
29
|
+
pd.DataFrame(
|
30
|
+
{
|
31
|
+
"CATALOG_NAME": ["CLICKZETTA_SAMPLE_DATA"],
|
32
|
+
"CATEGORY": ["MANAGED"],
|
33
|
+
}
|
34
|
+
)
|
35
|
+
)
|
36
|
+
if "FROM INFORMATION_SCHEMA.TABLES" in normalized:
|
37
|
+
data = {"TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables), "TABLE_NAME": self.tables}
|
38
|
+
return _FakeResult(pd.DataFrame(data))
|
39
|
+
if "FROM INFORMATION_SCHEMA.COLUMNS" in normalized:
|
40
|
+
return _FakeResult(self.columns_df)
|
41
|
+
if "SELECT DISTINCT" in normalized:
|
42
|
+
# Return single column of sample values
|
43
|
+
return _FakeResult(pd.DataFrame({"VALUE": [1, 2, 3]}))
|
44
|
+
raise AssertionError(f"Unexpected query: {query}")
|
45
|
+
|
46
|
+
|
47
|
+
def _build_columns_df() -> pd.DataFrame:
|
48
|
+
records: List[Dict[str, Any]] = []
|
49
|
+
# Orders table
|
50
|
+
records.extend(
|
51
|
+
[
|
52
|
+
{
|
53
|
+
"TABLE_SCHEMA": "TPCH_100G",
|
54
|
+
"TABLE_NAME": "ORDERS",
|
55
|
+
"COLUMN_NAME": "ORDER_ID",
|
56
|
+
"DATA_TYPE": "NUMBER",
|
57
|
+
"IS_PRIMARY_KEY": True,
|
58
|
+
},
|
59
|
+
{
|
60
|
+
"TABLE_SCHEMA": "TPCH_100G",
|
61
|
+
"TABLE_NAME": "ORDERS",
|
62
|
+
"COLUMN_NAME": "CUSTOMER_ID",
|
63
|
+
"DATA_TYPE": "NUMBER",
|
64
|
+
"IS_PRIMARY_KEY": False,
|
65
|
+
},
|
66
|
+
]
|
67
|
+
)
|
68
|
+
# Customer table
|
69
|
+
records.extend(
|
70
|
+
[
|
71
|
+
{
|
72
|
+
"TABLE_SCHEMA": "TPCH_100G",
|
73
|
+
"TABLE_NAME": "CUSTOMER",
|
74
|
+
"COLUMN_NAME": "CUSTOMER_ID",
|
75
|
+
"DATA_TYPE": "NUMBER",
|
76
|
+
"IS_PRIMARY_KEY": True,
|
77
|
+
},
|
78
|
+
{
|
79
|
+
"TABLE_SCHEMA": "TPCH_100G",
|
80
|
+
"TABLE_NAME": "CUSTOMER",
|
81
|
+
"COLUMN_NAME": "NAME",
|
82
|
+
"DATA_TYPE": "STRING",
|
83
|
+
"IS_PRIMARY_KEY": False,
|
84
|
+
},
|
85
|
+
]
|
86
|
+
)
|
87
|
+
return pd.DataFrame.from_records(records)
|
88
|
+
|
89
|
+
|
90
|
+
def test_discover_relationships_from_schema_builds_relationships():
    """Schema discovery over the fake session links ORDERS to CUSTOMER."""
    fake_session = _FakeSession(["ORDERS", "CUSTOMER"], _build_columns_df())

    result = discover_relationships_from_schema(
        session=fake_session,
        workspace="CLICKZETTA_SAMPLE_DATA",
        schema="TPCH_100G",
        strict_join_inference=False,
    )

    summary = result.summary
    assert summary.total_tables == 2
    assert summary.total_relationships_found >= 1

    found = result.relationships
    # At least one relationship name mentions both tables.
    assert any("ORDERS" in rel.name and "CUSTOMER" in rel.name for rel in found)

    # ORDERS appears on the left side and CUSTOMER on the right side.
    assert "ORDERS" in {rel.left_table for rel in found}
    assert "CUSTOMER" in {rel.right_table for rel in found}
|
{clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/LICENSE
RENAMED
File without changes
|
{clickzetta_semantic_model_generator-1.0.1 → clickzetta_semantic_model_generator-1.0.2}/README.md
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|