clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: clickzetta-semantic-model-generator
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Curate a Semantic Model for ClickZetta Lakehouse
5
5
  License: Apache Software License; BSD License
6
6
  Author: qililiang
@@ -16,10 +16,13 @@ semantic_model_generator/output_models/.keep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
16
16
  semantic_model_generator/protos/semantic_model.proto,sha256=WZiN4b8vR-ZX-Lj9Vsm6HjZNAyNvM1znIyut_YkPVSI,16473
17
17
  semantic_model_generator/protos/semantic_model_pb2.py,sha256=scbWkW-I-r3_hp_5SHoOWn02p52RJ9DJ0_-nRgr0LHc,25606
18
18
  semantic_model_generator/protos/semantic_model_pb2.pyi,sha256=iiBIZxtX9d6IuUO3aLcsJsHUeZqdi14vYNuUsSM8C0g,18267
19
+ semantic_model_generator/relationships/__init__.py,sha256=HN6Opie25Oawt2fCDM_bZwRBVBEzqRsEXgDzYC7ytns,373
20
+ semantic_model_generator/relationships/discovery.py,sha256=fy8mzfAZtWWVUjiSP5jeKffoa2GodNIKA3eGi4dxhHo,6020
19
21
  semantic_model_generator/tests/clickzetta_connector_test.py,sha256=NKDbhll8TxFtbwFJIUcQQDKJU1LkfEz9nL0SE32hx3o,3114
20
22
  semantic_model_generator/tests/cte_utils_test.py,sha256=8v2nrrD2GkH_PTGIsKm3lQ06unzis8iR31atT8bUX98,17385
21
23
  semantic_model_generator/tests/generate_model_classification_test.py,sha256=q7dh29h9iF17ChzuoFSLtRX9ASiAm2oY4OkGyFVfn5Y,2117
22
24
  semantic_model_generator/tests/llm_enrichment_test.py,sha256=YeYg4voQ3wy2vgF7H9JNdnMOyZmUfiMdL6oEXFv-ztg,14415
25
+ semantic_model_generator/tests/relationship_discovery_test.py,sha256=1SVX59-mpHQvxk7RDGtglesg6VXU9TnnZZcfjZi5IHs,3448
23
26
  semantic_model_generator/tests/relationships_filters_test.py,sha256=fVyA-hwxGqdlFr_PGT9YdrPz13XGvI4J_5F3IpVTFEE,8009
24
27
  semantic_model_generator/tests/samples/validate_yamls.py,sha256=262j-2i2oFZtTyK2susOrbxxE5eS-6IN-V0jFEOpt_w,156249
25
28
  semantic_model_generator/tests/utils_test.py,sha256=Tfvb-ErZPBS_HjXr4N7XSJnL3hlncNGF5pay1xFfNHg,539
@@ -29,7 +32,7 @@ semantic_model_generator/validate/context_length.py,sha256=HL-GfaRXNcVji1-pAFGXG
29
32
  semantic_model_generator/validate/keywords.py,sha256=mFtmIK72YLQ7wODL-zCC-uMXzXMJT6Tg4fISAin2WtQ,5480
30
33
  semantic_model_generator/validate/schema.py,sha256=ImFUzO5EzNbl65-0dqt5AmdEvy4lqX8gBkOSOKN8uZM,5863
31
34
  semantic_model_generator/validate_model.py,sha256=Uq-V-GfPeF2Dy4l9uF5Guv104gDCDGh0Cxz1AJOu5dk,836
32
- clickzetta_semantic_model_generator-1.0.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- clickzetta_semantic_model_generator-1.0.1.dist-info/METADATA,sha256=mbWjvg1PtkZlQfPAE06kHDe6whzheZ0iYj_HfXNqqGE,7816
34
- clickzetta_semantic_model_generator-1.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
35
- clickzetta_semantic_model_generator-1.0.1.dist-info/RECORD,,
35
+ clickzetta_semantic_model_generator-1.0.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
36
+ clickzetta_semantic_model_generator-1.0.2.dist-info/METADATA,sha256=vgObGkoBnyfxUuJM8eAH8ZnIGuJrueJFnU1UfTo_gN4,7816
37
+ clickzetta_semantic_model_generator-1.0.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
38
+ clickzetta_semantic_model_generator-1.0.2.dist-info/RECORD,,
@@ -0,0 +1,15 @@
1
+ """Public APIs for relationship discovery."""
2
+
3
+ from .discovery import (
4
+ RelationshipDiscoveryResult,
5
+ RelationshipSummary,
6
+ discover_relationships_from_schema,
7
+ discover_relationships_from_tables,
8
+ )
9
+
10
+ __all__ = [
11
+ "RelationshipDiscoveryResult",
12
+ "RelationshipSummary",
13
+ "discover_relationships_from_schema",
14
+ "discover_relationships_from_tables",
15
+ ]
@@ -0,0 +1,207 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
6
+
7
+ import pandas as pd
8
+ from loguru import logger
9
+
10
+ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
11
+ _TABLE_NAME_COL,
12
+ _TABLE_SCHEMA_COL,
13
+ get_table_representation,
14
+ get_valid_schemas_tables_columns_df,
15
+ )
16
+ from semantic_model_generator.data_processing import data_types
17
+ from semantic_model_generator.data_processing.data_types import FQNParts, Table
18
+ from semantic_model_generator.generate_model import (
19
+ _DEFAULT_N_SAMPLE_VALUES_PER_COL,
20
+ _infer_relationships,
21
+ )
22
+ from semantic_model_generator.protos import semantic_model_pb2
23
+
24
# ``Session`` is referenced only in type annotations in this module, so the
# ClickZetta SDK is treated as optional: if importing it fails for any reason,
# fall back to ``Any`` so the module can still be imported.
try:  # pragma: no cover - optional dependency for type checking
    from clickzetta.zettapark.session import Session
except Exception:  # pragma: no cover
    Session = Any  # type: ignore

# Default value for the ``max_workers`` parameters exposed by this module.
DEFAULT_MAX_WORKERS = 4
30
+
31
+
32
@dataclass
class RelationshipSummary:
    """Aggregate counters describing a single relationship-discovery run."""

    # Number of tables that took part in the run.
    total_tables: int
    # Sum of the column counts of those tables.
    total_columns: int
    # Number of relationships returned by the inference step.
    total_relationships_found: int
    # Elapsed time of the inference step, truncated to whole milliseconds.
    processing_time_ms: int
38
+
39
+
40
@dataclass
class RelationshipDiscoveryResult:
    """Bundle returned by the public relationship-discovery entry points."""

    # Relationships produced by the inference step.
    relationships: List[semantic_model_pb2.Relationship]
    # Table representations that were analysed.
    tables: List[Table]
    # Counters and timing for the run.
    summary: RelationshipSummary
45
+
46
+
47
+ def _normalize_table_names(table_names: Optional[Iterable[str]]) -> Optional[List[str]]:
48
+ if table_names is None:
49
+ return None
50
+ return [name.upper() for name in table_names]
51
+
52
+
53
def _build_tables_from_dataframe(
    session: Session,
    workspace: str,
    schema: str,
    columns_df: pd.DataFrame,
    sample_values_per_column: int,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> List[Tuple[FQNParts, Table]]:
    """Build one table representation per table found in ``columns_df``.

    Tables are processed in order of first appearance in the metadata frame.
    Table names are matched case-insensitively (compared in upper case), and
    the upper-cased name is what is passed on to ``get_table_representation``
    and stored in the returned ``FQNParts``.

    Args:
        session: Session forwarded to ``get_table_representation``.
        workspace: Workspace name; used as the ``database`` part of the FQN.
        schema: Schema name the tables belong to.
        columns_df: Column metadata with one row per column. Must contain the
            ``_TABLE_NAME_COL`` column.
        sample_values_per_column: Forwarded as ``ndv_per_column``.
        max_workers: Upper bound for the per-table ``max_workers`` value; the
            value actually passed is capped at the table's column-row count.

    Returns:
        A list of ``(FQNParts, Table)`` pairs, empty when ``columns_df`` is
        empty.

    Raises:
        KeyError: If ``columns_df`` has rows but no ``_TABLE_NAME_COL`` column.
    """
    if columns_df.empty:
        return []

    if _TABLE_NAME_COL not in columns_df.columns:
        raise KeyError(
            f"Expected '{_TABLE_NAME_COL}' column in metadata dataframe. "
            "Ensure information_schema query returned table names."
        )

    # Normalize once and reuse the same series for both the ordering and the
    # row filter. Filtering against the raw column (as was done previously)
    # silently dropped every table whose metadata name was not already
    # upper-case, because the lookup key had been upper-cased.
    raw_names = columns_df[_TABLE_NAME_COL]
    normalized_names = raw_names.astype(str).str.upper()
    # Rows without a table name cannot be attributed to any table; skip them.
    table_order = normalized_names[raw_names.notna()].drop_duplicates().tolist()

    tables: List[Tuple[FQNParts, Table]] = []
    for idx, table_name in enumerate(table_order):
        table_columns_df = columns_df[normalized_names == table_name]

        # No point in asking for more workers than there are column rows.
        max_workers_for_table = min(max_workers, len(table_columns_df.index) or 1)
        table_proto = get_table_representation(
            session=session,
            workspace=workspace,
            schema_name=schema,
            table_name=table_name,
            table_index=idx,
            ndv_per_column=sample_values_per_column,
            columns_df=table_columns_df,
            max_workers=max_workers_for_table,
        )
        tables.append(
            (
                FQNParts(database=workspace, schema_name=schema, table=table_name),
                table_proto,
            )
        )

    return tables
103
+
104
+
105
+ def _discover_relationships(
106
+ raw_tables: List[Tuple[FQNParts, Table]],
107
+ strict_join_inference: bool,
108
+ session: Optional[Session],
109
+ ) -> List[semantic_model_pb2.Relationship]:
110
+ if not raw_tables:
111
+ return []
112
+
113
+ relationships = _infer_relationships(
114
+ raw_tables,
115
+ session=session if strict_join_inference else None,
116
+ strict_join_inference=strict_join_inference,
117
+ )
118
+ return relationships
119
+
120
+
121
def discover_relationships_from_tables(
    tables: Sequence[Tuple[FQNParts, Table]],
    *,
    strict_join_inference: bool = False,
    session: Optional[Session] = None,
) -> RelationshipDiscoveryResult:
    """
    Run relationship inference using pre-constructed table metadata.

    Args:
        tables: ``(FQNParts, Table)`` pairs to analyse. Any iterable is
            accepted; it is materialized once up front.
        strict_join_inference: Forwarded to the inference step. The session
            is only handed to inference when this is true.
        session: Optional session used for strict inference.

    Returns:
        A ``RelationshipDiscoveryResult`` holding the inferred relationships,
        the table representations in input order, and a summary whose
        ``processing_time_ms`` covers only the inference call.
    """
    # Materialize once: the input is traversed several times below and passed
    # to len(), which would give wrong counts (or fail) for a one-shot
    # iterable such as a generator.
    table_pairs = list(tables)

    start = time.perf_counter()
    relationships = _discover_relationships(
        table_pairs,
        strict_join_inference=strict_join_inference,
        session=session,
    )
    end = time.perf_counter()

    table_protos = [table for _, table in table_pairs]
    summary = RelationshipSummary(
        total_tables=len(table_pairs),
        total_columns=sum(len(table.columns) for table in table_protos),
        total_relationships_found=len(relationships),
        processing_time_ms=int((end - start) * 1000),
    )

    return RelationshipDiscoveryResult(
        relationships=relationships,
        tables=table_protos,
        summary=summary,
    )
151
+
152
+
153
def discover_relationships_from_schema(
    session: Session,
    workspace: str,
    schema: str,
    *,
    table_names: Optional[Sequence[str]] = None,
    sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
    strict_join_inference: bool = False,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> RelationshipDiscoveryResult:
    """
    Discover table relationships for all tables in a ClickZetta schema.

    Args:
        session: Session used to fetch column metadata and build the table
            representations.
        workspace: Workspace to inspect.
        schema: Schema to inspect.
        table_names: Optional subset of tables; names are upper-cased before
            being used as a metadata filter. ``None`` applies no filter.
        sample_values_per_column: Forwarded to table construction.
        strict_join_inference: Forwarded to the inference step.
        max_workers: Upper bound on workers used while building each table.

    Returns:
        A ``RelationshipDiscoveryResult``. When no column metadata is found,
        a warning is logged and an empty result with zeroed summary is
        returned.
    """
    normalized_tables = _normalize_table_names(table_names)

    metadata_df = get_valid_schemas_tables_columns_df(
        session=session,
        workspace=workspace,
        table_schema=schema,
        table_names=normalized_tables,
    )
    # Upper-case the column labels so later lookups by column name do not
    # depend on the casing the metadata query returned.
    metadata_df.columns = [str(col).upper() for col in metadata_df.columns]

    if metadata_df.empty:
        # loguru formats messages with str.format, so placeholders must be
        # "{}"; "%s" would be logged literally and the arguments dropped.
        logger.warning(
            "No column metadata found for workspace={} schema={} tables={}",
            workspace,
            schema,
            table_names,
        )
        return RelationshipDiscoveryResult(
            relationships=[],
            tables=[],
            summary=RelationshipSummary(
                total_tables=0,
                total_columns=0,
                total_relationships_found=0,
                processing_time_ms=0,
            ),
        )

    raw_tables = _build_tables_from_dataframe(
        session=session,
        workspace=workspace,
        schema=schema,
        columns_df=metadata_df,
        sample_values_per_column=sample_values_per_column,
        max_workers=max_workers,
    )

    return discover_relationships_from_tables(
        raw_tables,
        strict_join_inference=strict_join_inference,
        session=session,
    )
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+ import pandas as pd
6
+
7
+ from semantic_model_generator.relationships.discovery import (
8
+ discover_relationships_from_schema,
9
+ )
10
+
11
+
12
+ class _FakeResult:
13
+ def __init__(self, df: pd.DataFrame):
14
+ self._df = df
15
+
16
+ def to_pandas(self) -> pd.DataFrame:
17
+ return self._df.copy()
18
+
19
+
20
+ class _FakeSession:
21
+ def __init__(self, tables: List[str], columns_df: pd.DataFrame):
22
+ self.tables = tables
23
+ self.columns_df = columns_df
24
+
25
+ def sql(self, query: str):
26
+ normalized = query.upper()
27
+ if "SHOW CATALOGS" in normalized:
28
+ return _FakeResult(
29
+ pd.DataFrame(
30
+ {
31
+ "CATALOG_NAME": ["CLICKZETTA_SAMPLE_DATA"],
32
+ "CATEGORY": ["MANAGED"],
33
+ }
34
+ )
35
+ )
36
+ if "FROM INFORMATION_SCHEMA.TABLES" in normalized:
37
+ data = {"TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables), "TABLE_NAME": self.tables}
38
+ return _FakeResult(pd.DataFrame(data))
39
+ if "FROM INFORMATION_SCHEMA.COLUMNS" in normalized:
40
+ return _FakeResult(self.columns_df)
41
+ if "SELECT DISTINCT" in normalized:
42
+ # Return single column of sample values
43
+ return _FakeResult(pd.DataFrame({"VALUE": [1, 2, 3]}))
44
+ raise AssertionError(f"Unexpected query: {query}")
45
+
46
+
47
+ def _build_columns_df() -> pd.DataFrame:
48
+ records: List[Dict[str, Any]] = []
49
+ # Orders table
50
+ records.extend(
51
+ [
52
+ {
53
+ "TABLE_SCHEMA": "TPCH_100G",
54
+ "TABLE_NAME": "ORDERS",
55
+ "COLUMN_NAME": "ORDER_ID",
56
+ "DATA_TYPE": "NUMBER",
57
+ "IS_PRIMARY_KEY": True,
58
+ },
59
+ {
60
+ "TABLE_SCHEMA": "TPCH_100G",
61
+ "TABLE_NAME": "ORDERS",
62
+ "COLUMN_NAME": "CUSTOMER_ID",
63
+ "DATA_TYPE": "NUMBER",
64
+ "IS_PRIMARY_KEY": False,
65
+ },
66
+ ]
67
+ )
68
+ # Customer table
69
+ records.extend(
70
+ [
71
+ {
72
+ "TABLE_SCHEMA": "TPCH_100G",
73
+ "TABLE_NAME": "CUSTOMER",
74
+ "COLUMN_NAME": "CUSTOMER_ID",
75
+ "DATA_TYPE": "NUMBER",
76
+ "IS_PRIMARY_KEY": True,
77
+ },
78
+ {
79
+ "TABLE_SCHEMA": "TPCH_100G",
80
+ "TABLE_NAME": "CUSTOMER",
81
+ "COLUMN_NAME": "NAME",
82
+ "DATA_TYPE": "STRING",
83
+ "IS_PRIMARY_KEY": False,
84
+ },
85
+ ]
86
+ )
87
+ return pd.DataFrame.from_records(records)
88
+
89
+
90
def test_discover_relationships_from_schema_builds_relationships():
    """Schema discovery over the fake ORDERS/CUSTOMER metadata links the two tables."""
    session = _FakeSession(["ORDERS", "CUSTOMER"], _build_columns_df())

    result = discover_relationships_from_schema(
        session=session,
        workspace="CLICKZETTA_SAMPLE_DATA",
        schema="TPCH_100G",
        strict_join_inference=False,
    )

    summary = result.summary
    assert summary.total_tables == 2
    assert summary.total_relationships_found >= 1

    relationships = result.relationships

    # At least one relationship name mentions both tables.
    assert any(
        "ORDERS" in rel.name and "CUSTOMER" in rel.name for rel in relationships
    )

    # ORDERS appears as a left table and CUSTOMER as a right table.
    assert any(rel.left_table == "ORDERS" for rel in relationships)
    assert any(rel.right_table == "CUSTOMER" for rel in relationships)