clickzetta-semantic-model-generator 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
  2. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +22 -19
  3. semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
  4. semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. semantic_model_generator/data_processing/cte_utils.py +1 -1
  6. semantic_model_generator/generate_model.py +588 -224
  7. semantic_model_generator/llm/dashscope_client.py +4 -2
  8. semantic_model_generator/llm/enrichment.py +144 -57
  9. semantic_model_generator/llm/progress_tracker.py +16 -15
  10. semantic_model_generator/relationships/__init__.py +15 -0
  11. semantic_model_generator/relationships/discovery.py +202 -0
  12. semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  13. semantic_model_generator/tests/cte_utils_test.py +1 -1
  14. semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  15. semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  16. semantic_model_generator/tests/relationship_discovery_test.py +114 -0
  17. semantic_model_generator/tests/relationships_filters_test.py +166 -30
  18. semantic_model_generator/tests/utils_test.py +1 -1
  19. semantic_model_generator/validate/keywords.py +453 -53
  20. semantic_model_generator/validate/schema.py +4 -2
  21. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
  22. {clickzetta_semantic_model_generator-1.0.1.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+ import pandas as pd
6
+
7
+ from semantic_model_generator.relationships.discovery import (
8
+ discover_relationships_from_schema,
9
+ )
10
+
11
+
12
+ class _FakeResult:
13
+ def __init__(self, df: pd.DataFrame):
14
+ self._df = df
15
+
16
+ def to_pandas(self) -> pd.DataFrame:
17
+ return self._df.copy()
18
+
19
+
20
+ class _FakeSession:
21
+ def __init__(self, tables: List[str], columns_df: pd.DataFrame):
22
+ self.tables = tables
23
+ self.columns_df = columns_df
24
+
25
+ def sql(self, query: str):
26
+ normalized = query.upper()
27
+ if "SHOW CATALOGS" in normalized:
28
+ return _FakeResult(
29
+ pd.DataFrame(
30
+ {
31
+ "CATALOG_NAME": ["CLICKZETTA_SAMPLE_DATA"],
32
+ "CATEGORY": ["MANAGED"],
33
+ }
34
+ )
35
+ )
36
+ if "INFORMATION_SCHEMA.COLUMNS" in normalized:
37
+ return _FakeResult(self.columns_df)
38
+ if "FROM INFORMATION_SCHEMA.TABLES" in normalized:
39
+ data = {
40
+ "TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables),
41
+ "TABLE_NAME": self.tables,
42
+ }
43
+ return _FakeResult(pd.DataFrame(data))
44
+ if "SELECT DISTINCT" in normalized:
45
+ # Return single column of sample values
46
+ return _FakeResult(pd.DataFrame({"VALUE": [1, 2, 3]}))
47
+ raise AssertionError(f"Unexpected query: {query}")
48
+
49
+
50
+ def _build_columns_df() -> pd.DataFrame:
51
+ records: List[Dict[str, Any]] = []
52
+ # Orders table
53
+ records.extend(
54
+ [
55
+ {
56
+ "TABLE_SCHEMA": "TPCH_100G",
57
+ "TABLE_NAME": "ORDERS",
58
+ "COLUMN_NAME": "ORDER_ID",
59
+ "DATA_TYPE": "NUMBER",
60
+ "IS_PRIMARY_KEY": True,
61
+ },
62
+ {
63
+ "TABLE_SCHEMA": "TPCH_100G",
64
+ "TABLE_NAME": "ORDERS",
65
+ "COLUMN_NAME": "CUSTOMER_ID",
66
+ "DATA_TYPE": "NUMBER",
67
+ "IS_PRIMARY_KEY": False,
68
+ },
69
+ ]
70
+ )
71
+ # Customer table
72
+ records.extend(
73
+ [
74
+ {
75
+ "TABLE_SCHEMA": "TPCH_100G",
76
+ "TABLE_NAME": "CUSTOMER",
77
+ "COLUMN_NAME": "CUSTOMER_ID",
78
+ "DATA_TYPE": "NUMBER",
79
+ "IS_PRIMARY_KEY": True,
80
+ },
81
+ {
82
+ "TABLE_SCHEMA": "TPCH_100G",
83
+ "TABLE_NAME": "CUSTOMER",
84
+ "COLUMN_NAME": "NAME",
85
+ "DATA_TYPE": "STRING",
86
+ "IS_PRIMARY_KEY": False,
87
+ },
88
+ ]
89
+ )
90
+ return pd.DataFrame.from_records(records)
91
+
92
+
93
+ def test_discover_relationships_from_schema_builds_relationships():
94
+ tables = ["ORDERS", "CUSTOMER"]
95
+ columns_df = _build_columns_df()
96
+ session = _FakeSession(tables, columns_df)
97
+
98
+ result = discover_relationships_from_schema(
99
+ session=session,
100
+ workspace="CLICKZETTA_SAMPLE_DATA",
101
+ schema="TPCH_100G",
102
+ strict_join_inference=False,
103
+ )
104
+
105
+ assert result.summary.total_tables == 2
106
+ assert result.summary.total_relationships_found >= 1
107
+
108
+ names = {rel.name for rel in result.relationships}
109
+ assert any("ORDERS" in name and "CUSTOMER" in name for name in names)
110
+
111
+ left_tables = {rel.left_table for rel in result.relationships}
112
+ right_tables = {rel.right_table for rel in result.relationships}
113
+ assert "ORDERS" in left_tables
114
+ assert "CUSTOMER" in right_tables
@@ -7,9 +7,24 @@ def test_suggest_filters_builds_in_clause_and_time_filter() -> None:
7
7
  id_=0,
8
8
  name="ORDERS",
9
9
  columns=[
10
- Column(id_=0, column_name="status", column_type="STRING", values=["OPEN", "CLOSED", "OPEN"]),
11
- Column(id_=1, column_name="order_date", column_type="TIMESTAMP", values=["2024-01-01", "2024-01-15"]),
12
- Column(id_=2, column_name="created_at", column_type="STRING", values=["2024-01-05 10:00:00"]),
10
+ Column(
11
+ id_=0,
12
+ column_name="status",
13
+ column_type="STRING",
14
+ values=["OPEN", "CLOSED", "OPEN"],
15
+ ),
16
+ Column(
17
+ id_=1,
18
+ column_name="order_date",
19
+ column_type="TIMESTAMP",
20
+ values=["2024-01-01", "2024-01-15"],
21
+ ),
22
+ Column(
23
+ id_=2,
24
+ column_name="created_at",
25
+ column_type="STRING",
26
+ values=["2024-01-05 10:00:00"],
27
+ ),
13
28
  ],
14
29
  )
15
30
 
@@ -26,7 +41,12 @@ def test_infer_relationships_uses_pk_candidate() -> None:
26
41
  id_=0,
27
42
  name="CUSTOMERS",
28
43
  columns=[
29
- Column(id_=0, column_name="customer_id", column_type="INT", values=["1", "2", "3"]),
44
+ Column(
45
+ id_=0,
46
+ column_name="customer_id",
47
+ column_type="INT",
48
+ values=["1", "2", "3"],
49
+ ),
30
50
  Column(id_=1, column_name="customer_name", column_type="STRING"),
31
51
  ],
32
52
  )
@@ -34,15 +54,35 @@ def test_infer_relationships_uses_pk_candidate() -> None:
34
54
  id_=1,
35
55
  name="ORDERS",
36
56
  columns=[
37
- Column(id_=0, column_name="order_id", column_type="INT", values=["10", "11", "12"]),
38
- Column(id_=1, column_name="customer_id", column_type="INT", values=["1", "2", "1"]),
57
+ Column(
58
+ id_=0,
59
+ column_name="order_id",
60
+ column_type="INT",
61
+ values=["10", "11", "12"],
62
+ ),
63
+ Column(
64
+ id_=1,
65
+ column_name="customer_id",
66
+ column_type="INT",
67
+ values=["1", "2", "1"],
68
+ ),
39
69
  ],
40
70
  )
41
71
 
42
72
  relationships = generate_model._infer_relationships(
43
73
  [
44
- (FQNParts(database="QUICK_START", schema_name="MCP_DEMO", table="CUSTOMERS"), customers_table),
45
- (FQNParts(database="QUICK_START", schema_name="MCP_DEMO", table="ORDERS"), orders_table),
74
+ (
75
+ FQNParts(
76
+ database="QUICK_START", schema_name="MCP_DEMO", table="CUSTOMERS"
77
+ ),
78
+ customers_table,
79
+ ),
80
+ (
81
+ FQNParts(
82
+ database="QUICK_START", schema_name="MCP_DEMO", table="ORDERS"
83
+ ),
84
+ orders_table,
85
+ ),
46
86
  ]
47
87
  )
48
88
 
@@ -61,23 +101,46 @@ def test_infer_relationships_matches_synonym_keys() -> None:
61
101
  id_=0,
62
102
  name="ORDERS",
63
103
  columns=[
64
- Column(id_=0, column_name="o_orderkey", column_type="INT", values=["1", "2", "3"]),
65
- Column(id_=1, column_name="o_custkey", column_type="INT", values=["10", "20", "30"]),
104
+ Column(
105
+ id_=0,
106
+ column_name="o_orderkey",
107
+ column_type="INT",
108
+ values=["1", "2", "3"],
109
+ ),
110
+ Column(
111
+ id_=1,
112
+ column_name="o_custkey",
113
+ column_type="INT",
114
+ values=["10", "20", "30"],
115
+ ),
66
116
  ],
67
117
  )
68
118
  lineitem_table = Table(
69
119
  id_=1,
70
120
  name="LINEITEM",
71
121
  columns=[
72
- Column(id_=0, column_name="l_orderkey", column_type="INT", values=["1", "1", "2"]),
73
- Column(id_=1, column_name="l_linenumber", column_type="INT", values=["1", "2", "1"]),
122
+ Column(
123
+ id_=0,
124
+ column_name="l_orderkey",
125
+ column_type="INT",
126
+ values=["1", "1", "2"],
127
+ ),
128
+ Column(
129
+ id_=1,
130
+ column_name="l_linenumber",
131
+ column_type="INT",
132
+ values=["1", "2", "1"],
133
+ ),
74
134
  ],
75
135
  )
76
136
 
77
137
  relationships = generate_model._infer_relationships(
78
138
  [
79
139
  (FQNParts(database="CAT", schema_name="SCH", table="ORDERS"), orders_table),
80
- (FQNParts(database="CAT", schema_name="SCH", table="LINEITEM"), lineitem_table),
140
+ (
141
+ FQNParts(database="CAT", schema_name="SCH", table="LINEITEM"),
142
+ lineitem_table,
143
+ ),
81
144
  ]
82
145
  )
83
146
 
@@ -95,22 +158,40 @@ def test_infer_relationships_handles_part_supplier() -> None:
95
158
  id_=0,
96
159
  name="PART",
97
160
  columns=[
98
- Column(id_=0, column_name="p_partkey", column_type="INT", values=["1", "2", "3"]),
161
+ Column(
162
+ id_=0,
163
+ column_name="p_partkey",
164
+ column_type="INT",
165
+ values=["1", "2", "3"],
166
+ ),
99
167
  ],
100
168
  )
101
169
  partsupp_table = Table(
102
170
  id_=1,
103
171
  name="PARTSUPP",
104
172
  columns=[
105
- Column(id_=0, column_name="ps_partkey", column_type="INT", values=["1", "1", "2"]),
106
- Column(id_=1, column_name="ps_suppkey", column_type="INT", values=["10", "20", "30"]),
173
+ Column(
174
+ id_=0,
175
+ column_name="ps_partkey",
176
+ column_type="INT",
177
+ values=["1", "1", "2"],
178
+ ),
179
+ Column(
180
+ id_=1,
181
+ column_name="ps_suppkey",
182
+ column_type="INT",
183
+ values=["10", "20", "30"],
184
+ ),
107
185
  ],
108
186
  )
109
187
 
110
188
  relationships = generate_model._infer_relationships(
111
189
  [
112
190
  (FQNParts(database="CAT", schema_name="SCH", table="PART"), part_table),
113
- (FQNParts(database="CAT", schema_name="SCH", table="PARTSUPP"), partsupp_table),
191
+ (
192
+ FQNParts(database="CAT", schema_name="SCH", table="PARTSUPP"),
193
+ partsupp_table,
194
+ ),
114
195
  ]
115
196
  )
116
197
 
@@ -128,15 +209,30 @@ def test_infer_relationships_orders_customer() -> None:
128
209
  id_=0,
129
210
  name="ORDERS",
130
211
  columns=[
131
- Column(id_=0, column_name="o_orderkey", column_type="INT", values=["1", "2", "3"]),
132
- Column(id_=1, column_name="o_custkey", column_type="INT", values=["10", "20", "30"]),
212
+ Column(
213
+ id_=0,
214
+ column_name="o_orderkey",
215
+ column_type="INT",
216
+ values=["1", "2", "3"],
217
+ ),
218
+ Column(
219
+ id_=1,
220
+ column_name="o_custkey",
221
+ column_type="INT",
222
+ values=["10", "20", "30"],
223
+ ),
133
224
  ],
134
225
  )
135
226
  customer_table = Table(
136
227
  id_=1,
137
228
  name="CUSTOMER",
138
229
  columns=[
139
- Column(id_=0, column_name="c_custkey", column_type="INT", values=["10", "20", "30"]),
230
+ Column(
231
+ id_=0,
232
+ column_name="c_custkey",
233
+ column_type="INT",
234
+ values=["10", "20", "30"],
235
+ ),
140
236
  Column(id_=1, column_name="c_name", column_type="STRING"),
141
237
  ],
142
238
  )
@@ -144,7 +240,10 @@ def test_infer_relationships_orders_customer() -> None:
144
240
  relationships = generate_model._infer_relationships(
145
241
  [
146
242
  (FQNParts(database="CAT", schema_name="SCH", table="ORDERS"), orders_table),
147
- (FQNParts(database="CAT", schema_name="SCH", table="CUSTOMER"), customer_table),
243
+ (
244
+ FQNParts(database="CAT", schema_name="SCH", table="CUSTOMER"),
245
+ customer_table,
246
+ ),
148
247
  ]
149
248
  )
150
249
 
@@ -162,23 +261,44 @@ def test_infer_relationships_lineitem_supplier() -> None:
162
261
  id_=0,
163
262
  name="LINEITEM",
164
263
  columns=[
165
- Column(id_=0, column_name="l_orderkey", column_type="INT", values=["1", "2", "3"]),
166
- Column(id_=1, column_name="l_suppkey", column_type="INT", values=["100", "101", "102"]),
264
+ Column(
265
+ id_=0,
266
+ column_name="l_orderkey",
267
+ column_type="INT",
268
+ values=["1", "2", "3"],
269
+ ),
270
+ Column(
271
+ id_=1,
272
+ column_name="l_suppkey",
273
+ column_type="INT",
274
+ values=["100", "101", "102"],
275
+ ),
167
276
  ],
168
277
  )
169
278
  supplier_table = Table(
170
279
  id_=1,
171
280
  name="SUPPLIER",
172
281
  columns=[
173
- Column(id_=0, column_name="s_suppkey", column_type="INT", values=["100", "101", "102"]),
282
+ Column(
283
+ id_=0,
284
+ column_name="s_suppkey",
285
+ column_type="INT",
286
+ values=["100", "101", "102"],
287
+ ),
174
288
  Column(id_=1, column_name="s_name", column_type="STRING"),
175
289
  ],
176
290
  )
177
291
 
178
292
  relationships = generate_model._infer_relationships(
179
293
  [
180
- (FQNParts(database="CAT", schema_name="SCH", table="LINEITEM"), lineitem_table),
181
- (FQNParts(database="CAT", schema_name="SCH", table="SUPPLIER"), supplier_table),
294
+ (
295
+ FQNParts(database="CAT", schema_name="SCH", table="LINEITEM"),
296
+ lineitem_table,
297
+ ),
298
+ (
299
+ FQNParts(database="CAT", schema_name="SCH", table="SUPPLIER"),
300
+ supplier_table,
301
+ ),
182
302
  ]
183
303
  )
184
304
 
@@ -196,7 +316,13 @@ def test_infer_relationships_handles_suffix_based_foreign_keys() -> None:
196
316
  id_=0,
197
317
  name="DIM_DATE",
198
318
  columns=[
199
- Column(id_=0, column_name="date_id", column_type="INT", values=["20240101", "20240102"], is_primary_key=True),
319
+ Column(
320
+ id_=0,
321
+ column_name="date_id",
322
+ column_type="INT",
323
+ values=["20240101", "20240102"],
324
+ is_primary_key=True,
325
+ ),
200
326
  Column(id_=1, column_name="date_value", column_type="DATE"),
201
327
  ],
202
328
  )
@@ -204,15 +330,25 @@ def test_infer_relationships_handles_suffix_based_foreign_keys() -> None:
204
330
  id_=1,
205
331
  name="FACT_SALES",
206
332
  columns=[
207
- Column(id_=0, column_name="order_id", column_type="INT", values=["10", "11"]),
208
- Column(id_=1, column_name="order_date_id", column_type="INT", values=["20240101", "20240102"]),
333
+ Column(
334
+ id_=0, column_name="order_id", column_type="INT", values=["10", "11"]
335
+ ),
336
+ Column(
337
+ id_=1,
338
+ column_name="order_date_id",
339
+ column_type="INT",
340
+ values=["20240101", "20240102"],
341
+ ),
209
342
  ],
210
343
  )
211
344
 
212
345
  relationships = generate_model._infer_relationships(
213
346
  [
214
347
  (FQNParts(database="CAT", schema_name="SCH", table="DIM_DATE"), dim_date),
215
- (FQNParts(database="CAT", schema_name="SCH", table="FACT_SALES"), fact_sales),
348
+ (
349
+ FQNParts(database="CAT", schema_name="SCH", table="FACT_SALES"),
350
+ fact_sales,
351
+ ),
216
352
  ]
217
353
  )
218
354
 
@@ -1,7 +1,7 @@
1
1
  import pytest
2
2
 
3
- from semantic_model_generator.data_processing.data_types import FQNParts
4
3
  from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
4
+ from semantic_model_generator.data_processing.data_types import FQNParts
5
5
 
6
6
 
7
7
  def test_fqn_creation():