clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +21 -21
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
- semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
- semantic_model_generator/data_processing/cte_utils.py +1 -1
- semantic_model_generator/generate_model.py +588 -224
- semantic_model_generator/llm/dashscope_client.py +4 -2
- semantic_model_generator/llm/enrichment.py +144 -57
- semantic_model_generator/llm/progress_tracker.py +16 -15
- semantic_model_generator/relationships/discovery.py +1 -6
- semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
- semantic_model_generator/tests/cte_utils_test.py +1 -1
- semantic_model_generator/tests/generate_model_classification_test.py +12 -2
- semantic_model_generator/tests/llm_enrichment_test.py +152 -46
- semantic_model_generator/tests/relationship_discovery_test.py +6 -3
- semantic_model_generator/tests/relationships_filters_test.py +166 -30
- semantic_model_generator/tests/utils_test.py +1 -1
- semantic_model_generator/validate/keywords.py +453 -53
- semantic_model_generator/validate/schema.py +4 -2
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
@@ -1,6 +1,5 @@
|
|
1
1
|
import json
|
2
2
|
|
3
|
-
from semantic_model_generator import generate_model
|
4
3
|
from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
|
5
4
|
from semantic_model_generator.llm.dashscope_client import DashscopeResponse
|
6
5
|
from semantic_model_generator.llm.enrichment import enrich_semantic_model
|
@@ -16,9 +15,16 @@ class _FakeDashscopeClient:
|
|
16
15
|
self._index = 0
|
17
16
|
|
18
17
|
def chat_completion(self, messages): # type: ignore[no-untyped-def]
|
19
|
-
payload =
|
18
|
+
payload = (
|
19
|
+
self._payloads[self._index]
|
20
|
+
if self._index < len(self._payloads)
|
21
|
+
else self._payloads[-1]
|
22
|
+
)
|
20
23
|
self._index += 1
|
21
|
-
return DashscopeResponse(
|
24
|
+
return DashscopeResponse(
|
25
|
+
content=json.dumps(payload, ensure_ascii=False),
|
26
|
+
request_id=f"test_{self._index}",
|
27
|
+
)
|
22
28
|
|
23
29
|
|
24
30
|
def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
@@ -26,15 +32,27 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
|
26
32
|
id_=0,
|
27
33
|
name="orders",
|
28
34
|
columns=[
|
29
|
-
Column(
|
30
|
-
|
35
|
+
Column(
|
36
|
+
id_=0,
|
37
|
+
column_name="order_status",
|
38
|
+
column_type="STRING",
|
39
|
+
values=["OPEN", "CLOSED"],
|
40
|
+
),
|
41
|
+
Column(
|
42
|
+
id_=1,
|
43
|
+
column_name="total_amount",
|
44
|
+
column_type="NUMBER",
|
45
|
+
values=["12.5", "18.3"],
|
46
|
+
),
|
31
47
|
],
|
32
48
|
)
|
33
49
|
|
34
50
|
table_proto = semantic_model_pb2.Table(
|
35
51
|
name="ORDERS",
|
36
52
|
description=" ",
|
37
|
-
base_table=semantic_model_pb2.FullyQualifiedTable(
|
53
|
+
base_table=semantic_model_pb2.FullyQualifiedTable(
|
54
|
+
database="SALES", schema="PUBLIC", table="ORDERS"
|
55
|
+
),
|
38
56
|
dimensions=[
|
39
57
|
semantic_model_pb2.Dimension(
|
40
58
|
name="order_status",
|
@@ -98,16 +116,18 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
|
98
116
|
}
|
99
117
|
],
|
100
118
|
"filters": [
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
119
|
+
{
|
120
|
+
"name": "order_status_include_values",
|
121
|
+
"description": "Limit the result set to a sample of order statuses.",
|
122
|
+
"synonyms": ["Order status filter"],
|
123
|
+
}
|
106
124
|
],
|
107
125
|
"model_description": "Semantic model for customer orders and related metrics.",
|
108
126
|
}
|
109
127
|
|
110
|
-
client = _FakeDashscopeClient(
|
128
|
+
client = _FakeDashscopeClient(
|
129
|
+
[fake_response, {"model_metrics": []}, {"verified_queries": []}]
|
130
|
+
)
|
111
131
|
enrich_semantic_model(
|
112
132
|
model,
|
113
133
|
[(FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_table)],
|
@@ -116,7 +136,10 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
|
116
136
|
)
|
117
137
|
|
118
138
|
table = model.tables[0]
|
119
|
-
assert
|
139
|
+
assert (
|
140
|
+
table.description
|
141
|
+
== "Orders fact table that records order status and total amount."
|
142
|
+
)
|
120
143
|
|
121
144
|
dimension = next(dim for dim in table.dimensions if dim.expr == "order_status")
|
122
145
|
assert dimension.description == "Current execution status for each order."
|
@@ -126,8 +149,12 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
|
126
149
|
assert fact.description == "Order total including taxes."
|
127
150
|
assert "Order total" in list(fact.synonyms)
|
128
151
|
|
129
|
-
filter_obj = next(
|
130
|
-
|
152
|
+
filter_obj = next(
|
153
|
+
flt for flt in table.filters if flt.name == "order_status_include_values"
|
154
|
+
)
|
155
|
+
assert (
|
156
|
+
filter_obj.description == "Limit the result set to a sample of order statuses."
|
157
|
+
)
|
131
158
|
assert "Order status filter" in list(filter_obj.synonyms)
|
132
159
|
|
133
160
|
assert len(table.metrics) == 1
|
@@ -135,10 +162,15 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
|
|
135
162
|
assert metric.name.startswith("gmv")
|
136
163
|
assert metric.expr == "SUM(total_amount)"
|
137
164
|
assert "GMV" in list(metric.synonyms)
|
138
|
-
assert
|
165
|
+
assert (
|
166
|
+
metric.description
|
167
|
+
== "Based on total_amount and used as gross merchandise value."
|
168
|
+
)
|
139
169
|
|
140
170
|
assert model.custom_instructions == ""
|
141
|
-
assert
|
171
|
+
assert (
|
172
|
+
model.description == "Semantic model for customer orders and related metrics."
|
173
|
+
)
|
142
174
|
|
143
175
|
|
144
176
|
class _FakeSession:
|
@@ -160,8 +192,15 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
|
|
160
192
|
id_=0,
|
161
193
|
name="orders",
|
162
194
|
columns=[
|
163
|
-
Column(
|
164
|
-
|
195
|
+
Column(
|
196
|
+
id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
|
197
|
+
),
|
198
|
+
Column(
|
199
|
+
id_=1,
|
200
|
+
column_name="total_amount",
|
201
|
+
column_type="NUMBER",
|
202
|
+
values=["10", "20"],
|
203
|
+
),
|
165
204
|
],
|
166
205
|
)
|
167
206
|
|
@@ -169,15 +208,21 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
|
|
169
208
|
id_=1,
|
170
209
|
name="payments",
|
171
210
|
columns=[
|
172
|
-
Column(
|
173
|
-
|
211
|
+
Column(
|
212
|
+
id_=0, column_name="payment_id", column_type="NUMBER", values=["1", "2"]
|
213
|
+
),
|
214
|
+
Column(
|
215
|
+
id_=1, column_name="amount", column_type="NUMBER", values=["5", "15"]
|
216
|
+
),
|
174
217
|
],
|
175
218
|
)
|
176
219
|
|
177
220
|
orders_proto = semantic_model_pb2.Table(
|
178
221
|
name="ORDERS",
|
179
222
|
description=" ",
|
180
|
-
base_table=semantic_model_pb2.FullyQualifiedTable(
|
223
|
+
base_table=semantic_model_pb2.FullyQualifiedTable(
|
224
|
+
database="SALES", schema="PUBLIC", table="ORDERS"
|
225
|
+
),
|
181
226
|
facts=[
|
182
227
|
semantic_model_pb2.Fact(
|
183
228
|
name="total_amount",
|
@@ -191,7 +236,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
|
|
191
236
|
payments_proto = semantic_model_pb2.Table(
|
192
237
|
name="PAYMENTS",
|
193
238
|
description=" ",
|
194
|
-
base_table=semantic_model_pb2.FullyQualifiedTable(
|
239
|
+
base_table=semantic_model_pb2.FullyQualifiedTable(
|
240
|
+
database="SALES", schema="PUBLIC", table="PAYMENTS"
|
241
|
+
),
|
195
242
|
facts=[
|
196
243
|
semantic_model_pb2.Fact(
|
197
244
|
name="amount",
|
@@ -202,7 +249,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
|
|
202
249
|
],
|
203
250
|
)
|
204
251
|
|
205
|
-
model = semantic_model_pb2.SemanticModel(
|
252
|
+
model = semantic_model_pb2.SemanticModel(
|
253
|
+
name="Orders Model", tables=[orders_proto, payments_proto]
|
254
|
+
)
|
206
255
|
|
207
256
|
table_payload = {
|
208
257
|
"table_description": "Orders fact table with totals.",
|
@@ -253,22 +302,32 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
|
|
253
302
|
}
|
254
303
|
|
255
304
|
# Model description response for when _summarize_model_description is called
|
256
|
-
model_description_payload =
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
305
|
+
model_description_payload = (
|
306
|
+
"This is an orders model for tracking sales and payments."
|
307
|
+
)
|
308
|
+
|
309
|
+
client = _FakeDashscopeClient(
|
310
|
+
[
|
311
|
+
table_payload,
|
312
|
+
table_payload_payments,
|
313
|
+
model_description_payload,
|
314
|
+
model_metrics_payload,
|
315
|
+
verified_queries_payload,
|
316
|
+
]
|
317
|
+
)
|
265
318
|
session = _FakeSession()
|
266
319
|
|
267
320
|
enrich_semantic_model(
|
268
321
|
model,
|
269
322
|
[
|
270
|
-
(
|
271
|
-
|
323
|
+
(
|
324
|
+
FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
|
325
|
+
raw_orders,
|
326
|
+
),
|
327
|
+
(
|
328
|
+
FQNParts(database="SALES", schema_name="PUBLIC", table="PAYMENTS"),
|
329
|
+
raw_payments,
|
330
|
+
),
|
272
331
|
],
|
273
332
|
client,
|
274
333
|
placeholder=" ",
|
@@ -297,15 +356,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
|
|
297
356
|
id_=0,
|
298
357
|
name="orders",
|
299
358
|
columns=[
|
300
|
-
Column(
|
301
|
-
|
359
|
+
Column(
|
360
|
+
id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
|
361
|
+
),
|
362
|
+
Column(
|
363
|
+
id_=1,
|
364
|
+
column_name="total_amount",
|
365
|
+
column_type="NUMBER",
|
366
|
+
values=["10", "20"],
|
367
|
+
),
|
302
368
|
],
|
303
369
|
)
|
304
370
|
|
305
371
|
orders_proto = semantic_model_pb2.Table(
|
306
372
|
name="ORDERS",
|
307
373
|
description=" ",
|
308
|
-
base_table=semantic_model_pb2.FullyQualifiedTable(
|
374
|
+
base_table=semantic_model_pb2.FullyQualifiedTable(
|
375
|
+
database="SALES", schema="PUBLIC", table="ORDERS"
|
376
|
+
),
|
309
377
|
facts=[
|
310
378
|
semantic_model_pb2.Fact(
|
311
379
|
name="total_amount",
|
@@ -341,12 +409,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
|
|
341
409
|
# Model description response for when _summarize_model_description is called
|
342
410
|
model_description_payload = "This is an orders model for tracking order metrics."
|
343
411
|
|
344
|
-
client = _FakeDashscopeClient(
|
412
|
+
client = _FakeDashscopeClient(
|
413
|
+
[
|
414
|
+
table_payload,
|
415
|
+
model_description_payload,
|
416
|
+
model_metrics_payload,
|
417
|
+
verified_queries_payload,
|
418
|
+
]
|
419
|
+
)
|
345
420
|
session = _FakeSession()
|
346
421
|
|
347
422
|
enrich_semantic_model(
|
348
423
|
model,
|
349
|
-
[
|
424
|
+
[
|
425
|
+
(
|
426
|
+
FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
|
427
|
+
raw_orders,
|
428
|
+
)
|
429
|
+
],
|
350
430
|
client,
|
351
431
|
placeholder=" ",
|
352
432
|
session=session,
|
@@ -365,15 +445,27 @@ def test_model_metrics_skipped_with_no_facts() -> None:
|
|
365
445
|
id_=0,
|
366
446
|
name="customers",
|
367
447
|
columns=[
|
368
|
-
Column(
|
369
|
-
|
448
|
+
Column(
|
449
|
+
id_=0,
|
450
|
+
column_name="customer_id",
|
451
|
+
column_type="NUMBER",
|
452
|
+
values=["1", "2"],
|
453
|
+
),
|
454
|
+
Column(
|
455
|
+
id_=1,
|
456
|
+
column_name="customer_name",
|
457
|
+
column_type="STRING",
|
458
|
+
values=["Alice", "Bob"],
|
459
|
+
),
|
370
460
|
],
|
371
461
|
)
|
372
462
|
|
373
463
|
customers_proto = semantic_model_pb2.Table(
|
374
464
|
name="CUSTOMERS",
|
375
465
|
description=" ",
|
376
|
-
base_table=semantic_model_pb2.FullyQualifiedTable(
|
466
|
+
base_table=semantic_model_pb2.FullyQualifiedTable(
|
467
|
+
database="SALES", schema="PUBLIC", table="CUSTOMERS"
|
468
|
+
),
|
377
469
|
dimensions=[
|
378
470
|
semantic_model_pb2.Dimension(
|
379
471
|
name="customer_name",
|
@@ -384,7 +476,9 @@ def test_model_metrics_skipped_with_no_facts() -> None:
|
|
384
476
|
],
|
385
477
|
)
|
386
478
|
|
387
|
-
model = semantic_model_pb2.SemanticModel(
|
479
|
+
model = semantic_model_pb2.SemanticModel(
|
480
|
+
name="Customer Model", tables=[customers_proto]
|
481
|
+
)
|
388
482
|
|
389
483
|
table_payload = {
|
390
484
|
"table_description": "Customer dimension table.",
|
@@ -408,12 +502,24 @@ def test_model_metrics_skipped_with_no_facts() -> None:
|
|
408
502
|
# Model description response for when _summarize_model_description is called
|
409
503
|
model_description_payload = "This is a customer dimension model."
|
410
504
|
|
411
|
-
client = _FakeDashscopeClient(
|
505
|
+
client = _FakeDashscopeClient(
|
506
|
+
[
|
507
|
+
table_payload,
|
508
|
+
model_description_payload,
|
509
|
+
model_metrics_payload,
|
510
|
+
verified_queries_payload,
|
511
|
+
]
|
512
|
+
)
|
412
513
|
session = _FakeSession()
|
413
514
|
|
414
515
|
enrich_semantic_model(
|
415
516
|
model,
|
416
|
-
[
|
517
|
+
[
|
518
|
+
(
|
519
|
+
FQNParts(database="SALES", schema_name="PUBLIC", table="CUSTOMERS"),
|
520
|
+
raw_customers,
|
521
|
+
)
|
522
|
+
],
|
417
523
|
client,
|
418
524
|
placeholder=" ",
|
419
525
|
session=session,
|
@@ -33,11 +33,14 @@ class _FakeSession:
|
|
33
33
|
}
|
34
34
|
)
|
35
35
|
)
|
36
|
+
if "INFORMATION_SCHEMA.COLUMNS" in normalized:
|
37
|
+
return _FakeResult(self.columns_df)
|
36
38
|
if "FROM INFORMATION_SCHEMA.TABLES" in normalized:
|
37
|
-
data = {
|
39
|
+
data = {
|
40
|
+
"TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables),
|
41
|
+
"TABLE_NAME": self.tables,
|
42
|
+
}
|
38
43
|
return _FakeResult(pd.DataFrame(data))
|
39
|
-
if "FROM INFORMATION_SCHEMA.COLUMNS" in normalized:
|
40
|
-
return _FakeResult(self.columns_df)
|
41
44
|
if "SELECT DISTINCT" in normalized:
|
42
45
|
# Return single column of sample values
|
43
46
|
return _FakeResult(pd.DataFrame({"VALUE": [1, 2, 3]}))
|