clickzetta-semantic-model-generator 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/METADATA +5 -5
  2. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/RECORD +21 -21
  3. semantic_model_generator/clickzetta_utils/clickzetta_connector.py +91 -33
  4. semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. semantic_model_generator/data_processing/cte_utils.py +1 -1
  6. semantic_model_generator/generate_model.py +588 -224
  7. semantic_model_generator/llm/dashscope_client.py +4 -2
  8. semantic_model_generator/llm/enrichment.py +144 -57
  9. semantic_model_generator/llm/progress_tracker.py +16 -15
  10. semantic_model_generator/relationships/discovery.py +1 -6
  11. semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  12. semantic_model_generator/tests/cte_utils_test.py +1 -1
  13. semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  14. semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  15. semantic_model_generator/tests/relationship_discovery_test.py +6 -3
  16. semantic_model_generator/tests/relationships_filters_test.py +166 -30
  17. semantic_model_generator/tests/utils_test.py +1 -1
  18. semantic_model_generator/validate/keywords.py +453 -53
  19. semantic_model_generator/validate/schema.py +4 -2
  20. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/LICENSE +0 -0
  21. {clickzetta_semantic_model_generator-1.0.2.dist-info → clickzetta_semantic_model_generator-1.0.3.dist-info}/WHEEL +0 -0
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from semantic_model_generator import generate_model
4
3
  from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
5
4
  from semantic_model_generator.llm.dashscope_client import DashscopeResponse
6
5
  from semantic_model_generator.llm.enrichment import enrich_semantic_model
@@ -16,9 +15,16 @@ class _FakeDashscopeClient:
16
15
  self._index = 0
17
16
 
18
17
  def chat_completion(self, messages): # type: ignore[no-untyped-def]
19
- payload = self._payloads[self._index] if self._index < len(self._payloads) else self._payloads[-1]
18
+ payload = (
19
+ self._payloads[self._index]
20
+ if self._index < len(self._payloads)
21
+ else self._payloads[-1]
22
+ )
20
23
  self._index += 1
21
- return DashscopeResponse(content=json.dumps(payload, ensure_ascii=False), request_id=f"test_{self._index}")
24
+ return DashscopeResponse(
25
+ content=json.dumps(payload, ensure_ascii=False),
26
+ request_id=f"test_{self._index}",
27
+ )
22
28
 
23
29
 
24
30
  def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
@@ -26,15 +32,27 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
26
32
  id_=0,
27
33
  name="orders",
28
34
  columns=[
29
- Column(id_=0, column_name="order_status", column_type="STRING", values=["OPEN", "CLOSED"]),
30
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["12.5", "18.3"]),
35
+ Column(
36
+ id_=0,
37
+ column_name="order_status",
38
+ column_type="STRING",
39
+ values=["OPEN", "CLOSED"],
40
+ ),
41
+ Column(
42
+ id_=1,
43
+ column_name="total_amount",
44
+ column_type="NUMBER",
45
+ values=["12.5", "18.3"],
46
+ ),
31
47
  ],
32
48
  )
33
49
 
34
50
  table_proto = semantic_model_pb2.Table(
35
51
  name="ORDERS",
36
52
  description=" ",
37
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
53
+ base_table=semantic_model_pb2.FullyQualifiedTable(
54
+ database="SALES", schema="PUBLIC", table="ORDERS"
55
+ ),
38
56
  dimensions=[
39
57
  semantic_model_pb2.Dimension(
40
58
  name="order_status",
@@ -98,16 +116,18 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
98
116
  }
99
117
  ],
100
118
  "filters": [
101
- {
102
- "name": "order_status_include_values",
103
- "description": "Limit the result set to a sample of order statuses.",
104
- "synonyms": ["Order status filter"],
105
- }
119
+ {
120
+ "name": "order_status_include_values",
121
+ "description": "Limit the result set to a sample of order statuses.",
122
+ "synonyms": ["Order status filter"],
123
+ }
106
124
  ],
107
125
  "model_description": "Semantic model for customer orders and related metrics.",
108
126
  }
109
127
 
110
- client = _FakeDashscopeClient([fake_response, {"model_metrics": []}, {"verified_queries": []}])
128
+ client = _FakeDashscopeClient(
129
+ [fake_response, {"model_metrics": []}, {"verified_queries": []}]
130
+ )
111
131
  enrich_semantic_model(
112
132
  model,
113
133
  [(FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_table)],
@@ -116,7 +136,10 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
116
136
  )
117
137
 
118
138
  table = model.tables[0]
119
- assert table.description == "Orders fact table that records order status and total amount."
139
+ assert (
140
+ table.description
141
+ == "Orders fact table that records order status and total amount."
142
+ )
120
143
 
121
144
  dimension = next(dim for dim in table.dimensions if dim.expr == "order_status")
122
145
  assert dimension.description == "Current execution status for each order."
@@ -126,8 +149,12 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
126
149
  assert fact.description == "Order total including taxes."
127
150
  assert "Order total" in list(fact.synonyms)
128
151
 
129
- filter_obj = next(flt for flt in table.filters if flt.name == "order_status_include_values")
130
- assert filter_obj.description == "Limit the result set to a sample of order statuses."
152
+ filter_obj = next(
153
+ flt for flt in table.filters if flt.name == "order_status_include_values"
154
+ )
155
+ assert (
156
+ filter_obj.description == "Limit the result set to a sample of order statuses."
157
+ )
131
158
  assert "Order status filter" in list(filter_obj.synonyms)
132
159
 
133
160
  assert len(table.metrics) == 1
@@ -135,10 +162,15 @@ def test_enrich_semantic_model_populates_descriptions_and_synonyms() -> None:
135
162
  assert metric.name.startswith("gmv")
136
163
  assert metric.expr == "SUM(total_amount)"
137
164
  assert "GMV" in list(metric.synonyms)
138
- assert metric.description == "Based on total_amount and used as gross merchandise value."
165
+ assert (
166
+ metric.description
167
+ == "Based on total_amount and used as gross merchandise value."
168
+ )
139
169
 
140
170
  assert model.custom_instructions == ""
141
- assert model.description == "Semantic model for customer orders and related metrics."
171
+ assert (
172
+ model.description == "Semantic model for customer orders and related metrics."
173
+ )
142
174
 
143
175
 
144
176
  class _FakeSession:
@@ -160,8 +192,15 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
160
192
  id_=0,
161
193
  name="orders",
162
194
  columns=[
163
- Column(id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]),
164
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["10", "20"]),
195
+ Column(
196
+ id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
197
+ ),
198
+ Column(
199
+ id_=1,
200
+ column_name="total_amount",
201
+ column_type="NUMBER",
202
+ values=["10", "20"],
203
+ ),
165
204
  ],
166
205
  )
167
206
 
@@ -169,15 +208,21 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
169
208
  id_=1,
170
209
  name="payments",
171
210
  columns=[
172
- Column(id_=0, column_name="payment_id", column_type="NUMBER", values=["1", "2"]),
173
- Column(id_=1, column_name="amount", column_type="NUMBER", values=["5", "15"]),
211
+ Column(
212
+ id_=0, column_name="payment_id", column_type="NUMBER", values=["1", "2"]
213
+ ),
214
+ Column(
215
+ id_=1, column_name="amount", column_type="NUMBER", values=["5", "15"]
216
+ ),
174
217
  ],
175
218
  )
176
219
 
177
220
  orders_proto = semantic_model_pb2.Table(
178
221
  name="ORDERS",
179
222
  description=" ",
180
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
223
+ base_table=semantic_model_pb2.FullyQualifiedTable(
224
+ database="SALES", schema="PUBLIC", table="ORDERS"
225
+ ),
181
226
  facts=[
182
227
  semantic_model_pb2.Fact(
183
228
  name="total_amount",
@@ -191,7 +236,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
191
236
  payments_proto = semantic_model_pb2.Table(
192
237
  name="PAYMENTS",
193
238
  description=" ",
194
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="PAYMENTS"),
239
+ base_table=semantic_model_pb2.FullyQualifiedTable(
240
+ database="SALES", schema="PUBLIC", table="PAYMENTS"
241
+ ),
195
242
  facts=[
196
243
  semantic_model_pb2.Fact(
197
244
  name="amount",
@@ -202,7 +249,9 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
202
249
  ],
203
250
  )
204
251
 
205
- model = semantic_model_pb2.SemanticModel(name="Orders Model", tables=[orders_proto, payments_proto])
252
+ model = semantic_model_pb2.SemanticModel(
253
+ name="Orders Model", tables=[orders_proto, payments_proto]
254
+ )
206
255
 
207
256
  table_payload = {
208
257
  "table_description": "Orders fact table with totals.",
@@ -253,22 +302,32 @@ def test_enrich_semantic_model_generates_model_metrics_and_verified_queries() ->
253
302
  }
254
303
 
255
304
  # Model description response for when _summarize_model_description is called
256
- model_description_payload = "This is an orders model for tracking sales and payments."
257
-
258
- client = _FakeDashscopeClient([
259
- table_payload,
260
- table_payload_payments,
261
- model_description_payload,
262
- model_metrics_payload,
263
- verified_queries_payload,
264
- ])
305
+ model_description_payload = (
306
+ "This is an orders model for tracking sales and payments."
307
+ )
308
+
309
+ client = _FakeDashscopeClient(
310
+ [
311
+ table_payload,
312
+ table_payload_payments,
313
+ model_description_payload,
314
+ model_metrics_payload,
315
+ verified_queries_payload,
316
+ ]
317
+ )
265
318
  session = _FakeSession()
266
319
 
267
320
  enrich_semantic_model(
268
321
  model,
269
322
  [
270
- (FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_orders),
271
- (FQNParts(database="SALES", schema_name="PUBLIC", table="PAYMENTS"), raw_payments),
323
+ (
324
+ FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
325
+ raw_orders,
326
+ ),
327
+ (
328
+ FQNParts(database="SALES", schema_name="PUBLIC", table="PAYMENTS"),
329
+ raw_payments,
330
+ ),
272
331
  ],
273
332
  client,
274
333
  placeholder=" ",
@@ -297,15 +356,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
297
356
  id_=0,
298
357
  name="orders",
299
358
  columns=[
300
- Column(id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]),
301
- Column(id_=1, column_name="total_amount", column_type="NUMBER", values=["10", "20"]),
359
+ Column(
360
+ id_=0, column_name="order_id", column_type="NUMBER", values=["1", "2"]
361
+ ),
362
+ Column(
363
+ id_=1,
364
+ column_name="total_amount",
365
+ column_type="NUMBER",
366
+ values=["10", "20"],
367
+ ),
302
368
  ],
303
369
  )
304
370
 
305
371
  orders_proto = semantic_model_pb2.Table(
306
372
  name="ORDERS",
307
373
  description=" ",
308
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="ORDERS"),
374
+ base_table=semantic_model_pb2.FullyQualifiedTable(
375
+ database="SALES", schema="PUBLIC", table="ORDERS"
376
+ ),
309
377
  facts=[
310
378
  semantic_model_pb2.Fact(
311
379
  name="total_amount",
@@ -341,12 +409,24 @@ def test_model_metrics_generated_with_single_fact_table() -> None:
341
409
  # Model description response for when _summarize_model_description is called
342
410
  model_description_payload = "This is an orders model for tracking order metrics."
343
411
 
344
- client = _FakeDashscopeClient([table_payload, model_description_payload, model_metrics_payload, verified_queries_payload])
412
+ client = _FakeDashscopeClient(
413
+ [
414
+ table_payload,
415
+ model_description_payload,
416
+ model_metrics_payload,
417
+ verified_queries_payload,
418
+ ]
419
+ )
345
420
  session = _FakeSession()
346
421
 
347
422
  enrich_semantic_model(
348
423
  model,
349
- [(FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"), raw_orders)],
424
+ [
425
+ (
426
+ FQNParts(database="SALES", schema_name="PUBLIC", table="ORDERS"),
427
+ raw_orders,
428
+ )
429
+ ],
350
430
  client,
351
431
  placeholder=" ",
352
432
  session=session,
@@ -365,15 +445,27 @@ def test_model_metrics_skipped_with_no_facts() -> None:
365
445
  id_=0,
366
446
  name="customers",
367
447
  columns=[
368
- Column(id_=0, column_name="customer_id", column_type="NUMBER", values=["1", "2"]),
369
- Column(id_=1, column_name="customer_name", column_type="STRING", values=["Alice", "Bob"]),
448
+ Column(
449
+ id_=0,
450
+ column_name="customer_id",
451
+ column_type="NUMBER",
452
+ values=["1", "2"],
453
+ ),
454
+ Column(
455
+ id_=1,
456
+ column_name="customer_name",
457
+ column_type="STRING",
458
+ values=["Alice", "Bob"],
459
+ ),
370
460
  ],
371
461
  )
372
462
 
373
463
  customers_proto = semantic_model_pb2.Table(
374
464
  name="CUSTOMERS",
375
465
  description=" ",
376
- base_table=semantic_model_pb2.FullyQualifiedTable(database="SALES", schema="PUBLIC", table="CUSTOMERS"),
466
+ base_table=semantic_model_pb2.FullyQualifiedTable(
467
+ database="SALES", schema="PUBLIC", table="CUSTOMERS"
468
+ ),
377
469
  dimensions=[
378
470
  semantic_model_pb2.Dimension(
379
471
  name="customer_name",
@@ -384,7 +476,9 @@ def test_model_metrics_skipped_with_no_facts() -> None:
384
476
  ],
385
477
  )
386
478
 
387
- model = semantic_model_pb2.SemanticModel(name="Customer Model", tables=[customers_proto])
479
+ model = semantic_model_pb2.SemanticModel(
480
+ name="Customer Model", tables=[customers_proto]
481
+ )
388
482
 
389
483
  table_payload = {
390
484
  "table_description": "Customer dimension table.",
@@ -408,12 +502,24 @@ def test_model_metrics_skipped_with_no_facts() -> None:
408
502
  # Model description response for when _summarize_model_description is called
409
503
  model_description_payload = "This is a customer dimension model."
410
504
 
411
- client = _FakeDashscopeClient([table_payload, model_description_payload, model_metrics_payload, verified_queries_payload])
505
+ client = _FakeDashscopeClient(
506
+ [
507
+ table_payload,
508
+ model_description_payload,
509
+ model_metrics_payload,
510
+ verified_queries_payload,
511
+ ]
512
+ )
412
513
  session = _FakeSession()
413
514
 
414
515
  enrich_semantic_model(
415
516
  model,
416
- [(FQNParts(database="SALES", schema_name="PUBLIC", table="CUSTOMERS"), raw_customers)],
517
+ [
518
+ (
519
+ FQNParts(database="SALES", schema_name="PUBLIC", table="CUSTOMERS"),
520
+ raw_customers,
521
+ )
522
+ ],
417
523
  client,
418
524
  placeholder=" ",
419
525
  session=session,
@@ -33,11 +33,14 @@ class _FakeSession:
33
33
  }
34
34
  )
35
35
  )
36
+ if "INFORMATION_SCHEMA.COLUMNS" in normalized:
37
+ return _FakeResult(self.columns_df)
36
38
  if "FROM INFORMATION_SCHEMA.TABLES" in normalized:
37
- data = {"TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables), "TABLE_NAME": self.tables}
39
+ data = {
40
+ "TABLE_SCHEMA": ["TPCH_100G"] * len(self.tables),
41
+ "TABLE_NAME": self.tables,
42
+ }
38
43
  return _FakeResult(pd.DataFrame(data))
39
- if "FROM INFORMATION_SCHEMA.COLUMNS" in normalized:
40
- return _FakeResult(self.columns_df)
41
44
  if "SELECT DISTINCT" in normalized:
42
45
  # Return single column of sample values
43
46
  return _FakeResult(pd.DataFrame({"VALUE": [1, 2, 3]}))