clickzetta-semantic-model-generator 1.0.2__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/PKG-INFO +5 -5
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/pyproject.toml +5 -5
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +100 -48
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/utils.py +44 -2
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/cte_utils.py +44 -14
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/generate_model.py +711 -239
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/dashscope_client.py +4 -2
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/enrichment.py +144 -57
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/progress_tracker.py +16 -15
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/relationships/__init__.py +2 -0
- clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/relationships/discovery.py +372 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/cte_utils_test.py +15 -14
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/generate_model_classification_test.py +12 -2
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/llm_enrichment_test.py +152 -46
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/relationship_discovery_test.py +70 -3
- clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/tests/relationships_filters_test.py +361 -0
- clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/validate/keywords.py +457 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate/schema.py +4 -2
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/relationships/discovery.py +0 -207
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/tests/relationships_filters_test.py +0 -225
- clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/validate/keywords.py +0 -57
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/README.md +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/data_types.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/proto_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/output_models/.keep +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model.proto +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/utils_test.py +1 -1
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/validate_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate/context_length.py +0 -0
- {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate_model.py +0 -0
{clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: clickzetta-semantic-model-generator
|
3
|
-
Version: 1.0.2
|
3
|
+
Version: 1.0.4
|
4
4
|
Summary: Curate a Semantic Model for ClickZetta Lakehouse
|
5
5
|
License: Apache Software License; BSD License
|
6
6
|
Author: qililiang
|
@@ -13,12 +13,12 @@ Classifier: Programming Language :: Python :: 3.10
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
14
14
|
Provides-Extra: looker
|
15
15
|
Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
|
16
|
-
Requires-Dist: clickzetta-connector-python (==0.8.92)
|
17
|
-
Requires-Dist: clickzetta-zettapark-python (==0.1.3)
|
16
|
+
Requires-Dist: clickzetta-connector-python (>=0.8.92)
|
17
|
+
Requires-Dist: clickzetta-zettapark-python (>=0.1.3)
|
18
18
|
Requires-Dist: dashscope (>=1.22.2,<2.0.0)
|
19
19
|
Requires-Dist: loguru (>=0.7.2,<0.8.0)
|
20
20
|
Requires-Dist: looker-sdk (>=24.14.0,<25.0.0) ; extra == "looker"
|
21
|
-
Requires-Dist: numpy (>=1.26.4,<
|
21
|
+
Requires-Dist: numpy (>=1.26.4,<3.0.0)
|
22
22
|
Requires-Dist: pandas (>=2.0.1,<3.0.0)
|
23
23
|
Requires-Dist: protobuf (==5.26.1)
|
24
24
|
Requires-Dist: pyarrow (==14.0.2)
|
@@ -31,7 +31,7 @@ Requires-Dist: streamlit (==1.36.0)
|
|
31
31
|
Requires-Dist: streamlit-extras (==0.4.0)
|
32
32
|
Requires-Dist: strictyaml (>=1.7.3,<2.0.0)
|
33
33
|
Requires-Dist: tqdm (>=4.66.5,<5.0.0)
|
34
|
-
Requires-Dist: urllib3 (>=1.26.19,<
|
34
|
+
Requires-Dist: urllib3 (>=1.26.19,<3.0.0)
|
35
35
|
Description-Content-Type: text/markdown
|
36
36
|
|
37
37
|
# semantic-model-generator
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "clickzetta-semantic-model-generator"
|
3
|
-
version = "1.0.2"
|
3
|
+
version = "1.0.4"
|
4
4
|
description = "Curate a Semantic Model for ClickZetta Lakehouse"
|
5
5
|
authors = ["qililiang <qililiang@clickzetta.com>"]
|
6
6
|
license = "Apache Software License; BSD License"
|
@@ -11,7 +11,7 @@ packages = [{include = "semantic_model_generator"}]
|
|
11
11
|
python = ">=3.9,<3.9.7 || >3.9.7,<3.12"
|
12
12
|
pandas = "^2.0.1"
|
13
13
|
loguru = "^0.7.2"
|
14
|
-
clickzetta-connector-python = "0.8.92"
|
14
|
+
clickzetta-connector-python = ">=0.8.92"
|
15
15
|
protobuf = "5.26.1"
|
16
16
|
pydantic = ">=2.8.2,<3.0.0"
|
17
17
|
PyYAML = "^6.0.1"
|
@@ -22,11 +22,11 @@ sqlglot = "25.10.0"
|
|
22
22
|
strictyaml = "^1.7.3"
|
23
23
|
streamlit = "1.36.0"
|
24
24
|
streamlit-extras = "0.4.0"
|
25
|
-
numpy = "
|
25
|
+
numpy = ">=1.26.4,<3.0.0"
|
26
26
|
python-dotenv = "^1.0.1"
|
27
|
-
urllib3 = "
|
27
|
+
urllib3 = ">=1.26.19,<3.0.0"
|
28
28
|
requests = "^2.32.3"
|
29
|
-
clickzetta-zettapark-python = "0.1.3"
|
29
|
+
clickzetta-zettapark-python = ">=0.1.3"
|
30
30
|
dashscope = "^1.22.2"
|
31
31
|
|
32
32
|
# Optional dependencies for functionality such as partner semantic model support.
|
@@ -4,7 +4,6 @@ import concurrent.futures
|
|
4
4
|
import re
|
5
5
|
from collections import defaultdict
|
6
6
|
from contextlib import contextmanager
|
7
|
-
from types import SimpleNamespace
|
8
7
|
from typing import Any, Dict, Generator, List, Optional, TypeVar, Union
|
9
8
|
|
10
9
|
import pandas as pd
|
@@ -13,8 +12,10 @@ from loguru import logger
|
|
13
12
|
|
14
13
|
from semantic_model_generator.clickzetta_utils import env_vars
|
15
14
|
from semantic_model_generator.clickzetta_utils.utils import (
|
16
|
-
clickzetta_connection,
|
17
15
|
create_session,
|
16
|
+
join_quoted_identifiers,
|
17
|
+
normalize_identifier,
|
18
|
+
quote_identifier,
|
18
19
|
)
|
19
20
|
from semantic_model_generator.data_processing.data_types import Column, Table
|
20
21
|
|
@@ -115,7 +116,9 @@ class ClickzettaCursor:
|
|
115
116
|
def execute(self, query: str) -> "ClickzettaCursor":
|
116
117
|
self._df = _execute_query_to_pandas(self._session, query)
|
117
118
|
columns = [] if self._df is None else list(self._df.columns)
|
118
|
-
self.description = [
|
119
|
+
self.description = [
|
120
|
+
(col, None, None, None, None, None, None) for col in columns
|
121
|
+
]
|
119
122
|
return self
|
120
123
|
|
121
124
|
def fetchone(self) -> Optional[tuple[Any, ...]]:
|
@@ -153,14 +156,8 @@ class ClickzettaConnectionProxy:
|
|
153
156
|
self.session.close()
|
154
157
|
|
155
158
|
|
156
|
-
def _quote_identifier(name: str) -> str:
|
157
|
-
return f'"{name}"'
|
158
|
-
|
159
|
-
|
160
159
|
def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
|
161
|
-
return
|
162
|
-
[_quote_identifier(workspace), _quote_identifier(schema_name), _quote_identifier(table_name)]
|
163
|
-
)
|
160
|
+
return join_quoted_identifiers(workspace, schema_name, table_name)
|
164
161
|
|
165
162
|
|
166
163
|
def _value_is_true(value: Any) -> bool:
|
@@ -173,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
|
|
173
170
|
|
174
171
|
|
175
172
|
def _sanitize_identifier(value: Any, fallback: str = "") -> str:
|
176
|
-
|
173
|
+
normalized = normalize_identifier(value)
|
174
|
+
if not normalized:
|
177
175
|
return fallback
|
178
|
-
normalized = str(value).strip()
|
179
|
-
if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
|
180
|
-
normalized = normalized[1:-1]
|
181
176
|
return normalized
|
182
177
|
|
183
178
|
|
@@ -214,15 +209,19 @@ def _fetch_distinct_values(
|
|
214
209
|
column_name: str,
|
215
210
|
ndv: int,
|
216
211
|
) -> Optional[List[str]]:
|
217
|
-
workspace_part = _sanitize_identifier(workspace, workspace)
|
218
|
-
schema_part =
|
219
|
-
|
220
|
-
|
212
|
+
workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
|
213
|
+
schema_part = (
|
214
|
+
_sanitize_identifier(schema_name, schema_name) if schema_name else ""
|
215
|
+
)
|
216
|
+
table_part = _sanitize_identifier(table_name, table_name)
|
217
|
+
column_part = _sanitize_identifier(column_name, column_name)
|
221
218
|
|
222
|
-
|
223
|
-
|
219
|
+
qualified_table = join_quoted_identifiers(
|
220
|
+
workspace_part, schema_part, table_part
|
221
|
+
)
|
222
|
+
column_expr = quote_identifier(column_part)
|
224
223
|
|
225
|
-
query = f"SELECT DISTINCT {
|
224
|
+
query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
|
226
225
|
try:
|
227
226
|
df = session.sql(query).to_pandas()
|
228
227
|
if df.empty:
|
@@ -257,7 +256,6 @@ def _get_column_representation(
|
|
257
256
|
else:
|
258
257
|
column_datatype = str(column_datatype_raw)
|
259
258
|
column_datatype = _normalize_column_type(column_datatype)
|
260
|
-
normalized_type = column_datatype.split("(")[0].strip()
|
261
259
|
column_values = (
|
262
260
|
_fetch_distinct_values(
|
263
261
|
session=session,
|
@@ -351,7 +349,14 @@ def _catalog_category(session: Session, workspace: str) -> str:
|
|
351
349
|
return "UNKNOWN"
|
352
350
|
|
353
351
|
df.columns = [str(col).upper() for col in df.columns]
|
354
|
-
name_col = next(
|
352
|
+
name_col = next(
|
353
|
+
(
|
354
|
+
col
|
355
|
+
for col in ("WORKSPACE_NAME", "NAME", "CATALOG_NAME")
|
356
|
+
if col in df.columns
|
357
|
+
),
|
358
|
+
None,
|
359
|
+
)
|
355
360
|
category_col = next((col for col in ("CATEGORY",) if col in df.columns), None)
|
356
361
|
if not name_col or not category_col:
|
357
362
|
_CATALOG_CATEGORY_CACHE[workspace_upper] = "UNKNOWN"
|
@@ -408,7 +413,9 @@ ORDER BY kc.ordinal_position
|
|
408
413
|
if result is not None:
|
409
414
|
return result
|
410
415
|
except Exception:
|
411
|
-
logger.debug(
|
416
|
+
logger.debug(
|
417
|
+
"Primary key lookup via sys.information_schema failed; falling back."
|
418
|
+
)
|
412
419
|
|
413
420
|
fallback_query = f"""
|
414
421
|
SELECT kc.column_name
|
@@ -423,7 +430,13 @@ ORDER BY kc.ordinal_position
|
|
423
430
|
if result is not None:
|
424
431
|
return result
|
425
432
|
except Exception as exc:
|
426
|
-
logger.warning(
|
433
|
+
logger.warning(
|
434
|
+
"Primary key lookup failed for {}.{}.{}: {}",
|
435
|
+
workspace,
|
436
|
+
schema_name,
|
437
|
+
table_name,
|
438
|
+
exc,
|
439
|
+
)
|
427
440
|
return None
|
428
441
|
|
429
442
|
|
@@ -432,9 +445,7 @@ def _build_information_schema_query(
|
|
432
445
|
table_schema: Optional[str],
|
433
446
|
table_names: Optional[List[str]],
|
434
447
|
) -> str:
|
435
|
-
where_conditions: List[str] = [
|
436
|
-
"1=1"
|
437
|
-
]
|
448
|
+
where_conditions: List[str] = ["1=1"]
|
438
449
|
if table_schema:
|
439
450
|
where_conditions.append(f"upper(t.table_schema) = '{table_schema.upper()}'")
|
440
451
|
if table_names:
|
@@ -442,7 +453,6 @@ def _build_information_schema_query(
|
|
442
453
|
where_conditions.append(f"upper(t.table_name) IN ({formatted_names})")
|
443
454
|
|
444
455
|
where_clause = " AND ".join(where_conditions)
|
445
|
-
base = "information_schema"
|
446
456
|
return f"""
|
447
457
|
SELECT
|
448
458
|
t.table_schema AS {_TABLE_SCHEMA_COL},
|
@@ -474,27 +484,48 @@ def _fetch_columns_via_show(
|
|
474
484
|
schema = table_schema.upper() if table_schema else ""
|
475
485
|
|
476
486
|
for table_name in table_names:
|
477
|
-
qualified_parts = [
|
487
|
+
qualified_parts = [
|
488
|
+
part for part in (catalog, schema, table_name.upper()) if part
|
489
|
+
]
|
478
490
|
qualified_table = ".".join(qualified_parts)
|
479
491
|
query = f"SHOW COLUMNS IN {qualified_table}"
|
480
492
|
try:
|
481
493
|
df = session.sql(query).to_pandas()
|
482
494
|
except Exception as exc:
|
483
|
-
logger.debug(
|
495
|
+
logger.debug(
|
496
|
+
"SHOW COLUMNS fallback failed for {}: {}", qualified_table, exc
|
497
|
+
)
|
484
498
|
continue
|
485
499
|
if df.empty:
|
486
500
|
continue
|
487
501
|
df.columns = [str(col).upper() for col in df.columns]
|
488
|
-
schema_col = next(
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
502
|
+
schema_col = next(
|
503
|
+
(col for col in ("TABLE_SCHEMA", "SCHEMA_NAME") if col in df.columns), None
|
504
|
+
)
|
505
|
+
table_col = next(
|
506
|
+
(col for col in ("TABLE_NAME", "NAME") if col in df.columns), None
|
507
|
+
)
|
508
|
+
column_col = next(
|
509
|
+
(
|
510
|
+
col
|
511
|
+
for col in ("COLUMN_NAME", "NAME")
|
512
|
+
if col in df.columns and col != table_col
|
513
|
+
),
|
514
|
+
None,
|
515
|
+
)
|
516
|
+
datatype_col = next(
|
517
|
+
(col for col in ("DATA_TYPE", "TYPE") if col in df.columns), None
|
518
|
+
)
|
519
|
+
comment_col = next(
|
520
|
+
(col for col in ("COMMENT", "COLUMN_COMMENT") if col in df.columns), None
|
521
|
+
)
|
493
522
|
|
494
523
|
normalized = pd.DataFrame()
|
495
524
|
normalized[_TABLE_SCHEMA_COL] = df[schema_col] if schema_col else table_schema
|
496
525
|
normalized[_TABLE_NAME_COL] = df[table_col] if table_col else table_name
|
497
|
-
normalized[_COLUMN_NAME_COL] =
|
526
|
+
normalized[_COLUMN_NAME_COL] = (
|
527
|
+
df[column_col] if column_col else df.index.astype(str)
|
528
|
+
)
|
498
529
|
normalized[_DATATYPE_COL] = df[datatype_col] if datatype_col else ""
|
499
530
|
normalized[_COLUMN_COMMENT_ALIAS] = df[comment_col] if comment_col else ""
|
500
531
|
normalized[_TABLE_COMMENT_COL] = ""
|
@@ -552,6 +583,7 @@ def get_valid_schemas_tables_columns_df(
|
|
552
583
|
if _TABLE_SCHEMA_COL in result.columns:
|
553
584
|
result[_TABLE_SCHEMA_COL] = result[_TABLE_SCHEMA_COL].astype(str).str.upper()
|
554
585
|
if _IS_PRIMARY_KEY_COL in result.columns:
|
586
|
+
|
555
587
|
def _normalize_pk(value: Any) -> bool:
|
556
588
|
if isinstance(value, bool):
|
557
589
|
return value
|
@@ -617,10 +649,10 @@ def fetch_tables_views_in_schema(
|
|
617
649
|
workspace_upper = workspace.upper()
|
618
650
|
schema_upper = schema.upper()
|
619
651
|
|
620
|
-
target = ""
|
621
652
|
try:
|
622
653
|
if workspace_upper and schema_upper:
|
623
|
-
|
654
|
+
scope = join_quoted_identifiers(workspace_upper, schema_upper)
|
655
|
+
df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
|
624
656
|
else:
|
625
657
|
df = session.sql("SHOW TABLES").to_pandas()
|
626
658
|
except Exception as exc: # pragma: no cover
|
@@ -634,17 +666,27 @@ def fetch_tables_views_in_schema(
|
|
634
666
|
df.columns = [str(col).upper() for col in df.columns]
|
635
667
|
name_column = "TABLE_NAME" if "TABLE_NAME" in df.columns else df.columns[0]
|
636
668
|
schema_column = next(
|
637
|
-
(
|
669
|
+
(
|
670
|
+
col
|
671
|
+
for col in ("SCHEMA_NAME", "TABLE_SCHEMA", "NAMESPACE")
|
672
|
+
if col in df.columns
|
673
|
+
),
|
638
674
|
None,
|
639
675
|
)
|
640
676
|
catalog_column = next(
|
641
|
-
(
|
677
|
+
(
|
678
|
+
col
|
679
|
+
for col in ("CATALOG_NAME", "WORKSPACE_NAME", "TABLE_CATALOG")
|
680
|
+
if col in df.columns
|
681
|
+
),
|
642
682
|
None,
|
643
683
|
)
|
644
684
|
|
645
685
|
results: List[str] = []
|
646
686
|
for _, row in df.iterrows():
|
647
|
-
if _value_is_true(row.get("IS_VIEW")) and not _value_is_true(
|
687
|
+
if _value_is_true(row.get("IS_VIEW")) and not _value_is_true(
|
688
|
+
row.get("IS_MATERIALIZED_VIEW")
|
689
|
+
):
|
648
690
|
continue
|
649
691
|
if _value_is_true(row.get("IS_EXTERNAL")):
|
650
692
|
continue
|
@@ -686,11 +728,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
|
|
686
728
|
|
687
729
|
queries: List[str] = []
|
688
730
|
if schema:
|
689
|
-
|
690
|
-
|
731
|
+
scope = join_quoted_identifiers(workspace, schema)
|
732
|
+
if scope:
|
733
|
+
queries.append(f"SHOW VOLUMES IN {scope}")
|
734
|
+
queries.append(f"SHOW STAGES IN SCHEMA {scope}")
|
691
735
|
else:
|
692
|
-
|
693
|
-
|
736
|
+
workspace_identifier = quote_identifier(workspace)
|
737
|
+
if workspace_identifier:
|
738
|
+
queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
|
739
|
+
queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
|
694
740
|
|
695
741
|
stage_names: List[str] = ["volume:user://~/semantic_models/"]
|
696
742
|
seen: set[str] = set(stage_names)
|
@@ -756,7 +802,11 @@ def fetch_yaml_names_in_stage(
|
|
756
802
|
if stage.lower().startswith("volume:user://"):
|
757
803
|
volume_body = stage[len("volume:") :]
|
758
804
|
# Normalize relative directory
|
759
|
-
relative =
|
805
|
+
relative = (
|
806
|
+
volume_body[len("user://") :]
|
807
|
+
if volume_body.startswith("user://")
|
808
|
+
else volume_body
|
809
|
+
)
|
760
810
|
relative = relative.lstrip("~/")
|
761
811
|
relative = relative.strip("/")
|
762
812
|
|
@@ -842,7 +892,9 @@ def create_table_in_schema(
|
|
842
892
|
table_fqn: str,
|
843
893
|
columns_schema: Dict[str, str],
|
844
894
|
) -> bool:
|
845
|
-
fields = ", ".join(
|
895
|
+
fields = ", ".join(
|
896
|
+
f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
|
897
|
+
)
|
846
898
|
query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
|
847
899
|
try:
|
848
900
|
session.sql(query).collect()
|
@@ -20,7 +20,9 @@ _CONFIG_PATHS = [
|
|
20
20
|
_ACTIVE_CONFIG_PATH: Optional[str] = None
|
21
21
|
|
22
22
|
|
23
|
-
def _load_config_from_file() ->
|
23
|
+
def _load_config_from_file() -> (
|
24
|
+
Tuple[Optional[Dict[str, str]], Dict[str, Dict[str, str]]]
|
25
|
+
):
|
24
26
|
global _ACTIVE_CONFIG_PATH
|
25
27
|
_ACTIVE_CONFIG_PATH = None
|
26
28
|
for path in _CONFIG_PATHS:
|
@@ -91,7 +93,10 @@ def _deep_lookup(mapping: Any, key: str) -> Optional[Any]:
|
|
91
93
|
if isinstance(current, dict):
|
92
94
|
for candidate_key, candidate_value in current.items():
|
93
95
|
candidate_key_str = str(candidate_key).lower()
|
94
|
-
if candidate_key_str == normalized_key and candidate_value not in (
|
96
|
+
if candidate_key_str == normalized_key and candidate_value not in (
|
97
|
+
None,
|
98
|
+
"",
|
99
|
+
):
|
95
100
|
return candidate_value
|
96
101
|
if isinstance(candidate_value, (dict, list)):
|
97
102
|
queue.append(candidate_value)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from contextlib import contextmanager
|
4
|
-
from typing import Dict, Iterable
|
4
|
+
from typing import Any, Dict, Iterable
|
5
5
|
|
6
6
|
from clickzetta.zettapark.session import Session
|
7
7
|
|
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
|
|
21
21
|
}
|
22
22
|
|
23
23
|
|
24
|
+
def normalize_identifier(value: Any) -> str:
|
25
|
+
"""
|
26
|
+
Strips outer quotes/backticks and surrounding whitespace from an identifier.
|
27
|
+
Returns an empty string when the identifier is missing.
|
28
|
+
"""
|
29
|
+
|
30
|
+
if value is None:
|
31
|
+
return ""
|
32
|
+
text = str(value).strip()
|
33
|
+
if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
|
34
|
+
return text[1:-1]
|
35
|
+
return text
|
36
|
+
|
37
|
+
|
38
|
+
def quote_identifier(value: Any) -> str:
|
39
|
+
"""
|
40
|
+
Wraps an identifier in backticks, escaping embedded backticks as needed.
|
41
|
+
Returns an empty string if the identifier is missing.
|
42
|
+
"""
|
43
|
+
|
44
|
+
normalized = normalize_identifier(value)
|
45
|
+
if not normalized:
|
46
|
+
return ""
|
47
|
+
escaped = normalized.replace("`", "``")
|
48
|
+
return f"`{escaped}`"
|
49
|
+
|
50
|
+
|
51
|
+
def join_quoted_identifiers(*parts: Any) -> str:
|
52
|
+
"""
|
53
|
+
Joins identifier parts with '.' and ensures each segment is backtick-quoted.
|
54
|
+
Empty segments are skipped.
|
55
|
+
"""
|
56
|
+
|
57
|
+
quoted_parts = [
|
58
|
+
quote_identifier(part)
|
59
|
+
for part in parts
|
60
|
+
if normalize_identifier(part)
|
61
|
+
]
|
62
|
+
return ".".join(part for part in quoted_parts if part)
|
63
|
+
|
64
|
+
|
24
65
|
def create_fqn_table(fqn_str: str) -> FQNParts:
|
25
66
|
"""
|
26
67
|
Splits a fully qualified table name into its ClickZetta components.
|
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> N
|
|
72
113
|
("schema", schema),
|
73
114
|
("vcluster", vcluster),
|
74
115
|
):
|
75
|
-
|
116
|
+
identifier = quote_identifier(value)
|
117
|
+
session.sql(f"USE {component.upper()} {identifier}")
|
76
118
|
|
77
119
|
|
78
120
|
def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
|
@@ -8,15 +8,37 @@ import sqlglot.expressions
|
|
8
8
|
from loguru import logger
|
9
9
|
from sqlglot import Dialect
|
10
10
|
|
11
|
-
from semantic_model_generator.protos import semantic_model_pb2
|
12
11
|
from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
|
13
12
|
OBJECT_DATATYPES,
|
14
13
|
)
|
14
|
+
from semantic_model_generator.clickzetta_utils.utils import (
|
15
|
+
join_quoted_identifiers,
|
16
|
+
normalize_identifier,
|
17
|
+
)
|
18
|
+
from semantic_model_generator.protos import semantic_model_pb2
|
15
19
|
|
16
20
|
_SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
|
17
21
|
ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
|
18
22
|
|
19
23
|
_LOGICAL_TABLE_PREFIX = "__"
|
24
|
+
_SQLGLOT_QUOTE_CHAR = '"'
|
25
|
+
|
26
|
+
|
27
|
+
def _prepare_sql_for_parsing(sql: str) -> str:
|
28
|
+
"""
|
29
|
+
Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
|
30
|
+
"""
|
31
|
+
|
32
|
+
return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
|
33
|
+
|
34
|
+
|
35
|
+
def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
|
36
|
+
"""
|
37
|
+
Renders a SQLGlot expression using ClickZetta dialect and rewrites identifiers with backticks.
|
38
|
+
"""
|
39
|
+
|
40
|
+
rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
|
41
|
+
return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
|
20
42
|
|
21
43
|
|
22
44
|
def is_logical_table(table_name: str) -> bool:
|
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
|
|
33
55
|
|
34
56
|
def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
|
35
57
|
"""Returns fully qualified table name such as my_db.my_schema.my_table"""
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
return
|
58
|
+
parts = [
|
59
|
+
normalize_identifier(component)
|
60
|
+
for component in (table.database, table.schema, table.table)
|
61
|
+
if component
|
62
|
+
]
|
63
|
+
return join_quoted_identifiers(*parts) # type: ignore[no-any-return]
|
42
64
|
|
43
65
|
|
44
66
|
def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
|
@@ -156,8 +178,8 @@ def _generate_cte_for(
|
|
156
178
|
cte = f"WITH {logical_table_name(table)} AS (\n"
|
157
179
|
cte += "SELECT \n"
|
158
180
|
cte += ",\n".join(expr_columns) + "\n"
|
159
|
-
cte += f"FROM {fully_qualified_table_name(table.base_table)}"
|
160
|
-
cte += ")"
|
181
|
+
cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
|
182
|
+
cte += ")\n"
|
161
183
|
return cte
|
162
184
|
|
163
185
|
|
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
|
|
261
283
|
str: The SQL statement in ClickZetta syntax.
|
262
284
|
"""
|
263
285
|
try:
|
264
|
-
expression = sqlglot.parse_one(
|
286
|
+
expression = sqlglot.parse_one(
|
287
|
+
_prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
|
288
|
+
)
|
265
289
|
except Exception as e:
|
266
290
|
raise ValueError(
|
267
291
|
f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
|
268
292
|
)
|
269
293
|
|
270
|
-
return expression
|
294
|
+
return _render_clickzetta_sql(expression)
|
271
295
|
|
272
296
|
|
273
297
|
def generate_select(
|
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
|
|
332
356
|
for cte in ctes:
|
333
357
|
new_withs.append(
|
334
358
|
sqlglot.parse_one(
|
335
|
-
cte,
|
359
|
+
_prepare_sql_for_parsing(cte),
|
360
|
+
read=ClickzettaDialect,
|
361
|
+
into=sqlglot.expressions.With,
|
336
362
|
)
|
337
363
|
)
|
338
364
|
|
339
365
|
# Step 3: Prefix the CTEs to the original query.
|
340
|
-
ast = sqlglot.parse_one(
|
366
|
+
ast = sqlglot.parse_one(
|
367
|
+
_prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
|
368
|
+
)
|
341
369
|
with_ = ast.args.get("with")
|
342
370
|
# If the query doesn't have a WITH clause, then generate one.
|
343
371
|
if with_ is None:
|
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
|
|
349
377
|
else:
|
350
378
|
new_ctes = [w.expressions[0] for w in new_withs]
|
351
379
|
with_.set("expressions", new_ctes + with_.expressions)
|
352
|
-
return
|
380
|
+
return _render_clickzetta_sql(
|
381
|
+
ast, pretty=True
|
382
|
+
) # type: ignore [no-any-return]
|
353
383
|
|
354
384
|
|
355
385
|
def context_to_column_format(
|