clickzetta-semantic-model-generator 1.0.2__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/PKG-INFO +5 -5
  2. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/pyproject.toml +5 -5
  3. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +100 -48
  4. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/env_vars.py +7 -2
  5. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/clickzetta_utils/utils.py +44 -2
  6. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/cte_utils.py +44 -14
  7. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/generate_model.py +711 -239
  8. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/dashscope_client.py +4 -2
  9. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/enrichment.py +144 -57
  10. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/progress_tracker.py +16 -15
  11. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/relationships/__init__.py +2 -0
  12. clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/relationships/discovery.py +372 -0
  13. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/clickzetta_connector_test.py +3 -7
  14. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/cte_utils_test.py +15 -14
  15. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/generate_model_classification_test.py +12 -2
  16. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/llm_enrichment_test.py +152 -46
  17. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/relationship_discovery_test.py +70 -3
  18. clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/tests/relationships_filters_test.py +361 -0
  19. clickzetta_semantic_model_generator-1.0.4/semantic_model_generator/validate/keywords.py +457 -0
  20. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate/schema.py +4 -2
  21. clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/relationships/discovery.py +0 -207
  22. clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/tests/relationships_filters_test.py +0 -225
  23. clickzetta_semantic_model_generator-1.0.2/semantic_model_generator/validate/keywords.py +0 -57
  24. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/LICENSE +0 -0
  25. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/README.md +0 -0
  26. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/__init__.py +0 -0
  27. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/__init__.py +0 -0
  28. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
  29. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/data_types.py +0 -0
  30. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/data_processing/proto_utils.py +0 -0
  31. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/llm/__init__.py +0 -0
  32. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/output_models/.keep +0 -0
  33. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model.proto +0 -0
  34. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
  35. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
  36. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
  37. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/utils_test.py +1 -1
  38. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/validate_model_test.py +0 -0
  39. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
  40. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate/context_length.py +0 -0
  41. {clickzetta_semantic_model_generator-1.0.2 → clickzetta_semantic_model_generator-1.0.4}/semantic_model_generator/validate_model.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: clickzetta-semantic-model-generator
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: Curate a Semantic Model for ClickZetta Lakehouse
5
5
  License: Apache Software License; BSD License
6
6
  Author: qililiang
@@ -13,12 +13,12 @@ Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Provides-Extra: looker
15
15
  Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
16
- Requires-Dist: clickzetta-connector-python (==0.8.92)
17
- Requires-Dist: clickzetta-zettapark-python (==0.1.3)
16
+ Requires-Dist: clickzetta-connector-python (>=0.8.92)
17
+ Requires-Dist: clickzetta-zettapark-python (>=0.1.3)
18
18
  Requires-Dist: dashscope (>=1.22.2,<2.0.0)
19
19
  Requires-Dist: loguru (>=0.7.2,<0.8.0)
20
20
  Requires-Dist: looker-sdk (>=24.14.0,<25.0.0) ; extra == "looker"
21
- Requires-Dist: numpy (>=1.26.4,<2.0.0)
21
+ Requires-Dist: numpy (>=1.26.4,<3.0.0)
22
22
  Requires-Dist: pandas (>=2.0.1,<3.0.0)
23
23
  Requires-Dist: protobuf (==5.26.1)
24
24
  Requires-Dist: pyarrow (==14.0.2)
@@ -31,7 +31,7 @@ Requires-Dist: streamlit (==1.36.0)
31
31
  Requires-Dist: streamlit-extras (==0.4.0)
32
32
  Requires-Dist: strictyaml (>=1.7.3,<2.0.0)
33
33
  Requires-Dist: tqdm (>=4.66.5,<5.0.0)
34
- Requires-Dist: urllib3 (>=1.26.19,<2.0.0)
34
+ Requires-Dist: urllib3 (>=1.26.19,<3.0.0)
35
35
  Description-Content-Type: text/markdown
36
36
 
37
37
  # semantic-model-generator
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "clickzetta-semantic-model-generator"
3
- version = "1.0.2"
3
+ version = "1.0.4"
4
4
  description = "Curate a Semantic Model for ClickZetta Lakehouse"
5
5
  authors = ["qililiang <qililiang@clickzetta.com>"]
6
6
  license = "Apache Software License; BSD License"
@@ -11,7 +11,7 @@ packages = [{include = "semantic_model_generator"}]
11
11
  python = ">=3.9,<3.9.7 || >3.9.7,<3.12"
12
12
  pandas = "^2.0.1"
13
13
  loguru = "^0.7.2"
14
- clickzetta-connector-python = "0.8.92"
14
+ clickzetta-connector-python = ">=0.8.92"
15
15
  protobuf = "5.26.1"
16
16
  pydantic = ">=2.8.2,<3.0.0"
17
17
  PyYAML = "^6.0.1"
@@ -22,11 +22,11 @@ sqlglot = "25.10.0"
22
22
  strictyaml = "^1.7.3"
23
23
  streamlit = "1.36.0"
24
24
  streamlit-extras = "0.4.0"
25
- numpy = "^1.26.4"
25
+ numpy = ">=1.26.4,<3.0.0"
26
26
  python-dotenv = "^1.0.1"
27
- urllib3 = "^1.26.19"
27
+ urllib3 = ">=1.26.19,<3.0.0"
28
28
  requests = "^2.32.3"
29
- clickzetta-zettapark-python = "0.1.3"
29
+ clickzetta-zettapark-python = ">=0.1.3"
30
30
  dashscope = "^1.22.2"
31
31
 
32
32
  # Optional dependencies for functionality such as partner semantic model support.
@@ -4,7 +4,6 @@ import concurrent.futures
4
4
  import re
5
5
  from collections import defaultdict
6
6
  from contextlib import contextmanager
7
- from types import SimpleNamespace
8
7
  from typing import Any, Dict, Generator, List, Optional, TypeVar, Union
9
8
 
10
9
  import pandas as pd
@@ -13,8 +12,10 @@ from loguru import logger
13
12
 
14
13
  from semantic_model_generator.clickzetta_utils import env_vars
15
14
  from semantic_model_generator.clickzetta_utils.utils import (
16
- clickzetta_connection,
17
15
  create_session,
16
+ join_quoted_identifiers,
17
+ normalize_identifier,
18
+ quote_identifier,
18
19
  )
19
20
  from semantic_model_generator.data_processing.data_types import Column, Table
20
21
 
@@ -115,7 +116,9 @@ class ClickzettaCursor:
115
116
  def execute(self, query: str) -> "ClickzettaCursor":
116
117
  self._df = _execute_query_to_pandas(self._session, query)
117
118
  columns = [] if self._df is None else list(self._df.columns)
118
- self.description = [(col, None, None, None, None, None, None) for col in columns]
119
+ self.description = [
120
+ (col, None, None, None, None, None, None) for col in columns
121
+ ]
119
122
  return self
120
123
 
121
124
  def fetchone(self) -> Optional[tuple[Any, ...]]:
@@ -153,14 +156,8 @@ class ClickzettaConnectionProxy:
153
156
  self.session.close()
154
157
 
155
158
 
156
- def _quote_identifier(name: str) -> str:
157
- return f'"{name}"'
158
-
159
-
160
159
  def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
161
- return ".".join(
162
- [_quote_identifier(workspace), _quote_identifier(schema_name), _quote_identifier(table_name)]
163
- )
160
+ return join_quoted_identifiers(workspace, schema_name, table_name)
164
161
 
165
162
 
166
163
  def _value_is_true(value: Any) -> bool:
@@ -173,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
173
170
 
174
171
 
175
172
  def _sanitize_identifier(value: Any, fallback: str = "") -> str:
176
- if value is None or value == "":
173
+ normalized = normalize_identifier(value)
174
+ if not normalized:
177
175
  return fallback
178
- normalized = str(value).strip()
179
- if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
180
- normalized = normalized[1:-1]
181
176
  return normalized
182
177
 
183
178
 
@@ -214,15 +209,19 @@ def _fetch_distinct_values(
214
209
  column_name: str,
215
210
  ndv: int,
216
211
  ) -> Optional[List[str]]:
217
- workspace_part = _sanitize_identifier(workspace, workspace).upper() if workspace else ""
218
- schema_part = _sanitize_identifier(schema_name, schema_name).upper() if schema_name else ""
219
- table_part = _sanitize_identifier(table_name, table_name).upper()
220
- column_part = _sanitize_identifier(column_name, column_name).upper()
212
+ workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
213
+ schema_part = (
214
+ _sanitize_identifier(schema_name, schema_name) if schema_name else ""
215
+ )
216
+ table_part = _sanitize_identifier(table_name, table_name)
217
+ column_part = _sanitize_identifier(column_name, column_name)
221
218
 
222
- qualified_parts = [part for part in (workspace_part, schema_part, table_part) if part]
223
- qualified_table = ".".join(qualified_parts)
219
+ qualified_table = join_quoted_identifiers(
220
+ workspace_part, schema_part, table_part
221
+ )
222
+ column_expr = quote_identifier(column_part)
224
223
 
225
- query = f"SELECT DISTINCT {column_part} FROM {qualified_table} LIMIT {ndv}"
224
+ query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
226
225
  try:
227
226
  df = session.sql(query).to_pandas()
228
227
  if df.empty:
@@ -257,7 +256,6 @@ def _get_column_representation(
257
256
  else:
258
257
  column_datatype = str(column_datatype_raw)
259
258
  column_datatype = _normalize_column_type(column_datatype)
260
- normalized_type = column_datatype.split("(")[0].strip()
261
259
  column_values = (
262
260
  _fetch_distinct_values(
263
261
  session=session,
@@ -351,7 +349,14 @@ def _catalog_category(session: Session, workspace: str) -> str:
351
349
  return "UNKNOWN"
352
350
 
353
351
  df.columns = [str(col).upper() for col in df.columns]
354
- name_col = next((col for col in ("WORKSPACE_NAME", "NAME", "CATALOG_NAME") if col in df.columns), None)
352
+ name_col = next(
353
+ (
354
+ col
355
+ for col in ("WORKSPACE_NAME", "NAME", "CATALOG_NAME")
356
+ if col in df.columns
357
+ ),
358
+ None,
359
+ )
355
360
  category_col = next((col for col in ("CATEGORY",) if col in df.columns), None)
356
361
  if not name_col or not category_col:
357
362
  _CATALOG_CATEGORY_CACHE[workspace_upper] = "UNKNOWN"
@@ -408,7 +413,9 @@ ORDER BY kc.ordinal_position
408
413
  if result is not None:
409
414
  return result
410
415
  except Exception:
411
- logger.debug("Primary key lookup via sys.information_schema failed; falling back.")
416
+ logger.debug(
417
+ "Primary key lookup via sys.information_schema failed; falling back."
418
+ )
412
419
 
413
420
  fallback_query = f"""
414
421
  SELECT kc.column_name
@@ -423,7 +430,13 @@ ORDER BY kc.ordinal_position
423
430
  if result is not None:
424
431
  return result
425
432
  except Exception as exc:
426
- logger.warning("Primary key lookup failed for {}.{}.{}: {}", workspace, schema_name, table_name, exc)
433
+ logger.warning(
434
+ "Primary key lookup failed for {}.{}.{}: {}",
435
+ workspace,
436
+ schema_name,
437
+ table_name,
438
+ exc,
439
+ )
427
440
  return None
428
441
 
429
442
 
@@ -432,9 +445,7 @@ def _build_information_schema_query(
432
445
  table_schema: Optional[str],
433
446
  table_names: Optional[List[str]],
434
447
  ) -> str:
435
- where_conditions: List[str] = [
436
- "1=1"
437
- ]
448
+ where_conditions: List[str] = ["1=1"]
438
449
  if table_schema:
439
450
  where_conditions.append(f"upper(t.table_schema) = '{table_schema.upper()}'")
440
451
  if table_names:
@@ -442,7 +453,6 @@ def _build_information_schema_query(
442
453
  where_conditions.append(f"upper(t.table_name) IN ({formatted_names})")
443
454
 
444
455
  where_clause = " AND ".join(where_conditions)
445
- base = "information_schema"
446
456
  return f"""
447
457
  SELECT
448
458
  t.table_schema AS {_TABLE_SCHEMA_COL},
@@ -474,27 +484,48 @@ def _fetch_columns_via_show(
474
484
  schema = table_schema.upper() if table_schema else ""
475
485
 
476
486
  for table_name in table_names:
477
- qualified_parts = [part for part in (catalog, schema, table_name.upper()) if part]
487
+ qualified_parts = [
488
+ part for part in (catalog, schema, table_name.upper()) if part
489
+ ]
478
490
  qualified_table = ".".join(qualified_parts)
479
491
  query = f"SHOW COLUMNS IN {qualified_table}"
480
492
  try:
481
493
  df = session.sql(query).to_pandas()
482
494
  except Exception as exc:
483
- logger.debug("SHOW COLUMNS fallback failed for {}: {}", qualified_table, exc)
495
+ logger.debug(
496
+ "SHOW COLUMNS fallback failed for {}: {}", qualified_table, exc
497
+ )
484
498
  continue
485
499
  if df.empty:
486
500
  continue
487
501
  df.columns = [str(col).upper() for col in df.columns]
488
- schema_col = next((col for col in ("TABLE_SCHEMA", "SCHEMA_NAME") if col in df.columns), None)
489
- table_col = next((col for col in ("TABLE_NAME", "NAME") if col in df.columns), None)
490
- column_col = next((col for col in ("COLUMN_NAME", "NAME") if col in df.columns and col != table_col), None)
491
- datatype_col = next((col for col in ("DATA_TYPE", "TYPE") if col in df.columns), None)
492
- comment_col = next((col for col in ("COMMENT", "COLUMN_COMMENT") if col in df.columns), None)
502
+ schema_col = next(
503
+ (col for col in ("TABLE_SCHEMA", "SCHEMA_NAME") if col in df.columns), None
504
+ )
505
+ table_col = next(
506
+ (col for col in ("TABLE_NAME", "NAME") if col in df.columns), None
507
+ )
508
+ column_col = next(
509
+ (
510
+ col
511
+ for col in ("COLUMN_NAME", "NAME")
512
+ if col in df.columns and col != table_col
513
+ ),
514
+ None,
515
+ )
516
+ datatype_col = next(
517
+ (col for col in ("DATA_TYPE", "TYPE") if col in df.columns), None
518
+ )
519
+ comment_col = next(
520
+ (col for col in ("COMMENT", "COLUMN_COMMENT") if col in df.columns), None
521
+ )
493
522
 
494
523
  normalized = pd.DataFrame()
495
524
  normalized[_TABLE_SCHEMA_COL] = df[schema_col] if schema_col else table_schema
496
525
  normalized[_TABLE_NAME_COL] = df[table_col] if table_col else table_name
497
- normalized[_COLUMN_NAME_COL] = df[column_col] if column_col else df.index.astype(str)
526
+ normalized[_COLUMN_NAME_COL] = (
527
+ df[column_col] if column_col else df.index.astype(str)
528
+ )
498
529
  normalized[_DATATYPE_COL] = df[datatype_col] if datatype_col else ""
499
530
  normalized[_COLUMN_COMMENT_ALIAS] = df[comment_col] if comment_col else ""
500
531
  normalized[_TABLE_COMMENT_COL] = ""
@@ -552,6 +583,7 @@ def get_valid_schemas_tables_columns_df(
552
583
  if _TABLE_SCHEMA_COL in result.columns:
553
584
  result[_TABLE_SCHEMA_COL] = result[_TABLE_SCHEMA_COL].astype(str).str.upper()
554
585
  if _IS_PRIMARY_KEY_COL in result.columns:
586
+
555
587
  def _normalize_pk(value: Any) -> bool:
556
588
  if isinstance(value, bool):
557
589
  return value
@@ -617,10 +649,10 @@ def fetch_tables_views_in_schema(
617
649
  workspace_upper = workspace.upper()
618
650
  schema_upper = schema.upper()
619
651
 
620
- target = ""
621
652
  try:
622
653
  if workspace_upper and schema_upper:
623
- df = session.sql(f"SHOW TABLES IN {workspace_upper}.{schema_upper}").to_pandas()
654
+ scope = join_quoted_identifiers(workspace_upper, schema_upper)
655
+ df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
624
656
  else:
625
657
  df = session.sql("SHOW TABLES").to_pandas()
626
658
  except Exception as exc: # pragma: no cover
@@ -634,17 +666,27 @@ def fetch_tables_views_in_schema(
634
666
  df.columns = [str(col).upper() for col in df.columns]
635
667
  name_column = "TABLE_NAME" if "TABLE_NAME" in df.columns else df.columns[0]
636
668
  schema_column = next(
637
- (col for col in ("SCHEMA_NAME", "TABLE_SCHEMA", "NAMESPACE") if col in df.columns),
669
+ (
670
+ col
671
+ for col in ("SCHEMA_NAME", "TABLE_SCHEMA", "NAMESPACE")
672
+ if col in df.columns
673
+ ),
638
674
  None,
639
675
  )
640
676
  catalog_column = next(
641
- (col for col in ("CATALOG_NAME", "WORKSPACE_NAME", "TABLE_CATALOG") if col in df.columns),
677
+ (
678
+ col
679
+ for col in ("CATALOG_NAME", "WORKSPACE_NAME", "TABLE_CATALOG")
680
+ if col in df.columns
681
+ ),
642
682
  None,
643
683
  )
644
684
 
645
685
  results: List[str] = []
646
686
  for _, row in df.iterrows():
647
- if _value_is_true(row.get("IS_VIEW")) and not _value_is_true(row.get("IS_MATERIALIZED_VIEW")):
687
+ if _value_is_true(row.get("IS_VIEW")) and not _value_is_true(
688
+ row.get("IS_MATERIALIZED_VIEW")
689
+ ):
648
690
  continue
649
691
  if _value_is_true(row.get("IS_EXTERNAL")):
650
692
  continue
@@ -686,11 +728,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
686
728
 
687
729
  queries: List[str] = []
688
730
  if schema:
689
- queries.append(f"SHOW VOLUMES IN {workspace}.{schema}")
690
- queries.append(f"SHOW STAGES IN SCHEMA {workspace}.{schema}")
731
+ scope = join_quoted_identifiers(workspace, schema)
732
+ if scope:
733
+ queries.append(f"SHOW VOLUMES IN {scope}")
734
+ queries.append(f"SHOW STAGES IN SCHEMA {scope}")
691
735
  else:
692
- queries.append(f"SHOW VOLUMES IN {workspace}")
693
- queries.append(f"SHOW STAGES IN DATABASE {workspace}")
736
+ workspace_identifier = quote_identifier(workspace)
737
+ if workspace_identifier:
738
+ queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
739
+ queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
694
740
 
695
741
  stage_names: List[str] = ["volume:user://~/semantic_models/"]
696
742
  seen: set[str] = set(stage_names)
@@ -756,7 +802,11 @@ def fetch_yaml_names_in_stage(
756
802
  if stage.lower().startswith("volume:user://"):
757
803
  volume_body = stage[len("volume:") :]
758
804
  # Normalize relative directory
759
- relative = volume_body[len("user://") :] if volume_body.startswith("user://") else volume_body
805
+ relative = (
806
+ volume_body[len("user://") :]
807
+ if volume_body.startswith("user://")
808
+ else volume_body
809
+ )
760
810
  relative = relative.lstrip("~/")
761
811
  relative = relative.strip("/")
762
812
 
@@ -842,7 +892,9 @@ def create_table_in_schema(
842
892
  table_fqn: str,
843
893
  columns_schema: Dict[str, str],
844
894
  ) -> bool:
845
- fields = ", ".join(f"{_quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items())
895
+ fields = ", ".join(
896
+ f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
897
+ )
846
898
  query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
847
899
  try:
848
900
  session.sql(query).collect()
@@ -20,7 +20,9 @@ _CONFIG_PATHS = [
20
20
  _ACTIVE_CONFIG_PATH: Optional[str] = None
21
21
 
22
22
 
23
- def _load_config_from_file() -> Tuple[Optional[Dict[str, str]], Dict[str, Dict[str, str]]]:
23
+ def _load_config_from_file() -> (
24
+ Tuple[Optional[Dict[str, str]], Dict[str, Dict[str, str]]]
25
+ ):
24
26
  global _ACTIVE_CONFIG_PATH
25
27
  _ACTIVE_CONFIG_PATH = None
26
28
  for path in _CONFIG_PATHS:
@@ -91,7 +93,10 @@ def _deep_lookup(mapping: Any, key: str) -> Optional[Any]:
91
93
  if isinstance(current, dict):
92
94
  for candidate_key, candidate_value in current.items():
93
95
  candidate_key_str = str(candidate_key).lower()
94
- if candidate_key_str == normalized_key and candidate_value not in (None, ""):
96
+ if candidate_key_str == normalized_key and candidate_value not in (
97
+ None,
98
+ "",
99
+ ):
95
100
  return candidate_value
96
101
  if isinstance(candidate_value, (dict, list)):
97
102
  queue.append(candidate_value)
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from contextlib import contextmanager
4
- from typing import Dict, Iterable
4
+ from typing import Any, Dict, Iterable
5
5
 
6
6
  from clickzetta.zettapark.session import Session
7
7
 
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
21
21
  }
22
22
 
23
23
 
24
+ def normalize_identifier(value: Any) -> str:
25
+ """
26
+ Strips outer quotes/backticks and surrounding whitespace from an identifier.
27
+ Returns an empty string when the identifier is missing.
28
+ """
29
+
30
+ if value is None:
31
+ return ""
32
+ text = str(value).strip()
33
+ if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
34
+ return text[1:-1]
35
+ return text
36
+
37
+
38
+ def quote_identifier(value: Any) -> str:
39
+ """
40
+ Wraps an identifier in backticks, escaping embedded backticks as needed.
41
+ Returns an empty string if the identifier is missing.
42
+ """
43
+
44
+ normalized = normalize_identifier(value)
45
+ if not normalized:
46
+ return ""
47
+ escaped = normalized.replace("`", "``")
48
+ return f"`{escaped}`"
49
+
50
+
51
+ def join_quoted_identifiers(*parts: Any) -> str:
52
+ """
53
+ Joins identifier parts with '.' and ensures each segment is backtick-quoted.
54
+ Empty segments are skipped.
55
+ """
56
+
57
+ quoted_parts = [
58
+ quote_identifier(part)
59
+ for part in parts
60
+ if normalize_identifier(part)
61
+ ]
62
+ return ".".join(part for part in quoted_parts if part)
63
+
64
+
24
65
  def create_fqn_table(fqn_str: str) -> FQNParts:
25
66
  """
26
67
  Splits a fully qualified table name into its ClickZetta components.
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> N
72
113
  ("schema", schema),
73
114
  ("vcluster", vcluster),
74
115
  ):
75
- session.sql(f"USE {component.upper()} {value.upper()}")
116
+ identifier = quote_identifier(value)
117
+ session.sql(f"USE {component.upper()} {identifier}")
76
118
 
77
119
 
78
120
  def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
@@ -8,15 +8,37 @@ import sqlglot.expressions
8
8
  from loguru import logger
9
9
  from sqlglot import Dialect
10
10
 
11
- from semantic_model_generator.protos import semantic_model_pb2
12
11
  from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
13
12
  OBJECT_DATATYPES,
14
13
  )
14
+ from semantic_model_generator.clickzetta_utils.utils import (
15
+ join_quoted_identifiers,
16
+ normalize_identifier,
17
+ )
18
+ from semantic_model_generator.protos import semantic_model_pb2
15
19
 
16
20
  _SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
17
21
  ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
18
22
 
19
23
  _LOGICAL_TABLE_PREFIX = "__"
24
+ _SQLGLOT_QUOTE_CHAR = '"'
25
+
26
+
27
+ def _prepare_sql_for_parsing(sql: str) -> str:
28
+ """
29
+ Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
30
+ """
31
+
32
+ return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
33
+
34
+
35
+ def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
36
+ """
37
+ Renders a SQLGlot expression using ClickZetta dialect and rewrites identifiers with backticks.
38
+ """
39
+
40
+ rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
41
+ return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
20
42
 
21
43
 
22
44
  def is_logical_table(table_name: str) -> bool:
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
33
55
 
34
56
  def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
35
57
  """Returns fully qualified table name such as my_db.my_schema.my_table"""
36
- fqn = table.table
37
- if len(table.schema) > 0:
38
- fqn = f"{table.schema}.{fqn}"
39
- if len(table.database) > 0:
40
- fqn = f"{table.database}.{fqn}"
41
- return fqn # type: ignore[no-any-return]
58
+ parts = [
59
+ normalize_identifier(component)
60
+ for component in (table.database, table.schema, table.table)
61
+ if component
62
+ ]
63
+ return join_quoted_identifiers(*parts) # type: ignore[no-any-return]
42
64
 
43
65
 
44
66
  def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
@@ -156,8 +178,8 @@ def _generate_cte_for(
156
178
  cte = f"WITH {logical_table_name(table)} AS (\n"
157
179
  cte += "SELECT \n"
158
180
  cte += ",\n".join(expr_columns) + "\n"
159
- cte += f"FROM {fully_qualified_table_name(table.base_table)}"
160
- cte += ")"
181
+ cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
182
+ cte += ")\n"
161
183
  return cte
162
184
 
163
185
 
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
261
283
  str: The SQL statement in ClickZetta syntax.
262
284
  """
263
285
  try:
264
- expression = sqlglot.parse_one(sql, dialect=ClickzettaDialect)
286
+ expression = sqlglot.parse_one(
287
+ _prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
288
+ )
265
289
  except Exception as e:
266
290
  raise ValueError(
267
291
  f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
268
292
  )
269
293
 
270
- return expression.sql(dialect=ClickzettaDialect)
294
+ return _render_clickzetta_sql(expression)
271
295
 
272
296
 
273
297
  def generate_select(
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
332
356
  for cte in ctes:
333
357
  new_withs.append(
334
358
  sqlglot.parse_one(
335
- cte, read=ClickzettaDialect, into=sqlglot.expressions.With
359
+ _prepare_sql_for_parsing(cte),
360
+ read=ClickzettaDialect,
361
+ into=sqlglot.expressions.With,
336
362
  )
337
363
  )
338
364
 
339
365
  # Step 3: Prefix the CTEs to the original query.
340
- ast = sqlglot.parse_one(sql_query, read=ClickzettaDialect)
366
+ ast = sqlglot.parse_one(
367
+ _prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
368
+ )
341
369
  with_ = ast.args.get("with")
342
370
  # If the query doesn't have a WITH clause, then generate one.
343
371
  if with_ is None:
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
349
377
  else:
350
378
  new_ctes = [w.expressions[0] for w in new_withs]
351
379
  with_.set("expressions", new_ctes + with_.expressions)
352
- return ast.sql(dialect=ClickzettaDialect, pretty=True) # type: ignore [no-any-return]
380
+ return _render_clickzetta_sql(
381
+ ast, pretty=True
382
+ ) # type: ignore [no-any-return]
353
383
 
354
384
 
355
385
  def context_to_column_format(