clickzetta-semantic-model-generator 1.0.13__py3-none-any.whl → 1.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: clickzetta-semantic-model-generator
3
- Version: 1.0.13
3
+ Version: 1.0.15
4
4
  Summary: Curate a Semantic Model for ClickZetta Lakehouse
5
5
  License: Apache Software License; BSD License
6
6
  Author: qililiang
@@ -1,5 +1,5 @@
1
1
  semantic_model_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=CwmO0kjQRujtDBaikjUY2h9Y28i4Ui--ojtB5Nr7XJs,36033
2
+ semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=1EnNk6Xr9uwUGdiPvn_HeafxmHBQlkXEEAb3iYHKAnI,37383
3
3
  semantic_model_generator/clickzetta_utils/env_vars.py,sha256=8cbL6R75c1-aVQ2i1TDr9SiHCUjTrgvXbIRz4MbcmbE,7664
4
4
  semantic_model_generator/clickzetta_utils/utils.py,sha256=UBfWy9qOTyut8tL02gOHHbh6Uz8RqRz5Mm2YdKWFN54,4950
5
5
  semantic_model_generator/data_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,8 +17,8 @@ semantic_model_generator/protos/semantic_model.proto,sha256=WZiN4b8vR-ZX-Lj9Vsm6
17
17
  semantic_model_generator/protos/semantic_model_pb2.py,sha256=scbWkW-I-r3_hp_5SHoOWn02p52RJ9DJ0_-nRgr0LHc,25606
18
18
  semantic_model_generator/protos/semantic_model_pb2.pyi,sha256=iiBIZxtX9d6IuUO3aLcsJsHUeZqdi14vYNuUsSM8C0g,18267
19
19
  semantic_model_generator/relationships/__init__.py,sha256=I9-_QJdp36nEllzKTGXi2aWbRjiXrrexQXUfB6mi3Ww,477
20
- semantic_model_generator/relationships/discovery.py,sha256=aw3LrthDZ6ng9P5eI3noxw-1E30csYqe2kyGn6CpLZA,13125
21
- semantic_model_generator/tests/clickzetta_connector_test.py,sha256=Fdx7jooNt1lslKB2Ub51wqOZ8OM0osgZiDDl3bV6riw,3086
20
+ semantic_model_generator/relationships/discovery.py,sha256=JQ1uCMxdrXB66z5QuCSpeP3x8BxC7b9Q51zyrxckME4,13357
21
+ semantic_model_generator/tests/clickzetta_connector_test.py,sha256=e8sr5SzEhgSgshgyibcT_hS9geBbDQpd_6iADQDFj5w,5661
22
22
  semantic_model_generator/tests/cte_utils_test.py,sha256=_9GAJiOPGSagdWmQsoAEOOhEgsBY0LFlr_xtwrlgf4A,17561
23
23
  semantic_model_generator/tests/generate_model_classification_test.py,sha256=Amq29cmeKd0S7iVikJ60RFm9gpWaQv1TijXofp3J-lI,2275
24
24
  semantic_model_generator/tests/llm_enrichment_test.py,sha256=1avLrPWp7J7o_K3PKbI_PIvduM5Id21MmoL0JTeDTfs,15738
@@ -32,7 +32,7 @@ semantic_model_generator/validate/context_length.py,sha256=HL-GfaRXNcVji1-pAFGXG
32
32
  semantic_model_generator/validate/keywords.py,sha256=frZ5HjRXP69K6dYAU5_d86oSp40_3yoLUg1eQwU3oLM,7080
33
33
  semantic_model_generator/validate/schema.py,sha256=eL_wl5yscIeczwNBRUKhF_7QqWW2wSGimkgaOhMFsrA,5893
34
34
  semantic_model_generator/validate_model.py,sha256=Uq-V-GfPeF2Dy4l9uF5Guv104gDCDGh0Cxz1AJOu5dk,836
35
- clickzetta_semantic_model_generator-1.0.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
36
- clickzetta_semantic_model_generator-1.0.13.dist-info/METADATA,sha256=eWxK-U7IaduyEYAHMppEJx_DxDaTDdgQJPoed52fu-s,7817
37
- clickzetta_semantic_model_generator-1.0.13.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
38
- clickzetta_semantic_model_generator-1.0.13.dist-info/RECORD,,
35
+ clickzetta_semantic_model_generator-1.0.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
36
+ clickzetta_semantic_model_generator-1.0.15.dist-info/METADATA,sha256=4b41cAmodnWAFM_ci1AZoBqRtoBpA7SjA71nvp3HJcQ,7817
37
+ clickzetta_semantic_model_generator-1.0.15.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
38
+ clickzetta_semantic_model_generator-1.0.15.dist-info/RECORD,,
@@ -4,7 +4,7 @@ import concurrent.futures
4
4
  import re
5
5
  from collections import defaultdict
6
6
  from contextlib import contextmanager
7
- from typing import Any, Dict, Generator, List, Optional, TypeVar, Union
7
+ from typing import Any, Dict, Generator, List, Optional, Tuple, TypeVar, Union
8
8
 
9
9
  import pandas as pd
10
10
  from clickzetta.zettapark.session import Session
@@ -176,6 +176,25 @@ def _sanitize_identifier(value: Any, fallback: str = "") -> str:
176
176
  return normalized
177
177
 
178
178
 
179
+ def _split_identifier(
180
+ identifier: Any,
181
+ ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
182
+ """
183
+ Split a potentially qualified identifier into catalog, schema, and table parts.
184
+ Returns normalized segments without surrounding quotes/backticks.
185
+ """
186
+
187
+ text = normalize_identifier(identifier)
188
+ if not text:
189
+ return None, None, None
190
+ parts = [part.strip() for part in text.split(".") if part.strip()]
191
+ if len(parts) >= 3:
192
+ return parts[-3], parts[-2], parts[-1]
193
+ if len(parts) == 2:
194
+ return None, parts[0], parts[1]
195
+ return None, None, parts[0]
196
+
197
+
179
198
  def _normalize_column_type(raw: Any) -> str:
180
199
  if raw is None:
181
200
  return ""
@@ -449,8 +468,14 @@ def _build_information_schema_query(
449
468
  if table_schema:
450
469
  where_conditions.append(f"upper(t.table_schema) = '{table_schema.upper()}'")
451
470
  if table_names:
452
- formatted_names = ", ".join(f"'{name.upper()}'" for name in table_names)
453
- where_conditions.append(f"upper(t.table_name) IN ({formatted_names})")
471
+ normalized_names: List[str] = []
472
+ for name in table_names:
473
+ _, _, table_only = _split_identifier(name)
474
+ if table_only:
475
+ normalized_names.append(table_only.upper())
476
+ if normalized_names:
477
+ formatted_names = ", ".join(f"'{name}'" for name in normalized_names)
478
+ where_conditions.append(f"upper(t.table_name) IN ({formatted_names})")
454
479
 
455
480
  where_clause = " AND ".join(where_conditions)
456
481
  return f"""
@@ -490,22 +515,39 @@ def _fetch_columns_via_show(
490
515
  if not table_token:
491
516
  continue
492
517
 
518
+ override_catalog, override_schema, override_table = _split_identifier(table_token)
519
+ table_leaf = override_table or table_token
520
+ if not table_leaf:
521
+ continue
522
+
523
+ catalog_token = override_catalog or catalog
524
+ schema_token_override = override_schema or schema_token
525
+
493
526
  identifier_candidates: List[str] = []
494
- fully_qualified = join_quoted_identifiers(
495
- *(part for part in (catalog, schema_token, table_token) if part)
496
- )
497
- if fully_qualified:
498
- identifier_candidates.append(fully_qualified)
499
- schema_qualified = (
500
- join_quoted_identifiers(schema_token, table_token)
501
- if schema_token
502
- else ""
503
- )
504
- if schema_qualified:
505
- identifier_candidates.append(schema_qualified)
506
- bare_identifier = join_quoted_identifiers(table_token)
507
- if bare_identifier:
508
- identifier_candidates.append(bare_identifier)
527
+ seen_identifiers: set[str] = set()
528
+
529
+ def _add_identifier(parts: Tuple[str, ...], *, quoted: bool) -> None:
530
+ tokens = [part.strip() for part in parts if part and part.strip()]
531
+ if not tokens:
532
+ return
533
+ if quoted:
534
+ identifier = ".".join(quote_identifier(token) for token in tokens)
535
+ else:
536
+ identifier = ".".join(tokens)
537
+ if identifier and identifier not in seen_identifiers:
538
+ identifier_candidates.append(identifier)
539
+ seen_identifiers.add(identifier)
540
+
541
+ raw_parts = (catalog_token, schema_token_override, table_leaf)
542
+ schema_parts = (schema_token_override, table_leaf)
543
+ table_parts = (table_leaf,)
544
+
545
+ _add_identifier(raw_parts, quoted=False)
546
+ _add_identifier(schema_parts, quoted=False)
547
+ _add_identifier(table_parts, quoted=False)
548
+ _add_identifier(raw_parts, quoted=True)
549
+ _add_identifier(schema_parts, quoted=True)
550
+ _add_identifier(table_parts, quoted=True)
509
551
 
510
552
  df = pd.DataFrame()
511
553
  df_source = ""
@@ -573,10 +615,10 @@ def _fetch_columns_via_show(
573
615
  normalized[_TABLE_SCHEMA_COL] = (
574
616
  df[schema_col]
575
617
  if schema_col
576
- else (schema_token or table_schema or "")
618
+ else (schema_token_override or table_schema or "")
577
619
  )
578
620
  normalized[_TABLE_NAME_COL] = (
579
- df[table_col] if table_col else table_token
621
+ df[table_col] if table_col else table_leaf
580
622
  )
581
623
  normalized[_COLUMN_NAME_COL] = (
582
624
  df[column_col] if column_col else df.index.astype(str)
@@ -729,17 +771,11 @@ def fetch_tables_views_in_schema(
729
771
 
730
772
  try:
731
773
  if workspace_token and schema_token:
732
- if is_shared_catalog:
733
- scope = ".".join(
734
- part for part in (workspace_token, schema_token) if part
735
- )
736
- df = session.sql(f"SHOW TABLES IN SHARE {scope}").to_pandas()
737
- else:
738
- scope = join_quoted_identifiers(
739
- workspace_token,
740
- schema_token,
741
- )
742
- df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
774
+ scope = join_quoted_identifiers(
775
+ workspace_token,
776
+ schema_token,
777
+ )
778
+ df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
743
779
  else:
744
780
  df = session.sql("SHOW TABLES").to_pandas()
745
781
  except Exception as exc: # pragma: no cover
@@ -50,7 +50,15 @@ class RelationshipDiscoveryResult:
50
50
  def _normalize_table_names(table_names: Optional[Iterable[str]]) -> Optional[List[str]]:
51
51
  if table_names is None:
52
52
  return None
53
- return [name.upper() for name in table_names]
53
+ normalized: List[str] = []
54
+ for name in table_names:
55
+ parts = [
56
+ part.strip().strip("`").strip('"')
57
+ for part in str(name).split(".")
58
+ if part and part.strip()
59
+ ]
60
+ normalized.append(".".join(parts))
61
+ return normalized
54
62
 
55
63
 
56
64
  def _build_tables_from_dataframe(
@@ -86,3 +86,86 @@ def test_get_valid_columns_falls_back_to_show_columns():
86
86
  assert not df.empty
87
87
  assert df["TABLE_NAME"].iloc[0] == "PARTSUPP"
88
88
  assert df["COLUMN_NAME"].iloc[0] == "PS_PARTKEY"
89
+
90
+
91
+ def test_get_valid_columns_handles_fully_qualified_filters():
92
+ class DummyResult:
93
+ def __init__(self, df: pd.DataFrame):
94
+ self._df = df
95
+
96
+ def to_pandas(self) -> pd.DataFrame:
97
+ return self._df
98
+
99
+ table_df = pd.DataFrame(
100
+ {
101
+ "schema_name": ["S1"],
102
+ "table_name": ["TABLE_ONE"],
103
+ "column_name": ["ID"],
104
+ "data_type": ["INT"],
105
+ "comment": [""],
106
+ }
107
+ )
108
+
109
+ call_log: list[str] = []
110
+
111
+ def sql_side_effect(query: str):
112
+ call_log.append(query)
113
+ if "information_schema" in query:
114
+ raise RuntimeError("info schema unavailable")
115
+ if query == "SHOW COLUMNS IN TEST_WS.S1.TABLE_ONE":
116
+ return DummyResult(table_df)
117
+ raise RuntimeError("unsupported query")
118
+
119
+ session = mock.MagicMock()
120
+ session.sql.side_effect = sql_side_effect
121
+ connector._CATALOG_CATEGORY_CACHE.clear()
122
+
123
+ df = connector.get_valid_schemas_tables_columns_df(
124
+ session=session,
125
+ workspace="TEST_WS",
126
+ table_schema="S1",
127
+ table_names=["TEST_WS.S1.TABLE_ONE"],
128
+ )
129
+
130
+ assert not df.empty
131
+ assert any("SHOW COLUMNS IN TEST_WS.S1.TABLE_ONE" in q for q in call_log)
132
+ assert all("TEST_WS.S1.TEST_WS.S1" not in q for q in call_log)
133
+
134
+
135
+ def test_fetch_tables_views_in_schema_shared_catalog_does_not_use_share_clause():
136
+ class DummyResult:
137
+ def __init__(self, df: pd.DataFrame):
138
+ self._df = df
139
+
140
+ def to_pandas(self) -> pd.DataFrame:
141
+ return self._df
142
+
143
+ tables_df = pd.DataFrame(
144
+ {
145
+ "workspace_name": ["lakehouse_ai"],
146
+ "schema_name": ["schema_for_opencatalog"],
147
+ "table_name": ["czcustomer"],
148
+ "is_view": [False],
149
+ "is_materialized_view": [False],
150
+ }
151
+ )
152
+
153
+ executed_queries: list[str] = []
154
+
155
+ def sql_side_effect(query: str):
156
+ executed_queries.append(query)
157
+ if query.startswith("SHOW TABLES IN"):
158
+ return DummyResult(tables_df)
159
+ raise RuntimeError("Unexpected query")
160
+
161
+ session = mock.MagicMock()
162
+ session.sql.side_effect = sql_side_effect
163
+ connector._CATALOG_CATEGORY_CACHE.clear()
164
+
165
+ with mock.patch.object(connector, "_catalog_category", return_value="SHARED"):
166
+ tables = connector.fetch_tables_views_in_schema(
167
+ session=session, schema_name="lakehouse_ai.schema_for_opencatalog"
168
+ )
169
+
170
+ assert tables == ["lakehouse_ai.schema_for_opencatalog.czcustomer"]
171
+ assert all("IN SHARE" not in query for query in executed_queries)