clickzetta-semantic-model-generator 1.0.13__tar.gz → 1.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/PKG-INFO +1 -1
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/pyproject.toml +1 -1
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +67 -31
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/relationships/discovery.py +9 -1
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/clickzetta_connector_test.py +83 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/README.md +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/clickzetta_utils/env_vars.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/clickzetta_utils/utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/data_processing/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/data_processing/cte_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/data_processing/data_types.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/data_processing/proto_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/generate_model.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/llm/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/llm/dashscope_client.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/llm/enrichment.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/llm/progress_tracker.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/output_models/.keep +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/protos/semantic_model.proto +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/relationships/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/generate_model_classification_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/llm_enrichment_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/relationship_discovery_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/relationships_filters_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/validate_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/validate/context_length.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/validate/keywords.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/validate/schema.py +0 -0
- {clickzetta_semantic_model_generator-1.0.13 → clickzetta_semantic_model_generator-1.0.15}/semantic_model_generator/validate_model.py +0 -0
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clickzetta-semantic-model-generator"
-version = "1.0.13"
+version = "1.0.15"
 description = "Curate a Semantic Model for ClickZetta Lakehouse"
 authors = ["qililiang <qililiang@clickzetta.com>"]
 license = "Apache Software License; BSD License"
semantic_model_generator/clickzetta_utils/clickzetta_connector.py
@@ -4,7 +4,7 @@ import concurrent.futures
 import re
 from collections import defaultdict
 from contextlib import contextmanager
-from typing import Any, Dict, Generator, List, Optional, TypeVar, Union
+from typing import Any, Dict, Generator, List, Optional, Tuple, TypeVar, Union
 
 import pandas as pd
 from clickzetta.zettapark.session import Session
@@ -176,6 +176,25 @@ def _sanitize_identifier(value: Any, fallback: str = "") -> str:
     return normalized
 
 
+def _split_identifier(
+    identifier: Any,
+) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+    """
+    Split a potentially qualified identifier into catalog, schema, and table parts.
+    Returns normalized segments without surrounding quotes/backticks.
+    """
+
+    text = normalize_identifier(identifier)
+    if not text:
+        return None, None, None
+    parts = [part.strip() for part in text.split(".") if part.strip()]
+    if len(parts) >= 3:
+        return parts[-3], parts[-2], parts[-1]
+    if len(parts) == 2:
+        return None, parts[0], parts[1]
+    return None, None, parts[0]
+
+
 def _normalize_column_type(raw: Any) -> str:
     if raw is None:
         return ""
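The helper above keeps only the last three dot-separated segments, so bare, schema-qualified, and fully qualified names all resolve to the same table leaf. A minimal standalone sketch of that behavior (the hypothetical split_identifier below only strips surrounding quotes and backticks; the real helper delegates to normalize_identifier, which is not reproduced here):

    from typing import Any, Optional, Tuple

    def split_identifier(identifier: Any) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        # Simplified stand-in for normalize_identifier: strip quotes/backticks only.
        text = str(identifier or "").strip().strip('`"')
        if not text:
            return None, None, None
        parts = [part.strip().strip('`"') for part in text.split(".") if part.strip()]
        if len(parts) >= 3:
            return parts[-3], parts[-2], parts[-1]
        if len(parts) == 2:
            return None, parts[0], parts[1]
        return None, None, parts[0]

    print(split_identifier("TEST_WS.S1.TABLE_ONE"))  # ('TEST_WS', 'S1', 'TABLE_ONE')
    print(split_identifier("S1.TABLE_ONE"))          # (None, 'S1', 'TABLE_ONE')
    print(split_identifier("TABLE_ONE"))             # (None, None, 'TABLE_ONE')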
@@ -449,8 +468,14 @@ def _build_information_schema_query(
     if table_schema:
         where_conditions.append(f"upper(t.table_schema) = '{table_schema.upper()}'")
     if table_names:
-        …
-        …
+        normalized_names: List[str] = []
+        for name in table_names:
+            _, _, table_only = _split_identifier(name)
+            if table_only:
+                normalized_names.append(table_only.upper())
+        if normalized_names:
+            formatted_names = ", ".join(f"'{name}'" for name in normalized_names)
+            where_conditions.append(f"upper(t.table_name) IN ({formatted_names})")
 
     where_clause = " AND ".join(where_conditions)
     return f"""
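The effect is that fully qualified filter names are reduced to bare table names before they reach the information_schema predicate, which is what the new test asserts when it checks that no "TEST_WS.S1.TEST_WS.S1" string appears in any query. A rough standalone rendering of the resulting IN clause, using a hypothetical table_leaf helper in place of _split_identifier:

    table_names = ["TEST_WS.S1.TABLE_ONE", "partsupp"]

    def table_leaf(name: str) -> str:
        # Keep only the last dot-separated segment, without quotes/backticks.
        return name.split(".")[-1].strip().strip('`"')

    normalized_names = [table_leaf(n).upper() for n in table_names if table_leaf(n)]
    formatted_names = ", ".join(f"'{n}'" for n in normalized_names)
    print(f"upper(t.table_name) IN ({formatted_names})")
    # -> upper(t.table_name) IN ('TABLE_ONE', 'PARTSUPP')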
@@ -490,22 +515,39 @@ def _fetch_columns_via_show(
         if not table_token:
             continue
 
+        override_catalog, override_schema, override_table = _split_identifier(table_token)
+        table_leaf = override_table or table_token
+        if not table_leaf:
+            continue
+
+        catalog_token = override_catalog or catalog
+        schema_token_override = override_schema or schema_token
+
         identifier_candidates: List[str] = []
-        …
-        )
-        …
-        else
-        …
+        seen_identifiers: set[str] = set()
+
+        def _add_identifier(parts: Tuple[str, ...], *, quoted: bool) -> None:
+            tokens = [part.strip() for part in parts if part and part.strip()]
+            if not tokens:
+                return
+            if quoted:
+                identifier = ".".join(quote_identifier(token) for token in tokens)
+            else:
+                identifier = ".".join(tokens)
+            if identifier and identifier not in seen_identifiers:
+                identifier_candidates.append(identifier)
+                seen_identifiers.add(identifier)
+
+        raw_parts = (catalog_token, schema_token_override, table_leaf)
+        schema_parts = (schema_token_override, table_leaf)
+        table_parts = (table_leaf,)
+
+        _add_identifier(raw_parts, quoted=False)
+        _add_identifier(schema_parts, quoted=False)
+        _add_identifier(table_parts, quoted=False)
+        _add_identifier(raw_parts, quoted=True)
+        _add_identifier(schema_parts, quoted=True)
+        _add_identifier(table_parts, quoted=True)
 
         df = pd.DataFrame()
         df_source = ""
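The candidate list built above means SHOW COLUMNS is attempted against progressively shorter scopes: fully qualified, then schema-qualified, then bare, each in unquoted and then quoted form, with duplicates suppressed. A small standalone sketch of that ordering, assuming quote_identifier simply wraps a token in backticks (the real helper may escape differently):

    from typing import Optional, Tuple

    def quote_identifier(token: str) -> str:
        # Assumed backtick quoting for illustration only.
        return f"`{token}`"

    def candidate_identifiers(parts: Tuple[Optional[str], Optional[str], Optional[str]]) -> list[str]:
        candidates: list[str] = []
        seen: set[str] = set()

        def add(tokens: Tuple[Optional[str], ...], quoted: bool) -> None:
            kept = [t.strip() for t in tokens if t and t.strip()]
            if not kept:
                return
            ident = ".".join(quote_identifier(t) if quoted else t for t in kept)
            if ident not in seen:
                candidates.append(ident)
                seen.add(ident)

        catalog, schema, table = parts
        for quoted in (False, True):
            add((catalog, schema, table), quoted)
            add((schema, table), quoted)
            add((table,), quoted)
        return candidates

    print(candidate_identifiers(("TEST_WS", "S1", "TABLE_ONE")))
    # ['TEST_WS.S1.TABLE_ONE', 'S1.TABLE_ONE', 'TABLE_ONE',
    #  '`TEST_WS`.`S1`.`TABLE_ONE`', '`S1`.`TABLE_ONE`', '`TABLE_ONE`']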
@@ -573,10 +615,10 @@ def _fetch_columns_via_show(
         normalized[_TABLE_SCHEMA_COL] = (
             df[schema_col]
             if schema_col
-            else (…
+            else (schema_token_override or table_schema or "")
         )
         normalized[_TABLE_NAME_COL] = (
-            df[table_col] if table_col else …
+            df[table_col] if table_col else table_leaf
         )
         normalized[_COLUMN_NAME_COL] = (
             df[column_col] if column_col else df.index.astype(str)
@@ -729,17 +771,11 @@ def fetch_tables_views_in_schema(
 
     try:
         if workspace_token and schema_token:
-            …
-            …
-            …
-            …
-            …
-            else:
-                scope = join_quoted_identifiers(
-                    workspace_token,
-                    schema_token,
-                )
-                df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
+            scope = join_quoted_identifiers(
+                workspace_token,
+                schema_token,
+            )
+            df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
         else:
             df = session.sql("SHOW TABLES").to_pandas()
     except Exception as exc:  # pragma: no cover
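This branch now always issues a plain scoped SHOW TABLES statement; the shared-catalog special case is gone, which the second new test below verifies by asserting that no "IN SHARE" clause is executed. A quick sketch of the statement the branch produces, assuming join_quoted_identifiers backtick-quotes each part and joins them with dots (the real quoting rules may differ):

    def join_quoted_identifiers(*parts: str) -> str:
        # Illustrative assumption about the helper's quoting behavior.
        return ".".join(f"`{p}`" for p in parts)

    scope = join_quoted_identifiers("lakehouse_ai", "schema_for_opencatalog")
    print(f"SHOW TABLES IN {scope}")
    # -> SHOW TABLES IN `lakehouse_ai`.`schema_for_opencatalog`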
semantic_model_generator/relationships/discovery.py
@@ -50,7 +50,15 @@ class RelationshipDiscoveryResult:
 def _normalize_table_names(table_names: Optional[Iterable[str]]) -> Optional[List[str]]:
     if table_names is None:
         return None
-    …
+    normalized: List[str] = []
+    for name in table_names:
+        parts = [
+            part.strip().strip("`").strip('"')
+            for part in str(name).split(".")
+            if part and part.strip()
+        ]
+        normalized.append(".".join(parts))
+    return normalized
 
 
 def _build_tables_from_dataframe(
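The new normalization strips backticks and double quotes from each segment and rejoins with dots, so differently quoted spellings of the same table compare equal during relationship discovery. A minimal standalone sketch of that per-name behavior (normalize_table_name is a hypothetical mirror of the loop body above):

    def normalize_table_name(name: str) -> str:
        parts = [
            part.strip().strip("`").strip('"')
            for part in str(name).split(".")
            if part and part.strip()
        ]
        return ".".join(parts)

    print(normalize_table_name('`lakehouse_ai`."schema_for_opencatalog".czcustomer'))
    # -> lakehouse_ai.schema_for_opencatalog.czcustomer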
semantic_model_generator/tests/clickzetta_connector_test.py
@@ -86,3 +86,86 @@ def test_get_valid_columns_falls_back_to_show_columns():
     assert not df.empty
     assert df["TABLE_NAME"].iloc[0] == "PARTSUPP"
     assert df["COLUMN_NAME"].iloc[0] == "PS_PARTKEY"
+
+
+def test_get_valid_columns_handles_fully_qualified_filters():
+    class DummyResult:
+        def __init__(self, df: pd.DataFrame):
+            self._df = df
+
+        def to_pandas(self) -> pd.DataFrame:
+            return self._df
+
+    table_df = pd.DataFrame(
+        {
+            "schema_name": ["S1"],
+            "table_name": ["TABLE_ONE"],
+            "column_name": ["ID"],
+            "data_type": ["INT"],
+            "comment": [""],
+        }
+    )
+
+    call_log: list[str] = []
+
+    def sql_side_effect(query: str):
+        call_log.append(query)
+        if "information_schema" in query:
+            raise RuntimeError("info schema unavailable")
+        if query == "SHOW COLUMNS IN TEST_WS.S1.TABLE_ONE":
+            return DummyResult(table_df)
+        raise RuntimeError("unsupported query")
+
+    session = mock.MagicMock()
+    session.sql.side_effect = sql_side_effect
+    connector._CATALOG_CATEGORY_CACHE.clear()
+
+    df = connector.get_valid_schemas_tables_columns_df(
+        session=session,
+        workspace="TEST_WS",
+        table_schema="S1",
+        table_names=["TEST_WS.S1.TABLE_ONE"],
+    )
+
+    assert not df.empty
+    assert any("SHOW COLUMNS IN TEST_WS.S1.TABLE_ONE" in q for q in call_log)
+    assert all("TEST_WS.S1.TEST_WS.S1" not in q for q in call_log)
+
+
+def test_fetch_tables_views_in_schema_shared_catalog_does_not_use_share_clause():
+    class DummyResult:
+        def __init__(self, df: pd.DataFrame):
+            self._df = df
+
+        def to_pandas(self) -> pd.DataFrame:
+            return self._df
+
+    tables_df = pd.DataFrame(
+        {
+            "workspace_name": ["lakehouse_ai"],
+            "schema_name": ["schema_for_opencatalog"],
+            "table_name": ["czcustomer"],
+            "is_view": [False],
+            "is_materialized_view": [False],
+        }
+    )
+
+    executed_queries: list[str] = []
+
+    def sql_side_effect(query: str):
+        executed_queries.append(query)
+        if query.startswith("SHOW TABLES IN"):
+            return DummyResult(tables_df)
+        raise RuntimeError("Unexpected query")
+
+    session = mock.MagicMock()
+    session.sql.side_effect = sql_side_effect
+    connector._CATALOG_CATEGORY_CACHE.clear()
+
+    with mock.patch.object(connector, "_catalog_category", return_value="SHARED"):
+        tables = connector.fetch_tables_views_in_schema(
+            session=session, schema_name="lakehouse_ai.schema_for_opencatalog"
+        )
+
+    assert tables == ["lakehouse_ai.schema_for_opencatalog.czcustomer"]
+    assert all("IN SHARE" not in query for query in executed_queries)
LICENSE, README.md, and every other +0 -0 entry in the file list above were renamed into the 1.0.15 directory with no content changes.