clickzetta-semantic-model-generator 1.0.3__tar.gz → 1.0.5__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/PKG-INFO +1 -1
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/pyproject.toml +1 -1
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +63 -42
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/utils.py +44 -2
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils.py +43 -13
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/generate_model.py +126 -18
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/relationships/__init__.py +2 -0
- clickzetta_semantic_model_generator-1.0.5/semantic_model_generator/relationships/discovery.py +402 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/cte_utils_test.py +14 -13
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationship_discovery_test.py +108 -0
- clickzetta_semantic_model_generator-1.0.3/semantic_model_generator/relationships/discovery.py +0 -202
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/README.md +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/env_vars.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/data_types.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/proto_utils.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/__init__.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/dashscope_client.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/enrichment.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/progress_tracker.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/output_models/.keep +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model.proto +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/clickzetta_connector_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/generate_model_classification_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/llm_enrichment_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationships_filters_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/utils_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/validate_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/context_length.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/keywords.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/schema.py +0 -0
- {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate_model.py +0 -0
pyproject.toml:

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clickzetta-semantic-model-generator"
-version = "1.0.3"
+version = "1.0.5"
 description = "Curate a Semantic Model for ClickZetta Lakehouse"
 authors = ["qililiang <qililiang@clickzetta.com>"]
 license = "Apache Software License; BSD License"
semantic_model_generator/clickzetta_utils/clickzetta_connector.py:

@@ -11,7 +11,12 @@ from clickzetta.zettapark.session import Session
 from loguru import logger
 
 from semantic_model_generator.clickzetta_utils import env_vars
-from semantic_model_generator.clickzetta_utils.utils import create_session
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_session,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing.data_types import Column, Table
 
 ConnectionType = TypeVar("ConnectionType", bound=Session)
@@ -151,18 +156,8 @@ class ClickzettaConnectionProxy:
         self.session.close()
 
 
-def _quote_identifier(name: str) -> str:
-    return f'"{name}"'
-
-
 def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
-    return ".".join(
-        [
-            _quote_identifier(workspace),
-            _quote_identifier(schema_name),
-            _quote_identifier(table_name),
-        ]
-    )
+    return join_quoted_identifiers(workspace, schema_name, table_name)
 
 
 def _value_is_true(value: Any) -> bool:
@@ -175,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
 
 
 def _sanitize_identifier(value: Any, fallback: str = "") -> str:
-    if value is None:
+    normalized = normalize_identifier(value)
+    if not normalized:
         return fallback
-    normalized = str(value).strip()
-    if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
-        normalized = normalized[1:-1]
     return normalized
 
 
@@ -216,21 +209,19 @@ def _fetch_distinct_values(
     column_name: str,
     ndv: int,
 ) -> Optional[List[str]]:
-    workspace_part = (
-        _sanitize_identifier(workspace, workspace).upper() if workspace else ""
-    )
+    workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
     schema_part = (
-        _sanitize_identifier(schema_name, schema_name)
+        _sanitize_identifier(schema_name, schema_name) if schema_name else ""
     )
-    table_part = _sanitize_identifier(table_name, table_name)
-    column_part = _sanitize_identifier(column_name, column_name)
+    table_part = _sanitize_identifier(table_name, table_name)
+    column_part = _sanitize_identifier(column_name, column_name)
 
-    …
-    …
-    …
-    …
+    qualified_table = join_quoted_identifiers(
+        workspace_part, schema_part, table_part
+    )
+    column_expr = quote_identifier(column_part)
 
-    query = f"SELECT DISTINCT {…
+    query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
     try:
         df = session.sql(query).to_pandas()
         if df.empty:
@@ -489,15 +480,30 @@ def _fetch_columns_via_show(
         return pd.DataFrame()
 
     rows: List[pd.DataFrame] = []
-    …
-    …
+    category = _catalog_category(session, workspace)
+    is_shared_catalog = category in {"SHARED", "EXTERNAL"}
+    catalog = workspace if is_shared_catalog else workspace.upper()
+    schema = (
+        table_schema or ""
+    )
+    if schema and not is_shared_catalog:
+        schema = schema.upper()
 
     for table_name in table_names:
         qualified_parts = [
-            part
+            part
+            for part in (
+                catalog,
+                schema,
+                table_name.upper() if not is_shared_catalog else table_name,
+            )
+            if part
         ]
         qualified_table = ".".join(qualified_parts)
-        …
+        if is_shared_catalog:
+            query = f"SHOW COLUMNS IN SHARE {qualified_table}"
+        else:
+            query = f"SHOW COLUMNS IN {qualified_table}"
         try:
             df = session.sql(query).to_pandas()
         except Exception as exc:
@@ -655,14 +661,25 @@ def fetch_tables_views_in_schema(
     parts = schema_name.split(".", maxsplit=1)
     workspace = parts[0]
     schema = parts[1] if len(parts) > 1 else ""
-    …
-    …
+    category = _catalog_category(session, workspace)
+    is_shared_catalog = category in {"SHARED", "EXTERNAL"}
+
+    workspace_token = workspace if is_shared_catalog else workspace.upper()
+    schema_token = schema if is_shared_catalog else schema.upper()
 
     try:
-        if …
-        …
-        …
-        …
+        if workspace_token and schema_token:
+            if is_shared_catalog:
+                scope = ".".join(
+                    part for part in (workspace_token, schema_token) if part
+                )
+                df = session.sql(f"SHOW TABLES IN SHARE {scope}").to_pandas()
+            else:
+                scope = join_quoted_identifiers(
+                    workspace_token,
+                    schema_token,
+                )
+                df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
         else:
             df = session.sql("SHOW TABLES").to_pandas()
     except Exception as exc:  # pragma: no cover
@@ -738,11 +755,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
 
     queries: List[str] = []
     if schema:
-        …
-        …
+        scope = join_quoted_identifiers(workspace, schema)
+        if scope:
+            queries.append(f"SHOW VOLUMES IN {scope}")
+            queries.append(f"SHOW STAGES IN SCHEMA {scope}")
     else:
-        …
-        …
+        workspace_identifier = quote_identifier(workspace)
+        if workspace_identifier:
+            queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
+            queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
 
     stage_names: List[str] = ["volume:user://~/semantic_models/"]
     seen: set[str] = set(stage_names)
@@ -899,7 +920,7 @@ def create_table_in_schema(
     columns_schema: Dict[str, str],
 ) -> bool:
     fields = ", ".join(
-        f"{…
+        f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
     )
     query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
     try:
semantic_model_generator/clickzetta_utils/utils.py:

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from contextlib import contextmanager
-from typing import Dict, Iterable
+from typing import Any, Dict, Iterable
 
 from clickzetta.zettapark.session import Session
 
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
 }
 
 
+def normalize_identifier(value: Any) -> str:
+    """
+    Strips outer quotes/backticks and surrounding whitespace from an identifier.
+    Returns an empty string when the identifier is missing.
+    """
+
+    if value is None:
+        return ""
+    text = str(value).strip()
+    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
+        return text[1:-1]
+    return text
+
+
+def quote_identifier(value: Any) -> str:
+    """
+    Wraps an identifier in backticks, escaping embedded backticks as needed.
+    Returns an empty string if the identifier is missing.
+    """
+
+    normalized = normalize_identifier(value)
+    if not normalized:
+        return ""
+    escaped = normalized.replace("`", "``")
+    return f"`{escaped}`"
+
+
+def join_quoted_identifiers(*parts: Any) -> str:
+    """
+    Joins identifier parts with '.' and ensures each segment is backtick-quoted.
+    Empty segments are skipped.
+    """
+
+    quoted_parts = [
+        quote_identifier(part)
+        for part in parts
+        if normalize_identifier(part)
+    ]
+    return ".".join(part for part in quoted_parts if part)
+
+
 def create_fqn_table(fqn_str: str) -> FQNParts:
     """
     Splits a fully qualified table name into its ClickZetta components.
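The three helpers above centralize identifier handling that was previously duplicated across modules. A minimal sketch of their expected behavior, copied from the definitions in this hunk so it can be run standalone:

```python
from typing import Any


def normalize_identifier(value: Any) -> str:
    # Strip whitespace, then one matching pair of outer quotes/backticks.
    if value is None:
        return ""
    text = str(value).strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', "`"}:
        return text[1:-1]
    return text


def quote_identifier(value: Any) -> str:
    # Backtick-quote, doubling any embedded backticks.
    normalized = normalize_identifier(value)
    if not normalized:
        return ""
    return f"`{normalized.replace('`', '``')}`"


def join_quoted_identifiers(*parts: Any) -> str:
    # Dot-join the non-empty segments, each backtick-quoted.
    return ".".join(
        quote_identifier(part) for part in parts if normalize_identifier(part)
    )


assert normalize_identifier('  "MY_SCHEMA"  ') == "MY_SCHEMA"
assert quote_identifier("my`table") == "`my``table`"
# Empty segments drop out, so a missing schema collapses cleanly:
assert join_quoted_identifiers("ws", "", "t") == "`ws`.`t`"
```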
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> None:
         ("schema", schema),
         ("vcluster", vcluster),
     ):
-        …
+        identifier = quote_identifier(value)
+        session.sql(f"USE {component.upper()} {identifier}")
 
 
 def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
semantic_model_generator/data_processing/cte_utils.py:

@@ -11,12 +11,34 @@ from sqlglot import Dialect
 from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     OBJECT_DATATYPES,
 )
+from semantic_model_generator.clickzetta_utils.utils import (
+    join_quoted_identifiers,
+    normalize_identifier,
+)
 from semantic_model_generator.protos import semantic_model_pb2
 
 _SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
 ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
 
 _LOGICAL_TABLE_PREFIX = "__"
+_SQLGLOT_QUOTE_CHAR = '"'
+
+
+def _prepare_sql_for_parsing(sql: str) -> str:
+    """
+    Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
+    """
+
+    return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
+
+
+def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
+    """
+    Renders a SQLGlot expression in the ClickZetta dialect and rewrites identifiers with backticks.
+    """
+
+    rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
+    return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
 
 
 def is_logical_table(table_name: str) -> bool:
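Because the resolved sqlglot dialect expects double-quoted identifiers, the two shims above translate between ClickZetta's backticks and the parser's quoting at the boundary. A small round-trip sketch of the same pattern (assuming sqlglot is installed; string literals containing quote characters would need more care than this plain text replace):

```python
import sqlglot
from sqlglot import Dialect

# The module builds its dialect key as "snow" + "flake" and resolves it here.
ClickzettaDialect = Dialect.get_or_raise("snowflake")


def prepare_sql_for_parsing(sql: str) -> str:
    # Backticks -> double quotes so the parser sees standard quoting.
    return sql.replace("`", '"')


def render_clickzetta_sql(expression: sqlglot.Expression, pretty: bool = False) -> str:
    # Render in the resolved dialect, then map double quotes back to backticks.
    return expression.sql(dialect=ClickzettaDialect, pretty=pretty).replace('"', "`")


sql = "SELECT `db`.`sch`.`tbl`.`col` FROM `db`.`sch`.`tbl`"
ast = sqlglot.parse_one(prepare_sql_for_parsing(sql), dialect=ClickzettaDialect)
print(render_clickzetta_sql(ast))
# Expected: SELECT `db`.`sch`.`tbl`.`col` FROM `db`.`sch`.`tbl`
```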
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
 
 def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
     """Returns fully qualified table name such as my_db.my_schema.my_table"""
-    …
-    …
-    …
-    …
-    …
-    return …
+    parts = [
+        normalize_identifier(component)
+        for component in (table.database, table.schema, table.table)
+        if component
+    ]
+    return join_quoted_identifiers(*parts)  # type: ignore[no-any-return]
 
 
 def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
@@ -156,8 +178,8 @@ def _generate_cte_for(
     cte = f"WITH {logical_table_name(table)} AS (\n"
     cte += "SELECT \n"
     cte += ",\n".join(expr_columns) + "\n"
-    cte += f"FROM {fully_qualified_table_name(table.base_table)}"
-    cte += ")"
+    cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
+    cte += ")\n"
     return cte
 
 
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
         str: The SQL statement in ClickZetta syntax.
     """
     try:
-        expression = sqlglot.parse_one(…
+        expression = sqlglot.parse_one(
+            _prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
+        )
     except Exception as e:
         raise ValueError(
             f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
         )
 
-    return expression
+    return _render_clickzetta_sql(expression)
 
 
 def generate_select(
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
     for cte in ctes:
         new_withs.append(
             sqlglot.parse_one(
-                cte,
+                _prepare_sql_for_parsing(cte),
+                read=ClickzettaDialect,
+                into=sqlglot.expressions.With,
             )
         )
 
     # Step 3: Prefix the CTEs to the original query.
-    ast = sqlglot.parse_one(…
+    ast = sqlglot.parse_one(
+        _prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
+    )
     with_ = ast.args.get("with")
     # If the query doesn't have a WITH clause, then generate one.
     if with_ is None:
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
     else:
         new_ctes = [w.expressions[0] for w in new_withs]
         with_.set("expressions", new_ctes + with_.expressions)
-    return …
+    return _render_clickzetta_sql(
+        ast, pretty=True
+    )  # type: ignore [no-any-return]
 
 
 def context_to_column_format(
semantic_model_generator/generate_model.py:

@@ -1,6 +1,7 @@
 import math
 import os
 import re
+import time
 from collections import defaultdict
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -17,7 +18,12 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
     get_table_representation,
     get_valid_schemas_tables_columns_df,
 )
-from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
+from semantic_model_generator.clickzetta_utils.utils import (
+    create_fqn_table,
+    join_quoted_identifiers,
+    normalize_identifier,
+    quote_identifier,
+)
 from semantic_model_generator.data_processing import data_types, proto_utils
 from semantic_model_generator.llm import (
     DashscopeClient,
@@ -41,6 +47,14 @@ _AUTOGEN_COMMENT_TOKEN = (
 )
 _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
 _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
+_GENERIC_IDENTIFIER_TOKENS = {
+    "ID",
+    "NAME",
+    "CODE",
+    "KEY",
+    "VALUE",
+    "NUMBER",
+}
 
 
 def _singularize(token: str) -> str:
@@ -90,6 +104,14 @@ def _identifier_tokens(
     return tokens
 
 
+def _is_generic_identifier(name: str) -> bool:
+    tokens = [token for token in _identifier_tokens(name) if token]
+    if not tokens:
+        return True
+    normalized_tokens = {token.upper() for token in tokens}
+    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
+
+
 def _sanitize_identifier_name(
     name: str, prefixes_to_drop: Optional[set[str]] = None
 ) -> str:
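`_is_generic_identifier` flags column names whose every token is one of the placeholder words in `_GENERIC_IDENTIFIER_TOKENS`. The diff does not show `_identifier_tokens`, so the sketch below substitutes a plausible splitter to make the check runnable:

```python
import re

_GENERIC_IDENTIFIER_TOKENS = {"ID", "NAME", "CODE", "KEY", "VALUE", "NUMBER"}


def _identifier_tokens(name: str) -> list[str]:
    # Hypothetical stand-in for the real tokenizer: split on
    # non-alphanumeric boundaries such as underscores and dots.
    return [t for t in re.split(r"[^A-Za-z0-9]+", name) if t]


def _is_generic_identifier(name: str) -> bool:
    tokens = [token for token in _identifier_tokens(name) if token]
    if not tokens:
        return True
    normalized_tokens = {token.upper() for token in tokens}
    return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)


assert _is_generic_identifier("ID")                # nothing but placeholder words
assert _is_generic_identifier("key_value")         # every token is generic
assert not _is_generic_identifier("customer_id")   # "CUSTOMER" anchors the name
```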
@@ -354,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
 
 def _format_sql_identifier(name: str) -> str:
     """
-    Formats an identifier for SQL…
+    Formats an identifier for SQL by wrapping it in backticks.
     """
-    …
-    return ""
-    return str(name).replace('"', "").replace("`", "").strip().upper()
+    return quote_identifier(name)
 
 
 def _qualified_table_name(fqn: data_types.FQNParts) -> str:
     """
-    Builds a fully qualified table name…
+    Builds a fully qualified, backtick-quoted table name.
     """
-    parts = [part for part in (fqn.database, fqn.schema_name, fqn.table)…
-    return …
+    parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
+    return join_quoted_identifiers(*(part for part in parts if part))
 
 
 def _levenshtein_distance(s1: str, s2: str) -> int:
@@ -977,6 +997,19 @@ def _calculate_relationship_confidence(
 
     confidence_score += name_confidence
 
+    generic_pair_count = sum(
+        1
+        for left_col, right_col in column_pairs
+        if _is_generic_identifier(left_col)
+        and _is_generic_identifier(right_col)
+    )
+    if generic_pair_count:
+        penalty = min(0.15 * generic_pair_count, 0.3)
+        confidence_score = max(confidence_score - penalty, 0.0)
+        reasoning_factors.append(
+            f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
+        )
+
     # Check for foreign key naming patterns
     fk_pattern_confidence = 0.0
     for left_col, right_col in column_pairs:
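The penalty grows with the number of column pairs that are generic on both sides but is capped, so a join inferred purely from names like `ID = ID` can lose at most 0.3 of its confidence score:

```python
def generic_penalty(generic_pair_count: int) -> float:
    # Mirrors the hunk above: 0.15 per all-generic pair, capped at 0.3;
    # the adjusted score is then floored at 0.0.
    return min(0.15 * generic_pair_count, 0.3)


assert generic_penalty(1) == 0.15
assert generic_penalty(2) == 0.3   # cap reached at two pairs
assert generic_penalty(5) == 0.3   # never exceeds the cap
assert max(0.25 - generic_penalty(2), 0.0) == 0.0  # score cannot go negative
```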
@@ -2326,11 +2359,31 @@ def _infer_relationships(
     *,
     session: Optional[Session] = None,
     strict_join_inference: bool = False,
+    status: Optional[Dict[str, bool]] = None,
+    max_relationships: Optional[int] = None,
+    min_confidence: float = 0.2,
+    timeout_seconds: Optional[float] = None,
 ) -> List[semantic_model_pb2.Relationship]:
+    status_dict = status if status is not None else {}
+    if "limited_by_timeout" not in status_dict:
+        status_dict["limited_by_timeout"] = False
+    if "limited_by_max_relationships" not in status_dict:
+        status_dict["limited_by_max_relationships"] = False
+
     relationships: List[semantic_model_pb2.Relationship] = []
     if not raw_tables:
         return relationships
 
+    start_time = time.perf_counter()
+    min_confidence = max(0.0, min(min_confidence, 1.0))
+    limit_reached = False
+
+    def _timed_out() -> bool:
+        return (
+            timeout_seconds is not None
+            and (time.perf_counter() - start_time) >= timeout_seconds
+        )
+
     metadata = {}
     prefix_counter: Dict[str, int] = {}
     for _, raw_table in raw_tables:
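The new budget is measured from function entry with a monotonic clock, and the closure is consulted before each unit of work rather than interrupting anything mid-flight. The pattern in isolation:

```python
import time
from typing import Callable, Optional


def make_timed_out(timeout_seconds: Optional[float]) -> Callable[[], bool]:
    # Same shape as the _timed_out closure above: None disables the deadline.
    start_time = time.perf_counter()

    def _timed_out() -> bool:
        return (
            timeout_seconds is not None
            and (time.perf_counter() - start_time) >= timeout_seconds
        )

    return _timed_out


timed_out = make_timed_out(0.05)
assert not timed_out()      # fresh budget
time.sleep(0.06)
assert timed_out()          # deadline passed; callers set the status flag and break
no_deadline = make_timed_out(None)
assert not no_deadline()    # a None timeout never trips
```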
@@ -2392,14 +2445,39 @@ def _infer_relationships(
     def _record_pair(
         left_table: str, right_table: str, left_col: str, right_col: str
     ) -> None:
+        nonlocal limit_reached
+        if limit_reached:
+            return
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            limit_reached = True
+            return
+
         key = (left_table, right_table)
         value = (left_col, right_col)
-        …
-        …
+        bucket = pairs.setdefault(key, [])
+        if value not in bucket:
+            bucket.append(value)
+        if (
+            max_relationships is not None
+            and len(pairs) >= max_relationships
+        ):
+            status_dict["limited_by_max_relationships"] = True
+            limit_reached = True
 
     table_names = list(metadata.keys())
     for i in range(len(table_names)):
+        if limit_reached or status_dict["limited_by_timeout"]:
+            break
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
         for j in range(i + 1, len(table_names)):
+            if limit_reached or status_dict["limited_by_timeout"]:
+                break
+            if _timed_out():
+                status_dict["limited_by_timeout"] = True
+                break
             table_a_name = table_names[i]
             table_b_name = table_names[j]
             table_a = metadata[table_a_name]
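`_record_pair` now deduplicates column pairs per table pair and trips the cap flag as soon as the number of candidate table pairs reaches `max_relationships`. A reduced sketch of that bookkeeping (timeout handling omitted; names and data are illustrative):

```python
from typing import Dict, List, Tuple

pairs: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
status_dict = {"limited_by_max_relationships": False}
max_relationships = 2
limit_reached = False


def record_pair(left_table: str, right_table: str, left_col: str, right_col: str) -> None:
    global limit_reached
    if limit_reached:
        return  # once a limit trips, later candidates are dropped
    bucket = pairs.setdefault((left_table, right_table), [])
    value = (left_col, right_col)
    if value not in bucket:
        bucket.append(value)
    if len(pairs) >= max_relationships:
        status_dict["limited_by_max_relationships"] = True
        limit_reached = True


record_pair("orders", "customers", "customer_id", "id")
record_pair("orders", "customers", "customer_id", "id")  # duplicate pair, deduplicated
record_pair("orders", "products", "product_id", "id")    # second table pair hits the cap
record_pair("orders", "stores", "store_id", "id")        # dropped: limit already reached
assert len(pairs) == 2
assert status_dict["limited_by_max_relationships"]
```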
@@ -2575,6 +2653,15 @@ def _infer_relationships(
 
     # Build relationships with inferred cardinality
     for (left_table, right_table), column_pairs in pairs.items():
+        if _timed_out():
+            status_dict["limited_by_timeout"] = True
+            break
+        if (
+            max_relationships is not None
+            and len(relationships) >= max_relationships
+        ):
+            status_dict["limited_by_max_relationships"] = True
+            break
         # Infer cardinality based on available metadata
         left_meta = metadata[left_table]
         right_meta = metadata[right_table]
@@ -2777,6 +2864,16 @@ def _infer_relationships(
         for factor in confidence_analysis["reasoning_factors"][:3]:  # Top 3 factors
             logger.debug(f"  + {factor}")
 
+        if confidence_analysis["confidence_score"] < min_confidence:
+            logger.debug(
+                "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
+                left_table,
+                right_table,
+                confidence_analysis["confidence_score"],
+                min_confidence,
+            )
+            continue
+
         # Determine relationship type based on cardinality
         if left_card == "1" and right_card == "1":
             rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2804,16 +2901,27 @@ def _infer_relationships(
         relationships.append(relationship)
 
     # Phase 2: Detect many-to-many relationships through bridge table analysis
-    many_to_many_relationships = …
-    …
-    …
-    …
-    …
-    …
-    logger.info(
-        f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+    many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
+    if not status_dict["limited_by_timeout"] and (
+        max_relationships is None or len(relationships) < max_relationships
+    ):
+        many_to_many_relationships = _detect_many_to_many_relationships(
+            raw_tables, metadata, relationships
     )
 
+    if many_to_many_relationships and max_relationships is not None:
+        remaining = max_relationships - len(relationships)
+        if remaining <= 0:
+            many_to_many_relationships = []
+        else:
+            many_to_many_relationships = many_to_many_relationships[:remaining]
+
+    if many_to_many_relationships:
+        relationships.extend(many_to_many_relationships)
+        logger.info(
+            f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
+        )
+
     logger.info(
         f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
     )
semantic_model_generator/relationships/__init__.py:

@@ -4,6 +4,7 @@ from .discovery import (
     RelationshipDiscoveryResult,
     RelationshipSummary,
     discover_relationships_from_schema,
+    discover_relationships_from_table_definitions,
     discover_relationships_from_tables,
 )
 
@@ -11,5 +12,6 @@ __all__ = [
     "RelationshipDiscoveryResult",
     "RelationshipSummary",
     "discover_relationships_from_schema",
+    "discover_relationships_from_table_definitions",
     "discover_relationships_from_tables",
 ]
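With the re-export and `__all__` entry in place, the new entry point is importable alongside the existing helpers; its signature lives in the rewritten discovery.py, which this summary does not expand:

```python
from semantic_model_generator.relationships import (
    RelationshipDiscoveryResult,
    discover_relationships_from_table_definitions,
    discover_relationships_from_tables,
)
```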