clickzetta-semantic-model-generator 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/METADATA +1 -1
- {clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/RECORD +12 -12
- semantic_model_generator/clickzetta_utils/clickzetta_connector.py +29 -35
- semantic_model_generator/clickzetta_utils/utils.py +44 -2
- semantic_model_generator/data_processing/cte_utils.py +43 -13
- semantic_model_generator/generate_model.py +126 -18
- semantic_model_generator/relationships/__init__.py +2 -0
- semantic_model_generator/relationships/discovery.py +180 -10
- semantic_model_generator/tests/cte_utils_test.py +14 -13
- semantic_model_generator/tests/relationship_discovery_test.py +64 -0
- {clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/LICENSE +0 -0
- {clickzetta_semantic_model_generator-1.0.3.dist-info → clickzetta_semantic_model_generator-1.0.4.dist-info}/WHEEL +0 -0
@@ -1,13 +1,13 @@
|
|
1
1
|
semantic_model_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=
|
2
|
+
semantic_model_generator/clickzetta_utils/clickzetta_connector.py,sha256=LnGQTBj94aC8Zk9aVe2efA6-3UX_E8Q7ITvnfEoByjw,32819
|
3
3
|
semantic_model_generator/clickzetta_utils/env_vars.py,sha256=8cbL6R75c1-aVQ2i1TDr9SiHCUjTrgvXbIRz4MbcmbE,7664
|
4
|
-
semantic_model_generator/clickzetta_utils/utils.py,sha256=
|
4
|
+
semantic_model_generator/clickzetta_utils/utils.py,sha256=UBfWy9qOTyut8tL02gOHHbh6Uz8RqRz5Mm2YdKWFN54,4950
|
5
5
|
semantic_model_generator/data_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
semantic_model_generator/data_processing/cte_utils.py,sha256
|
6
|
+
semantic_model_generator/data_processing/cte_utils.py,sha256=-kQw_PfPPe3mf7shQf1XV5rqfqYdB9WK4A-EwAcKc_o,16928
|
7
7
|
semantic_model_generator/data_processing/cte_utils_test.py,sha256=l6QkyyH22FexLKjvvbS9Je3YtdTrJE3a-BiknCy1g9s,2822
|
8
8
|
semantic_model_generator/data_processing/data_types.py,sha256=1HsSCkdCWvcXiwN3o1-HVQi_ZVIR0lYevXG9CE1TvRc,1172
|
9
9
|
semantic_model_generator/data_processing/proto_utils.py,sha256=UwqCfQYilTx68KcA4IYZN7PeM4Pz_pK1h0FrVJomzV8,2938
|
10
|
-
semantic_model_generator/generate_model.py,sha256=
|
10
|
+
semantic_model_generator/generate_model.py,sha256=vwISWJzYf4XS1TuLclpxKbberlsRKM99olrFlWaTCUw,125549
|
11
11
|
semantic_model_generator/llm/__init__.py,sha256=rLQt2pzRmxtnBLKjxN_qZ2a_nvkFHtmguU5lyajCldw,1030
|
12
12
|
semantic_model_generator/llm/dashscope_client.py,sha256=lHS36iqNZbFhwgidPpW1Bwwy4S2O7GeLyMSMdlSoBsY,6050
|
13
13
|
semantic_model_generator/llm/enrichment.py,sha256=49e9Jg_jHfhUIEQ3JserEc5DV5sFWA12K76TY4UwnCg,41448
|
@@ -16,13 +16,13 @@ semantic_model_generator/output_models/.keep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
|
|
16
16
|
semantic_model_generator/protos/semantic_model.proto,sha256=WZiN4b8vR-ZX-Lj9Vsm6HjZNAyNvM1znIyut_YkPVSI,16473
|
17
17
|
semantic_model_generator/protos/semantic_model_pb2.py,sha256=scbWkW-I-r3_hp_5SHoOWn02p52RJ9DJ0_-nRgr0LHc,25606
|
18
18
|
semantic_model_generator/protos/semantic_model_pb2.pyi,sha256=iiBIZxtX9d6IuUO3aLcsJsHUeZqdi14vYNuUsSM8C0g,18267
|
19
|
-
semantic_model_generator/relationships/__init__.py,sha256=
|
20
|
-
semantic_model_generator/relationships/discovery.py,sha256=
|
19
|
+
semantic_model_generator/relationships/__init__.py,sha256=I9-_QJdp36nEllzKTGXi2aWbRjiXrrexQXUfB6mi3Ww,477
|
20
|
+
semantic_model_generator/relationships/discovery.py,sha256=BdPHIvlE6yuaQv0ELWwQlq0qx0uX7fkoEMfuvK8wO60,12147
|
21
21
|
semantic_model_generator/tests/clickzetta_connector_test.py,sha256=Fdx7jooNt1lslKB2Ub51wqOZ8OM0osgZiDDl3bV6riw,3086
|
22
|
-
semantic_model_generator/tests/cte_utils_test.py,sha256=
|
22
|
+
semantic_model_generator/tests/cte_utils_test.py,sha256=_9GAJiOPGSagdWmQsoAEOOhEgsBY0LFlr_xtwrlgf4A,17561
|
23
23
|
semantic_model_generator/tests/generate_model_classification_test.py,sha256=Amq29cmeKd0S7iVikJ60RFm9gpWaQv1TijXofp3J-lI,2275
|
24
24
|
semantic_model_generator/tests/llm_enrichment_test.py,sha256=1avLrPWp7J7o_K3PKbI_PIvduM5Id21MmoL0JTeDTfs,15738
|
25
|
-
semantic_model_generator/tests/relationship_discovery_test.py,sha256=
|
25
|
+
semantic_model_generator/tests/relationship_discovery_test.py,sha256=OvnK2jhWNFfHI31eeIEmclgaUoFjj_mZuDFAnjLMBpw,5411
|
26
26
|
semantic_model_generator/tests/relationships_filters_test.py,sha256=bUm3r1UGaXca-hJOot7jMPz4It_TVsoddd-Xpk-76zM,10166
|
27
27
|
semantic_model_generator/tests/samples/validate_yamls.py,sha256=262j-2i2oFZtTyK2susOrbxxE5eS-6IN-V0jFEOpt_w,156249
|
28
28
|
semantic_model_generator/tests/utils_test.py,sha256=HWRXR45QYL1f6L8xsMppqLXzF9HAsrMwTMQIKpZrc_M,539
|
@@ -32,7 +32,7 @@ semantic_model_generator/validate/context_length.py,sha256=HL-GfaRXNcVji1-pAFGXG
|
|
32
32
|
semantic_model_generator/validate/keywords.py,sha256=frZ5HjRXP69K6dYAU5_d86oSp40_3yoLUg1eQwU3oLM,7080
|
33
33
|
semantic_model_generator/validate/schema.py,sha256=eL_wl5yscIeczwNBRUKhF_7QqWW2wSGimkgaOhMFsrA,5893
|
34
34
|
semantic_model_generator/validate_model.py,sha256=Uq-V-GfPeF2Dy4l9uF5Guv104gDCDGh0Cxz1AJOu5dk,836
|
35
|
-
clickzetta_semantic_model_generator-1.0.
|
36
|
-
clickzetta_semantic_model_generator-1.0.
|
37
|
-
clickzetta_semantic_model_generator-1.0.
|
38
|
-
clickzetta_semantic_model_generator-1.0.
|
35
|
+
clickzetta_semantic_model_generator-1.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
36
|
+
clickzetta_semantic_model_generator-1.0.4.dist-info/METADATA,sha256=zf4rBVSbisDDtZOnw5SoxGCRrO-PjfMQ66PinfYK3xg,7816
|
37
|
+
clickzetta_semantic_model_generator-1.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
38
|
+
clickzetta_semantic_model_generator-1.0.4.dist-info/RECORD,,
|
@@ -11,7 +11,12 @@ from clickzetta.zettapark.session import Session
|
|
11
11
|
from loguru import logger
|
12
12
|
|
13
13
|
from semantic_model_generator.clickzetta_utils import env_vars
|
14
|
-
from semantic_model_generator.clickzetta_utils.utils import
|
14
|
+
from semantic_model_generator.clickzetta_utils.utils import (
|
15
|
+
create_session,
|
16
|
+
join_quoted_identifiers,
|
17
|
+
normalize_identifier,
|
18
|
+
quote_identifier,
|
19
|
+
)
|
15
20
|
from semantic_model_generator.data_processing.data_types import Column, Table
|
16
21
|
|
17
22
|
ConnectionType = TypeVar("ConnectionType", bound=Session)
|
@@ -151,18 +156,8 @@ class ClickzettaConnectionProxy:
|
|
151
156
|
self.session.close()
|
152
157
|
|
153
158
|
|
154
|
-
def _quote_identifier(name: str) -> str:
|
155
|
-
return f'"{name}"'
|
156
|
-
|
157
|
-
|
158
159
|
def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
|
159
|
-
return
|
160
|
-
[
|
161
|
-
_quote_identifier(workspace),
|
162
|
-
_quote_identifier(schema_name),
|
163
|
-
_quote_identifier(table_name),
|
164
|
-
]
|
165
|
-
)
|
160
|
+
return join_quoted_identifiers(workspace, schema_name, table_name)
|
166
161
|
|
167
162
|
|
168
163
|
def _value_is_true(value: Any) -> bool:
|
@@ -175,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
|
|
175
170
|
|
176
171
|
|
177
172
|
def _sanitize_identifier(value: Any, fallback: str = "") -> str:
|
178
|
-
|
173
|
+
normalized = normalize_identifier(value)
|
174
|
+
if not normalized:
|
179
175
|
return fallback
|
180
|
-
normalized = str(value).strip()
|
181
|
-
if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
|
182
|
-
normalized = normalized[1:-1]
|
183
176
|
return normalized
|
184
177
|
|
185
178
|
|
@@ -216,21 +209,19 @@ def _fetch_distinct_values(
|
|
216
209
|
column_name: str,
|
217
210
|
ndv: int,
|
218
211
|
) -> Optional[List[str]]:
|
219
|
-
workspace_part = (
|
220
|
-
_sanitize_identifier(workspace, workspace).upper() if workspace else ""
|
221
|
-
)
|
212
|
+
workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
|
222
213
|
schema_part = (
|
223
|
-
_sanitize_identifier(schema_name, schema_name)
|
214
|
+
_sanitize_identifier(schema_name, schema_name) if schema_name else ""
|
224
215
|
)
|
225
|
-
table_part = _sanitize_identifier(table_name, table_name)
|
226
|
-
column_part = _sanitize_identifier(column_name, column_name)
|
216
|
+
table_part = _sanitize_identifier(table_name, table_name)
|
217
|
+
column_part = _sanitize_identifier(column_name, column_name)
|
227
218
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
219
|
+
qualified_table = join_quoted_identifiers(
|
220
|
+
workspace_part, schema_part, table_part
|
221
|
+
)
|
222
|
+
column_expr = quote_identifier(column_part)
|
232
223
|
|
233
|
-
query = f"SELECT DISTINCT {
|
224
|
+
query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
|
234
225
|
try:
|
235
226
|
df = session.sql(query).to_pandas()
|
236
227
|
if df.empty:
|
@@ -660,9 +651,8 @@ def fetch_tables_views_in_schema(
|
|
660
651
|
|
661
652
|
try:
|
662
653
|
if workspace_upper and schema_upper:
|
663
|
-
|
664
|
-
|
665
|
-
).to_pandas()
|
654
|
+
scope = join_quoted_identifiers(workspace_upper, schema_upper)
|
655
|
+
df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
|
666
656
|
else:
|
667
657
|
df = session.sql("SHOW TABLES").to_pandas()
|
668
658
|
except Exception as exc: # pragma: no cover
|
@@ -738,11 +728,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
|
|
738
728
|
|
739
729
|
queries: List[str] = []
|
740
730
|
if schema:
|
741
|
-
|
742
|
-
|
731
|
+
scope = join_quoted_identifiers(workspace, schema)
|
732
|
+
if scope:
|
733
|
+
queries.append(f"SHOW VOLUMES IN {scope}")
|
734
|
+
queries.append(f"SHOW STAGES IN SCHEMA {scope}")
|
743
735
|
else:
|
744
|
-
|
745
|
-
|
736
|
+
workspace_identifier = quote_identifier(workspace)
|
737
|
+
if workspace_identifier:
|
738
|
+
queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
|
739
|
+
queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
|
746
740
|
|
747
741
|
stage_names: List[str] = ["volume:user://~/semantic_models/"]
|
748
742
|
seen: set[str] = set(stage_names)
|
@@ -899,7 +893,7 @@ def create_table_in_schema(
|
|
899
893
|
columns_schema: Dict[str, str],
|
900
894
|
) -> bool:
|
901
895
|
fields = ", ".join(
|
902
|
-
f"{
|
896
|
+
f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
|
903
897
|
)
|
904
898
|
query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
|
905
899
|
try:
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from contextlib import contextmanager
|
4
|
-
from typing import Dict, Iterable
|
4
|
+
from typing import Any, Dict, Iterable
|
5
5
|
|
6
6
|
from clickzetta.zettapark.session import Session
|
7
7
|
|
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
|
|
21
21
|
}
|
22
22
|
|
23
23
|
|
24
|
+
def normalize_identifier(value: Any) -> str:
|
25
|
+
"""
|
26
|
+
Strips outer quotes/backticks and surrounding whitespace from an identifier.
|
27
|
+
Returns an empty string when the identifier is missing.
|
28
|
+
"""
|
29
|
+
|
30
|
+
if value is None:
|
31
|
+
return ""
|
32
|
+
text = str(value).strip()
|
33
|
+
if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
|
34
|
+
return text[1:-1]
|
35
|
+
return text
|
36
|
+
|
37
|
+
|
38
|
+
def quote_identifier(value: Any) -> str:
|
39
|
+
"""
|
40
|
+
Wraps an identifier in backticks, escaping embedded backticks as needed.
|
41
|
+
Returns an empty string if the identifier is missing.
|
42
|
+
"""
|
43
|
+
|
44
|
+
normalized = normalize_identifier(value)
|
45
|
+
if not normalized:
|
46
|
+
return ""
|
47
|
+
escaped = normalized.replace("`", "``")
|
48
|
+
return f"`{escaped}`"
|
49
|
+
|
50
|
+
|
51
|
+
def join_quoted_identifiers(*parts: Any) -> str:
|
52
|
+
"""
|
53
|
+
Joins identifier parts with '.' and ensures each segment is backtick-quoted.
|
54
|
+
Empty segments are skipped.
|
55
|
+
"""
|
56
|
+
|
57
|
+
quoted_parts = [
|
58
|
+
quote_identifier(part)
|
59
|
+
for part in parts
|
60
|
+
if normalize_identifier(part)
|
61
|
+
]
|
62
|
+
return ".".join(part for part in quoted_parts if part)
|
63
|
+
|
64
|
+
|
24
65
|
def create_fqn_table(fqn_str: str) -> FQNParts:
|
25
66
|
"""
|
26
67
|
Splits a fully qualified table name into its ClickZetta components.
|
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> N
|
|
72
113
|
("schema", schema),
|
73
114
|
("vcluster", vcluster),
|
74
115
|
):
|
75
|
-
|
116
|
+
identifier = quote_identifier(value)
|
117
|
+
session.sql(f"USE {component.upper()} {identifier}")
|
76
118
|
|
77
119
|
|
78
120
|
def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
|
@@ -11,12 +11,34 @@ from sqlglot import Dialect
|
|
11
11
|
from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
|
12
12
|
OBJECT_DATATYPES,
|
13
13
|
)
|
14
|
+
from semantic_model_generator.clickzetta_utils.utils import (
|
15
|
+
join_quoted_identifiers,
|
16
|
+
normalize_identifier,
|
17
|
+
)
|
14
18
|
from semantic_model_generator.protos import semantic_model_pb2
|
15
19
|
|
16
20
|
_SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
|
17
21
|
ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
|
18
22
|
|
19
23
|
_LOGICAL_TABLE_PREFIX = "__"
|
24
|
+
_SQLGLOT_QUOTE_CHAR = '"'
|
25
|
+
|
26
|
+
|
27
|
+
def _prepare_sql_for_parsing(sql: str) -> str:
|
28
|
+
"""
|
29
|
+
Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
|
30
|
+
"""
|
31
|
+
|
32
|
+
return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
|
33
|
+
|
34
|
+
|
35
|
+
def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
|
36
|
+
"""
|
37
|
+
Renders a SQLGlot expression using ClickZetta dialect and rewrites identifiers with backticks.
|
38
|
+
"""
|
39
|
+
|
40
|
+
rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
|
41
|
+
return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
|
20
42
|
|
21
43
|
|
22
44
|
def is_logical_table(table_name: str) -> bool:
|
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
|
|
33
55
|
|
34
56
|
def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
|
35
57
|
"""Returns fully qualified table name such as my_db.my_schema.my_table"""
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
return
|
58
|
+
parts = [
|
59
|
+
normalize_identifier(component)
|
60
|
+
for component in (table.database, table.schema, table.table)
|
61
|
+
if component
|
62
|
+
]
|
63
|
+
return join_quoted_identifiers(*parts) # type: ignore[no-any-return]
|
42
64
|
|
43
65
|
|
44
66
|
def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
|
@@ -156,8 +178,8 @@ def _generate_cte_for(
|
|
156
178
|
cte = f"WITH {logical_table_name(table)} AS (\n"
|
157
179
|
cte += "SELECT \n"
|
158
180
|
cte += ",\n".join(expr_columns) + "\n"
|
159
|
-
cte += f"FROM {fully_qualified_table_name(table.base_table)}"
|
160
|
-
cte += ")"
|
181
|
+
cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
|
182
|
+
cte += ")\n"
|
161
183
|
return cte
|
162
184
|
|
163
185
|
|
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
|
|
261
283
|
str: The SQL statement in ClickZetta syntax.
|
262
284
|
"""
|
263
285
|
try:
|
264
|
-
expression = sqlglot.parse_one(
|
286
|
+
expression = sqlglot.parse_one(
|
287
|
+
_prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
|
288
|
+
)
|
265
289
|
except Exception as e:
|
266
290
|
raise ValueError(
|
267
291
|
f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
|
268
292
|
)
|
269
293
|
|
270
|
-
return expression
|
294
|
+
return _render_clickzetta_sql(expression)
|
271
295
|
|
272
296
|
|
273
297
|
def generate_select(
|
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
|
|
332
356
|
for cte in ctes:
|
333
357
|
new_withs.append(
|
334
358
|
sqlglot.parse_one(
|
335
|
-
cte,
|
359
|
+
_prepare_sql_for_parsing(cte),
|
360
|
+
read=ClickzettaDialect,
|
361
|
+
into=sqlglot.expressions.With,
|
336
362
|
)
|
337
363
|
)
|
338
364
|
|
339
365
|
# Step 3: Prefix the CTEs to the original query.
|
340
|
-
ast = sqlglot.parse_one(
|
366
|
+
ast = sqlglot.parse_one(
|
367
|
+
_prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
|
368
|
+
)
|
341
369
|
with_ = ast.args.get("with")
|
342
370
|
# If the query doesn't have a WITH clause, then generate one.
|
343
371
|
if with_ is None:
|
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
|
|
349
377
|
else:
|
350
378
|
new_ctes = [w.expressions[0] for w in new_withs]
|
351
379
|
with_.set("expressions", new_ctes + with_.expressions)
|
352
|
-
return
|
380
|
+
return _render_clickzetta_sql(
|
381
|
+
ast, pretty=True
|
382
|
+
) # type: ignore [no-any-return]
|
353
383
|
|
354
384
|
|
355
385
|
def context_to_column_format(
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import math
|
2
2
|
import os
|
3
3
|
import re
|
4
|
+
import time
|
4
5
|
from collections import defaultdict
|
5
6
|
from datetime import datetime
|
6
7
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
@@ -17,7 +18,12 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
|
|
17
18
|
get_table_representation,
|
18
19
|
get_valid_schemas_tables_columns_df,
|
19
20
|
)
|
20
|
-
from semantic_model_generator.clickzetta_utils.utils import
|
21
|
+
from semantic_model_generator.clickzetta_utils.utils import (
|
22
|
+
create_fqn_table,
|
23
|
+
join_quoted_identifiers,
|
24
|
+
normalize_identifier,
|
25
|
+
quote_identifier,
|
26
|
+
)
|
21
27
|
from semantic_model_generator.data_processing import data_types, proto_utils
|
22
28
|
from semantic_model_generator.llm import (
|
23
29
|
DashscopeClient,
|
@@ -41,6 +47,14 @@ _AUTOGEN_COMMENT_TOKEN = (
|
|
41
47
|
)
|
42
48
|
_DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
|
43
49
|
_AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
|
50
|
+
_GENERIC_IDENTIFIER_TOKENS = {
|
51
|
+
"ID",
|
52
|
+
"NAME",
|
53
|
+
"CODE",
|
54
|
+
"KEY",
|
55
|
+
"VALUE",
|
56
|
+
"NUMBER",
|
57
|
+
}
|
44
58
|
|
45
59
|
|
46
60
|
def _singularize(token: str) -> str:
|
@@ -90,6 +104,14 @@ def _identifier_tokens(
|
|
90
104
|
return tokens
|
91
105
|
|
92
106
|
|
107
|
+
def _is_generic_identifier(name: str) -> bool:
|
108
|
+
tokens = [token for token in _identifier_tokens(name) if token]
|
109
|
+
if not tokens:
|
110
|
+
return True
|
111
|
+
normalized_tokens = {token.upper() for token in tokens}
|
112
|
+
return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
|
113
|
+
|
114
|
+
|
93
115
|
def _sanitize_identifier_name(
|
94
116
|
name: str, prefixes_to_drop: Optional[set[str]] = None
|
95
117
|
) -> str:
|
@@ -354,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
|
|
354
376
|
|
355
377
|
def _format_sql_identifier(name: str) -> str:
|
356
378
|
"""
|
357
|
-
Formats an identifier for SQL
|
379
|
+
Formats an identifier for SQL by wrapping it in backticks.
|
358
380
|
"""
|
359
|
-
|
360
|
-
return ""
|
361
|
-
return str(name).replace('"', "").replace("`", "").strip().upper()
|
381
|
+
return quote_identifier(name)
|
362
382
|
|
363
383
|
|
364
384
|
def _qualified_table_name(fqn: data_types.FQNParts) -> str:
|
365
385
|
"""
|
366
|
-
Builds a fully qualified table name
|
386
|
+
Builds a fully qualified, backtick-quoted table name.
|
367
387
|
"""
|
368
|
-
parts = [part for part in (fqn.database, fqn.schema_name, fqn.table)
|
369
|
-
return
|
388
|
+
parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
|
389
|
+
return join_quoted_identifiers(*(part for part in parts if part))
|
370
390
|
|
371
391
|
|
372
392
|
def _levenshtein_distance(s1: str, s2: str) -> int:
|
@@ -977,6 +997,19 @@ def _calculate_relationship_confidence(
|
|
977
997
|
|
978
998
|
confidence_score += name_confidence
|
979
999
|
|
1000
|
+
generic_pair_count = sum(
|
1001
|
+
1
|
1002
|
+
for left_col, right_col in column_pairs
|
1003
|
+
if _is_generic_identifier(left_col)
|
1004
|
+
and _is_generic_identifier(right_col)
|
1005
|
+
)
|
1006
|
+
if generic_pair_count:
|
1007
|
+
penalty = min(0.15 * generic_pair_count, 0.3)
|
1008
|
+
confidence_score = max(confidence_score - penalty, 0.0)
|
1009
|
+
reasoning_factors.append(
|
1010
|
+
f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
|
1011
|
+
)
|
1012
|
+
|
980
1013
|
# Check for foreign key naming patterns
|
981
1014
|
fk_pattern_confidence = 0.0
|
982
1015
|
for left_col, right_col in column_pairs:
|
@@ -2326,11 +2359,31 @@ def _infer_relationships(
|
|
2326
2359
|
*,
|
2327
2360
|
session: Optional[Session] = None,
|
2328
2361
|
strict_join_inference: bool = False,
|
2362
|
+
status: Optional[Dict[str, bool]] = None,
|
2363
|
+
max_relationships: Optional[int] = None,
|
2364
|
+
min_confidence: float = 0.2,
|
2365
|
+
timeout_seconds: Optional[float] = None,
|
2329
2366
|
) -> List[semantic_model_pb2.Relationship]:
|
2367
|
+
status_dict = status if status is not None else {}
|
2368
|
+
if "limited_by_timeout" not in status_dict:
|
2369
|
+
status_dict["limited_by_timeout"] = False
|
2370
|
+
if "limited_by_max_relationships" not in status_dict:
|
2371
|
+
status_dict["limited_by_max_relationships"] = False
|
2372
|
+
|
2330
2373
|
relationships: List[semantic_model_pb2.Relationship] = []
|
2331
2374
|
if not raw_tables:
|
2332
2375
|
return relationships
|
2333
2376
|
|
2377
|
+
start_time = time.perf_counter()
|
2378
|
+
min_confidence = max(0.0, min(min_confidence, 1.0))
|
2379
|
+
limit_reached = False
|
2380
|
+
|
2381
|
+
def _timed_out() -> bool:
|
2382
|
+
return (
|
2383
|
+
timeout_seconds is not None
|
2384
|
+
and (time.perf_counter() - start_time) >= timeout_seconds
|
2385
|
+
)
|
2386
|
+
|
2334
2387
|
metadata = {}
|
2335
2388
|
prefix_counter: Dict[str, int] = {}
|
2336
2389
|
for _, raw_table in raw_tables:
|
@@ -2392,14 +2445,39 @@ def _infer_relationships(
|
|
2392
2445
|
def _record_pair(
|
2393
2446
|
left_table: str, right_table: str, left_col: str, right_col: str
|
2394
2447
|
) -> None:
|
2448
|
+
nonlocal limit_reached
|
2449
|
+
if limit_reached:
|
2450
|
+
return
|
2451
|
+
if _timed_out():
|
2452
|
+
status_dict["limited_by_timeout"] = True
|
2453
|
+
limit_reached = True
|
2454
|
+
return
|
2455
|
+
|
2395
2456
|
key = (left_table, right_table)
|
2396
2457
|
value = (left_col, right_col)
|
2397
|
-
|
2398
|
-
|
2458
|
+
bucket = pairs.setdefault(key, [])
|
2459
|
+
if value not in bucket:
|
2460
|
+
bucket.append(value)
|
2461
|
+
if (
|
2462
|
+
max_relationships is not None
|
2463
|
+
and len(pairs) >= max_relationships
|
2464
|
+
):
|
2465
|
+
status_dict["limited_by_max_relationships"] = True
|
2466
|
+
limit_reached = True
|
2399
2467
|
|
2400
2468
|
table_names = list(metadata.keys())
|
2401
2469
|
for i in range(len(table_names)):
|
2470
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2471
|
+
break
|
2472
|
+
if _timed_out():
|
2473
|
+
status_dict["limited_by_timeout"] = True
|
2474
|
+
break
|
2402
2475
|
for j in range(i + 1, len(table_names)):
|
2476
|
+
if limit_reached or status_dict["limited_by_timeout"]:
|
2477
|
+
break
|
2478
|
+
if _timed_out():
|
2479
|
+
status_dict["limited_by_timeout"] = True
|
2480
|
+
break
|
2403
2481
|
table_a_name = table_names[i]
|
2404
2482
|
table_b_name = table_names[j]
|
2405
2483
|
table_a = metadata[table_a_name]
|
@@ -2575,6 +2653,15 @@ def _infer_relationships(
|
|
2575
2653
|
|
2576
2654
|
# Build relationships with inferred cardinality
|
2577
2655
|
for (left_table, right_table), column_pairs in pairs.items():
|
2656
|
+
if _timed_out():
|
2657
|
+
status_dict["limited_by_timeout"] = True
|
2658
|
+
break
|
2659
|
+
if (
|
2660
|
+
max_relationships is not None
|
2661
|
+
and len(relationships) >= max_relationships
|
2662
|
+
):
|
2663
|
+
status_dict["limited_by_max_relationships"] = True
|
2664
|
+
break
|
2578
2665
|
# Infer cardinality based on available metadata
|
2579
2666
|
left_meta = metadata[left_table]
|
2580
2667
|
right_meta = metadata[right_table]
|
@@ -2777,6 +2864,16 @@ def _infer_relationships(
|
|
2777
2864
|
for factor in confidence_analysis["reasoning_factors"][:3]: # Top 3 factors
|
2778
2865
|
logger.debug(f" + {factor}")
|
2779
2866
|
|
2867
|
+
if confidence_analysis["confidence_score"] < min_confidence:
|
2868
|
+
logger.debug(
|
2869
|
+
"Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
|
2870
|
+
left_table,
|
2871
|
+
right_table,
|
2872
|
+
confidence_analysis["confidence_score"],
|
2873
|
+
min_confidence,
|
2874
|
+
)
|
2875
|
+
continue
|
2876
|
+
|
2780
2877
|
# Determine relationship type based on cardinality
|
2781
2878
|
if left_card == "1" and right_card == "1":
|
2782
2879
|
rel_type = semantic_model_pb2.RelationshipType.one_to_one
|
@@ -2804,16 +2901,27 @@ def _infer_relationships(
|
|
2804
2901
|
relationships.append(relationship)
|
2805
2902
|
|
2806
2903
|
# Phase 2: Detect many-to-many relationships through bridge table analysis
|
2807
|
-
many_to_many_relationships =
|
2808
|
-
|
2809
|
-
|
2810
|
-
|
2811
|
-
|
2812
|
-
|
2813
|
-
logger.info(
|
2814
|
-
f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
|
2904
|
+
many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
|
2905
|
+
if not status_dict["limited_by_timeout"] and (
|
2906
|
+
max_relationships is None or len(relationships) < max_relationships
|
2907
|
+
):
|
2908
|
+
many_to_many_relationships = _detect_many_to_many_relationships(
|
2909
|
+
raw_tables, metadata, relationships
|
2815
2910
|
)
|
2816
2911
|
|
2912
|
+
if many_to_many_relationships and max_relationships is not None:
|
2913
|
+
remaining = max_relationships - len(relationships)
|
2914
|
+
if remaining <= 0:
|
2915
|
+
many_to_many_relationships = []
|
2916
|
+
else:
|
2917
|
+
many_to_many_relationships = many_to_many_relationships[:remaining]
|
2918
|
+
|
2919
|
+
if many_to_many_relationships:
|
2920
|
+
relationships.extend(many_to_many_relationships)
|
2921
|
+
logger.info(
|
2922
|
+
f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
|
2923
|
+
)
|
2924
|
+
|
2817
2925
|
logger.info(
|
2818
2926
|
f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
|
2819
2927
|
)
|
@@ -4,6 +4,7 @@ from .discovery import (
|
|
4
4
|
RelationshipDiscoveryResult,
|
5
5
|
RelationshipSummary,
|
6
6
|
discover_relationships_from_schema,
|
7
|
+
discover_relationships_from_table_definitions,
|
7
8
|
discover_relationships_from_tables,
|
8
9
|
)
|
9
10
|
|
@@ -11,5 +12,6 @@ __all__ = [
|
|
11
12
|
"RelationshipDiscoveryResult",
|
12
13
|
"RelationshipSummary",
|
13
14
|
"discover_relationships_from_schema",
|
15
|
+
"discover_relationships_from_table_definitions",
|
14
16
|
"discover_relationships_from_tables",
|
15
17
|
]
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import time
|
4
4
|
from dataclasses import dataclass
|
5
|
-
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
5
|
+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
|
6
6
|
|
7
7
|
import pandas as pd
|
8
8
|
from loguru import logger
|
@@ -13,7 +13,7 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
|
|
13
13
|
get_valid_schemas_tables_columns_df,
|
14
14
|
)
|
15
15
|
from semantic_model_generator.data_processing import data_types
|
16
|
-
from semantic_model_generator.data_processing.data_types import FQNParts, Table
|
16
|
+
from semantic_model_generator.data_processing.data_types import Column, FQNParts, Table
|
17
17
|
from semantic_model_generator.generate_model import (
|
18
18
|
_DEFAULT_N_SAMPLE_VALUES_PER_COL,
|
19
19
|
_infer_relationships,
|
@@ -34,6 +34,10 @@ class RelationshipSummary:
|
|
34
34
|
total_columns: int
|
35
35
|
total_relationships_found: int
|
36
36
|
processing_time_ms: int
|
37
|
+
limited_by_timeout: bool = False
|
38
|
+
limited_by_max_relationships: bool = False
|
39
|
+
limited_by_table_cap: bool = False
|
40
|
+
notes: Optional[str] = None
|
37
41
|
|
38
42
|
|
39
43
|
@dataclass
|
@@ -97,20 +101,125 @@ def _build_tables_from_dataframe(
|
|
97
101
|
return tables
|
98
102
|
|
99
103
|
|
104
|
+
def _tables_payload_to_raw_tables(
|
105
|
+
tables: Sequence[Mapping[str, Any]],
|
106
|
+
*,
|
107
|
+
default_workspace: str = "OFFLINE",
|
108
|
+
default_schema: str = "PUBLIC",
|
109
|
+
) -> List[Tuple[FQNParts, Table]]:
|
110
|
+
raw_tables: List[Tuple[FQNParts, Table]] = []
|
111
|
+
for table_index, table_entry in enumerate(tables):
|
112
|
+
if not isinstance(table_entry, Mapping):
|
113
|
+
raise TypeError("Each table definition must be a mapping of table metadata")
|
114
|
+
|
115
|
+
table_name = str(
|
116
|
+
table_entry.get("table_name")
|
117
|
+
or table_entry.get("name")
|
118
|
+
or table_entry.get("table")
|
119
|
+
or ""
|
120
|
+
).strip()
|
121
|
+
if not table_name:
|
122
|
+
raise ValueError("Table definition missing 'table_name'")
|
123
|
+
|
124
|
+
workspace = str(table_entry.get("workspace") or default_workspace).strip() or default_workspace
|
125
|
+
schema = str(
|
126
|
+
table_entry.get("schema")
|
127
|
+
or table_entry.get("schema_name")
|
128
|
+
or default_schema
|
129
|
+
).strip() or default_schema
|
130
|
+
|
131
|
+
columns_payload = table_entry.get("columns")
|
132
|
+
if not isinstance(columns_payload, Sequence) or not columns_payload:
|
133
|
+
raise ValueError(
|
134
|
+
f"Table '{table_name}' must include a non-empty 'columns' list"
|
135
|
+
)
|
136
|
+
|
137
|
+
columns: List[Column] = []
|
138
|
+
for column_index, column_entry in enumerate(columns_payload):
|
139
|
+
if not isinstance(column_entry, Mapping):
|
140
|
+
raise TypeError(
|
141
|
+
f"Column definition for table '{table_name}' must be a mapping"
|
142
|
+
)
|
143
|
+
|
144
|
+
column_name = str(
|
145
|
+
column_entry.get("name")
|
146
|
+
or column_entry.get("column_name")
|
147
|
+
or column_entry.get("field")
|
148
|
+
or ""
|
149
|
+
).strip()
|
150
|
+
if not column_name:
|
151
|
+
raise ValueError(
|
152
|
+
f"Column definition in table '{table_name}' missing 'name'"
|
153
|
+
)
|
154
|
+
|
155
|
+
column_type = str(
|
156
|
+
column_entry.get("type")
|
157
|
+
or column_entry.get("data_type")
|
158
|
+
or "STRING"
|
159
|
+
).strip()
|
160
|
+
|
161
|
+
values = column_entry.get("sample_values") or column_entry.get("values")
|
162
|
+
if isinstance(values, Sequence) and not isinstance(values, (str, bytes)):
|
163
|
+
sample_values = [str(value) for value in values]
|
164
|
+
else:
|
165
|
+
sample_values = None
|
166
|
+
|
167
|
+
is_primary = bool(
|
168
|
+
column_entry.get("is_primary_key")
|
169
|
+
or column_entry.get("primary_key")
|
170
|
+
or column_entry.get("is_primary")
|
171
|
+
)
|
172
|
+
|
173
|
+
columns.append(
|
174
|
+
Column(
|
175
|
+
id_=column_index,
|
176
|
+
column_name=column_name,
|
177
|
+
column_type=column_type,
|
178
|
+
values=sample_values,
|
179
|
+
comment=column_entry.get("comment"),
|
180
|
+
is_primary_key=is_primary,
|
181
|
+
)
|
182
|
+
)
|
183
|
+
|
184
|
+
table_proto = Table(
|
185
|
+
id_=table_index,
|
186
|
+
name=table_name.upper(),
|
187
|
+
columns=columns,
|
188
|
+
comment=table_entry.get("comment"),
|
189
|
+
)
|
190
|
+
fqn = FQNParts(
|
191
|
+
database=workspace.upper(),
|
192
|
+
schema_name=schema.upper(),
|
193
|
+
table=table_name,
|
194
|
+
)
|
195
|
+
raw_tables.append((fqn, table_proto))
|
196
|
+
|
197
|
+
return raw_tables
|
198
|
+
|
199
|
+
|
100
200
|
def _discover_relationships(
    raw_tables: List[Tuple[FQNParts, Table]],
    strict_join_inference: bool,
    session: Optional[Session],
    *,
    max_relationships: Optional[int] = None,
    min_confidence: float = 0.5,
    timeout_seconds: Optional[float] = None,
) -> Tuple[List[semantic_model_pb2.Relationship], Dict[str, bool]]:
    """Infer relationships between ``raw_tables`` and report applied limits.

    Args:
        raw_tables: ``(fully-qualified name, table)`` pairs to analyse.
        strict_join_inference: Forwarded to ``_infer_relationships``. The
            ``session`` is only handed over when this flag is true.
        session: Optional session; ignored unless ``strict_join_inference``
            is enabled.
        max_relationships: Forwarded unchanged to ``_infer_relationships``.
        min_confidence: Forwarded unchanged to ``_infer_relationships``.
        timeout_seconds: Forwarded unchanged to ``_infer_relationships``.

    Returns:
        A ``(relationships, status)`` tuple. ``status`` always contains the
        keys ``"limited_by_timeout"`` and ``"limited_by_max_relationships"``.
    """
    if not raw_tables:
        return [], {"limited_by_timeout": False, "limited_by_max_relationships": False}

    # ``_infer_relationships`` receives this dict and may record limit flags in it.
    status: Dict[str, bool] = {}
    relationships = _infer_relationships(
        raw_tables,
        session=session if strict_join_inference else None,
        strict_join_inference=strict_join_inference,
        status=status,
        max_relationships=max_relationships,
        min_confidence=min_confidence,
        timeout_seconds=timeout_seconds,
    )

    # Keep the status shape identical to the empty-input path above, so
    # callers can index both flags without caring which path was taken.
    status.setdefault("limited_by_timeout", False)
    status.setdefault("limited_by_max_relationships", False)
    return relationships, status
|
114
223
|
|
115
224
|
|
116
225
|
def discover_relationships_from_tables(
|
@@ -118,33 +227,86 @@ def discover_relationships_from_tables(
|
|
118
227
|
*,
|
119
228
|
strict_join_inference: bool = False,
|
120
229
|
session: Optional[Session] = None,
|
230
|
+
max_relationships: Optional[int] = None,
|
231
|
+
min_confidence: float = 0.5,
|
232
|
+
timeout_seconds: Optional[float] = 30.0,
|
233
|
+
max_tables: Optional[int] = None,
|
121
234
|
) -> RelationshipDiscoveryResult:
|
122
235
|
"""
|
123
236
|
Run relationship inference using pre-constructed table metadata.
|
124
237
|
"""
|
125
238
|
start = time.perf_counter()
|
126
|
-
|
127
|
-
|
239
|
+
raw_tables = list(tables)
|
240
|
+
limited_by_table_cap = False
|
241
|
+
notes: List[str] = []
|
242
|
+
|
243
|
+
if max_tables is not None and len(raw_tables) > max_tables:
|
244
|
+
limited_by_table_cap = True
|
245
|
+
notes.append(
|
246
|
+
f"Input contained {len(raw_tables)} tables; analysis limited to first {max_tables}."
|
247
|
+
)
|
248
|
+
raw_tables = raw_tables[:max_tables]
|
249
|
+
|
250
|
+
relationships, status = _discover_relationships(
|
251
|
+
raw_tables,
|
128
252
|
strict_join_inference=strict_join_inference,
|
129
253
|
session=session,
|
254
|
+
max_relationships=max_relationships,
|
255
|
+
min_confidence=min_confidence,
|
256
|
+
timeout_seconds=timeout_seconds,
|
130
257
|
)
|
131
258
|
end = time.perf_counter()
|
132
259
|
|
133
|
-
all_columns = sum(len(table.columns) for _, table in
|
260
|
+
all_columns = sum(len(table.columns) for _, table in raw_tables)
|
134
261
|
summary = RelationshipSummary(
|
135
|
-
total_tables=len(
|
262
|
+
total_tables=len(raw_tables),
|
136
263
|
total_columns=all_columns,
|
137
264
|
total_relationships_found=len(relationships),
|
138
265
|
processing_time_ms=int((end - start) * 1000),
|
266
|
+
limited_by_timeout=status.get("limited_by_timeout", False),
|
267
|
+
limited_by_max_relationships=status.get("limited_by_max_relationships", False),
|
268
|
+
limited_by_table_cap=limited_by_table_cap,
|
269
|
+
notes=" ".join(notes) if notes else None,
|
139
270
|
)
|
140
271
|
|
141
272
|
return RelationshipDiscoveryResult(
|
142
273
|
relationships=relationships,
|
143
|
-
tables=[table for _, table in
|
274
|
+
tables=[table for _, table in raw_tables],
|
144
275
|
summary=summary,
|
145
276
|
)
|
146
277
|
|
147
278
|
|
279
|
+
def discover_relationships_from_table_definitions(
    table_definitions: Sequence[Mapping[str, Any]],
    *,
    default_workspace: str = "OFFLINE",
    default_schema: str = "PUBLIC",
    strict_join_inference: bool = False,
    session: Optional[Session] = None,
    max_relationships: Optional[int] = None,
    min_confidence: float = 0.5,
    timeout_seconds: Optional[float] = 15.0,
    max_tables: Optional[int] = None,
) -> RelationshipDiscoveryResult:
    """
    Discover relationships from plain dictionary table descriptions.

    The mappings in ``table_definitions`` are first converted by
    ``_tables_payload_to_raw_tables`` (using ``default_workspace`` and
    ``default_schema``); the converted tables and every remaining option are
    then handed to ``discover_relationships_from_tables``, whose result is
    returned unchanged.
    """
    return discover_relationships_from_tables(
        _tables_payload_to_raw_tables(
            table_definitions,
            default_workspace=default_workspace,
            default_schema=default_schema,
        ),
        strict_join_inference=strict_join_inference,
        session=session,
        max_relationships=max_relationships,
        min_confidence=min_confidence,
        timeout_seconds=timeout_seconds,
        max_tables=max_tables,
    )
|
308
|
+
|
309
|
+
|
148
310
|
def discover_relationships_from_schema(
|
149
311
|
session: Session,
|
150
312
|
workspace: str,
|
@@ -154,6 +316,10 @@ def discover_relationships_from_schema(
|
|
154
316
|
sample_values_per_column: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
|
155
317
|
strict_join_inference: bool = False,
|
156
318
|
max_workers: int = DEFAULT_MAX_WORKERS,
|
319
|
+
max_relationships: Optional[int] = None,
|
320
|
+
min_confidence: float = 0.5,
|
321
|
+
timeout_seconds: Optional[float] = 30.0,
|
322
|
+
max_tables: Optional[int] = 60,
|
157
323
|
) -> RelationshipDiscoveryResult:
|
158
324
|
"""
|
159
325
|
Discover table relationships for all tables in a ClickZetta schema.
|
@@ -199,4 +365,8 @@ def discover_relationships_from_schema(
|
|
199
365
|
raw_tables,
|
200
366
|
strict_join_inference=strict_join_inference,
|
201
367
|
session=session,
|
368
|
+
max_relationships=max_relationships,
|
369
|
+
min_confidence=min_confidence,
|
370
|
+
timeout_seconds=timeout_seconds,
|
371
|
+
max_tables=max_tables,
|
202
372
|
)
|
@@ -5,6 +5,7 @@ import sqlglot
|
|
5
5
|
|
6
6
|
from semantic_model_generator.data_processing.cte_utils import (
|
7
7
|
ClickzettaDialect,
|
8
|
+
_prepare_sql_for_parsing,
|
8
9
|
_enrich_column_in_expr_with_aggregation,
|
9
10
|
_get_col_expr,
|
10
11
|
_validate_col,
|
@@ -304,7 +305,7 @@ class SemanticModelTest(TestCase):
|
|
304
305
|
col_format_tbl = get_test_table_col_format()
|
305
306
|
got = generate_select(col_format_tbl, 100)
|
306
307
|
want = [
|
307
|
-
"WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM db
|
308
|
+
"WITH __t1 AS (SELECT d1_expr AS d1, d2_expr AS d2 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
|
308
309
|
]
|
309
310
|
assert got == want
|
310
311
|
|
@@ -312,8 +313,8 @@ class SemanticModelTest(TestCase):
|
|
312
313
|
col_format_tbl = get_test_table_col_format_w_agg()
|
313
314
|
got = generate_select(col_format_tbl, 100)
|
314
315
|
want = [
|
315
|
-
"WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db
|
316
|
-
"WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM db
|
316
|
+
"WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
|
317
|
+
"WITH __t1 AS (SELECT d1_expr AS d1, SUM(d3) OVER (PARTITION BY d1) AS d3 FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100",
|
317
318
|
]
|
318
319
|
assert sorted(got) == sorted(want)
|
319
320
|
|
@@ -321,7 +322,7 @@ class SemanticModelTest(TestCase):
|
|
321
322
|
col_format_tbl = get_test_table_col_format_w_agg_only()
|
322
323
|
got = generate_select(col_format_tbl, 100)
|
323
324
|
want = [
|
324
|
-
"WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM db
|
325
|
+
"WITH __t1 AS (SELECT SUM(d2) AS d2_total FROM `db`.`sc`.`t1`) SELECT * FROM __t1 LIMIT 100"
|
325
326
|
]
|
326
327
|
assert sorted(got) == sorted(want)
|
327
328
|
|
@@ -437,21 +438,21 @@ class SemanticModelTest(TestCase):
|
|
437
438
|
want = """WITH __t1 AS (SELECT
|
438
439
|
d1_expr AS d1,
|
439
440
|
d2_expr AS d2
|
440
|
-
FROM db
|
441
|
+
FROM `db`.`sc`.`t1`
|
441
442
|
), __t2 AS (
|
442
443
|
SELECT
|
443
444
|
td1_expr AS td1,
|
444
445
|
m1_expr AS m1,
|
445
446
|
m1_expr AS m2,
|
446
447
|
m3_expr
|
447
|
-
FROM db
|
448
|
+
FROM `db`.`sc`.`t2`
|
448
449
|
)
|
449
450
|
SELECT
|
450
451
|
*
|
451
452
|
FROM __t2"""
|
452
|
-
assert sqlglot.parse_one(
|
453
|
-
|
454
|
-
)
|
453
|
+
assert sqlglot.parse_one(
|
454
|
+
_prepare_sql_for_parsing(want), ClickzettaDialect
|
455
|
+
) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)
|
455
456
|
|
456
457
|
def test_expand_all_logical_tables_as_ctes_with_column_renaming(self) -> None:
|
457
458
|
ctx = semantic_model_pb2.SemanticModel(
|
@@ -465,12 +466,12 @@ FROM __t2"""
|
|
465
466
|
clcks AS clicks,
|
466
467
|
clcks,
|
467
468
|
cst
|
468
|
-
FROM db
|
469
|
+
FROM `db`.`sc`.`t1`
|
469
470
|
)
|
470
471
|
SELECT
|
471
472
|
*
|
472
473
|
FROM __t1
|
473
474
|
"""
|
474
|
-
assert sqlglot.parse_one(
|
475
|
-
|
476
|
-
)
|
475
|
+
assert sqlglot.parse_one(
|
476
|
+
_prepare_sql_for_parsing(want), ClickzettaDialect
|
477
|
+
) == sqlglot.parse_one(_prepare_sql_for_parsing(got), ClickzettaDialect)
|
@@ -6,6 +6,7 @@ import pandas as pd
|
|
6
6
|
|
7
7
|
from semantic_model_generator.relationships.discovery import (
|
8
8
|
discover_relationships_from_schema,
|
9
|
+
discover_relationships_from_table_definitions,
|
9
10
|
)
|
10
11
|
|
11
12
|
|
@@ -112,3 +113,66 @@ def test_discover_relationships_from_schema_builds_relationships():
|
|
112
113
|
right_tables = {rel.right_table for rel in result.relationships}
|
113
114
|
assert "ORDERS" in left_tables
|
114
115
|
assert "CUSTOMER" in right_tables
|
116
|
+
|
117
|
+
|
118
|
+
def test_discover_relationships_from_table_definitions_allows_manual_metadata() -> None:
    """Hand-written table dictionaries should produce an ORDERS/CUSTOMERS relationship."""
    orders_table = {
        "table_name": "orders",
        "columns": [
            {"name": "order_id", "type": "NUMBER", "is_primary_key": True},
            {"name": "customer_id", "type": "NUMBER"},
        ],
    }
    customers_table = {
        "table_name": "customers",
        "columns": [
            {"name": "customer_id", "type": "NUMBER", "is_primary_key": True},
            {"name": "name", "type": "STRING"},
        ],
    }

    outcome = discover_relationships_from_table_definitions(
        [orders_table, customers_table],
        default_workspace="demo",
        default_schema="sales",
        max_relationships=5,
        timeout_seconds=5.0,
    )

    assert outcome.summary.total_tables == 2
    assert outcome.summary.total_relationships_found >= 1
    assert not outcome.summary.limited_by_timeout

    # At least one discovered relationship must link ORDERS (left) to CUSTOMERS (right).
    table_pairs = [(rel.left_table, rel.right_table) for rel in outcome.relationships]
    assert ("ORDERS", "CUSTOMERS") in table_pairs
|
151
|
+
|
152
|
+
|
153
|
+
def test_discover_relationships_from_table_definitions_filters_generic_ids() -> None:
    """Tables sharing only ``id``/``value`` columns yield no relationship at min_confidence=0.6."""

    def _generic_table(table_name: str) -> dict:
        # Both tables use the same column layout: a primary-key "id" plus "value".
        return {
            "table_name": table_name,
            "columns": [
                {"name": "id", "type": "NUMBER", "is_primary_key": True},
                {"name": "value", "type": "NUMBER"},
            ],
        }

    outcome = discover_relationships_from_table_definitions(
        [_generic_table("table_a"), _generic_table("table_b")],
        min_confidence=0.6,
        max_relationships=5,
    )

    assert outcome.summary.total_relationships_found == 0
    assert not outcome.relationships
|
File without changes
|
File without changes
|