clickzetta-semantic-model-generator 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/PKG-INFO +1 -1
  2. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/pyproject.toml +1 -1
  3. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/clickzetta_connector.py +63 -42
  4. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/utils.py +44 -2
  5. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils.py +43 -13
  6. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/generate_model.py +126 -18
  7. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/relationships/__init__.py +2 -0
  8. clickzetta_semantic_model_generator-1.0.5/semantic_model_generator/relationships/discovery.py +402 -0
  9. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/cte_utils_test.py +14 -13
  10. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationship_discovery_test.py +108 -0
  11. clickzetta_semantic_model_generator-1.0.3/semantic_model_generator/relationships/discovery.py +0 -202
  12. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/LICENSE +0 -0
  13. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/README.md +0 -0
  14. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/__init__.py +0 -0
  15. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/clickzetta_utils/env_vars.py +0 -0
  16. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/__init__.py +0 -0
  17. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/cte_utils_test.py +0 -0
  18. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/data_types.py +0 -0
  19. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/data_processing/proto_utils.py +0 -0
  20. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/__init__.py +0 -0
  21. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/dashscope_client.py +0 -0
  22. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/enrichment.py +0 -0
  23. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/llm/progress_tracker.py +0 -0
  24. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/output_models/.keep +0 -0
  25. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model.proto +0 -0
  26. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.py +0 -0
  27. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/protos/semantic_model_pb2.pyi +0 -0
  28. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/clickzetta_connector_test.py +0 -0
  29. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/generate_model_classification_test.py +0 -0
  30. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/llm_enrichment_test.py +0 -0
  31. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/relationships_filters_test.py +0 -0
  32. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/samples/validate_yamls.py +0 -0
  33. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/utils_test.py +0 -0
  34. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/validate_model_test.py +0 -0
  35. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/tests/yaml_to_semantic_model_test.py +0 -0
  36. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/context_length.py +0 -0
  37. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/keywords.py +0 -0
  38. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate/schema.py +0 -0
  39. {clickzetta_semantic_model_generator-1.0.3 → clickzetta_semantic_model_generator-1.0.5}/semantic_model_generator/validate_model.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: clickzetta-semantic-model-generator
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Curate a Semantic Model for ClickZetta Lakehouse
5
5
  License: Apache Software License; BSD License
6
6
  Author: qililiang
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "clickzetta-semantic-model-generator"
3
- version = "1.0.3"
3
+ version = "1.0.5"
4
4
  description = "Curate a Semantic Model for ClickZetta Lakehouse"
5
5
  authors = ["qililiang <qililiang@clickzetta.com>"]
6
6
  license = "Apache Software License; BSD License"
@@ -11,7 +11,12 @@ from clickzetta.zettapark.session import Session
11
11
  from loguru import logger
12
12
 
13
13
  from semantic_model_generator.clickzetta_utils import env_vars
14
- from semantic_model_generator.clickzetta_utils.utils import create_session
14
+ from semantic_model_generator.clickzetta_utils.utils import (
15
+ create_session,
16
+ join_quoted_identifiers,
17
+ normalize_identifier,
18
+ quote_identifier,
19
+ )
15
20
  from semantic_model_generator.data_processing.data_types import Column, Table
16
21
 
17
22
  ConnectionType = TypeVar("ConnectionType", bound=Session)
@@ -151,18 +156,8 @@ class ClickzettaConnectionProxy:
151
156
  self.session.close()
152
157
 
153
158
 
154
- def _quote_identifier(name: str) -> str:
155
- return f'"{name}"'
156
-
157
-
158
159
  def _qualify_table(workspace: str, schema_name: str, table_name: str) -> str:
159
- return ".".join(
160
- [
161
- _quote_identifier(workspace),
162
- _quote_identifier(schema_name),
163
- _quote_identifier(table_name),
164
- ]
165
- )
160
+ return join_quoted_identifiers(workspace, schema_name, table_name)
166
161
 
167
162
 
168
163
  def _value_is_true(value: Any) -> bool:
@@ -175,11 +170,9 @@ def _value_is_true(value: Any) -> bool:
175
170
 
176
171
 
177
172
  def _sanitize_identifier(value: Any, fallback: str = "") -> str:
178
- if value is None or value == "":
173
+ normalized = normalize_identifier(value)
174
+ if not normalized:
179
175
  return fallback
180
- normalized = str(value).strip()
181
- if normalized.startswith('"') and normalized.endswith('"') and len(normalized) >= 2:
182
- normalized = normalized[1:-1]
183
176
  return normalized
184
177
 
185
178
 
@@ -216,21 +209,19 @@ def _fetch_distinct_values(
216
209
  column_name: str,
217
210
  ndv: int,
218
211
  ) -> Optional[List[str]]:
219
- workspace_part = (
220
- _sanitize_identifier(workspace, workspace).upper() if workspace else ""
221
- )
212
+ workspace_part = _sanitize_identifier(workspace, workspace) if workspace else ""
222
213
  schema_part = (
223
- _sanitize_identifier(schema_name, schema_name).upper() if schema_name else ""
214
+ _sanitize_identifier(schema_name, schema_name) if schema_name else ""
224
215
  )
225
- table_part = _sanitize_identifier(table_name, table_name).upper()
226
- column_part = _sanitize_identifier(column_name, column_name).upper()
216
+ table_part = _sanitize_identifier(table_name, table_name)
217
+ column_part = _sanitize_identifier(column_name, column_name)
227
218
 
228
- qualified_parts = [
229
- part for part in (workspace_part, schema_part, table_part) if part
230
- ]
231
- qualified_table = ".".join(qualified_parts)
219
+ qualified_table = join_quoted_identifiers(
220
+ workspace_part, schema_part, table_part
221
+ )
222
+ column_expr = quote_identifier(column_part)
232
223
 
233
- query = f"SELECT DISTINCT {column_part} FROM {qualified_table} LIMIT {ndv}"
224
+ query = f"SELECT DISTINCT {column_expr} FROM {qualified_table} LIMIT {ndv}"
234
225
  try:
235
226
  df = session.sql(query).to_pandas()
236
227
  if df.empty:
@@ -489,15 +480,30 @@ def _fetch_columns_via_show(
489
480
  return pd.DataFrame()
490
481
 
491
482
  rows: List[pd.DataFrame] = []
492
- catalog = workspace.upper()
493
- schema = table_schema.upper() if table_schema else ""
483
+ category = _catalog_category(session, workspace)
484
+ is_shared_catalog = category in {"SHARED", "EXTERNAL"}
485
+ catalog = workspace if is_shared_catalog else workspace.upper()
486
+ schema = (
487
+ table_schema or ""
488
+ )
489
+ if schema and not is_shared_catalog:
490
+ schema = schema.upper()
494
491
 
495
492
  for table_name in table_names:
496
493
  qualified_parts = [
497
- part for part in (catalog, schema, table_name.upper()) if part
494
+ part
495
+ for part in (
496
+ catalog,
497
+ schema,
498
+ table_name.upper() if not is_shared_catalog else table_name,
499
+ )
500
+ if part
498
501
  ]
499
502
  qualified_table = ".".join(qualified_parts)
500
- query = f"SHOW COLUMNS IN {qualified_table}"
503
+ if is_shared_catalog:
504
+ query = f"SHOW COLUMNS IN SHARE {qualified_table}"
505
+ else:
506
+ query = f"SHOW COLUMNS IN {qualified_table}"
501
507
  try:
502
508
  df = session.sql(query).to_pandas()
503
509
  except Exception as exc:
@@ -655,14 +661,25 @@ def fetch_tables_views_in_schema(
655
661
  parts = schema_name.split(".", maxsplit=1)
656
662
  workspace = parts[0]
657
663
  schema = parts[1] if len(parts) > 1 else ""
658
- workspace_upper = workspace.upper()
659
- schema_upper = schema.upper()
664
+ category = _catalog_category(session, workspace)
665
+ is_shared_catalog = category in {"SHARED", "EXTERNAL"}
666
+
667
+ workspace_token = workspace if is_shared_catalog else workspace.upper()
668
+ schema_token = schema if is_shared_catalog else schema.upper()
660
669
 
661
670
  try:
662
- if workspace_upper and schema_upper:
663
- df = session.sql(
664
- f"SHOW TABLES IN {workspace_upper}.{schema_upper}"
665
- ).to_pandas()
671
+ if workspace_token and schema_token:
672
+ if is_shared_catalog:
673
+ scope = ".".join(
674
+ part for part in (workspace_token, schema_token) if part
675
+ )
676
+ df = session.sql(f"SHOW TABLES IN SHARE {scope}").to_pandas()
677
+ else:
678
+ scope = join_quoted_identifiers(
679
+ workspace_token,
680
+ schema_token,
681
+ )
682
+ df = session.sql(f"SHOW TABLES IN {scope}").to_pandas()
666
683
  else:
667
684
  df = session.sql("SHOW TABLES").to_pandas()
668
685
  except Exception as exc: # pragma: no cover
@@ -738,11 +755,15 @@ def fetch_stages_in_schema(connection: Any, schema_name: str) -> List[str]:
738
755
 
739
756
  queries: List[str] = []
740
757
  if schema:
741
- queries.append(f"SHOW VOLUMES IN {workspace}.{schema}")
742
- queries.append(f"SHOW STAGES IN SCHEMA {workspace}.{schema}")
758
+ scope = join_quoted_identifiers(workspace, schema)
759
+ if scope:
760
+ queries.append(f"SHOW VOLUMES IN {scope}")
761
+ queries.append(f"SHOW STAGES IN SCHEMA {scope}")
743
762
  else:
744
- queries.append(f"SHOW VOLUMES IN {workspace}")
745
- queries.append(f"SHOW STAGES IN DATABASE {workspace}")
763
+ workspace_identifier = quote_identifier(workspace)
764
+ if workspace_identifier:
765
+ queries.append(f"SHOW VOLUMES IN {workspace_identifier}")
766
+ queries.append(f"SHOW STAGES IN DATABASE {workspace_identifier}")
746
767
 
747
768
  stage_names: List[str] = ["volume:user://~/semantic_models/"]
748
769
  seen: set[str] = set(stage_names)
@@ -899,7 +920,7 @@ def create_table_in_schema(
899
920
  columns_schema: Dict[str, str],
900
921
  ) -> bool:
901
922
  fields = ", ".join(
902
- f"{_quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
923
+ f"{quote_identifier(name)} {dtype}" for name, dtype in columns_schema.items()
903
924
  )
904
925
  query = f"CREATE TABLE IF NOT EXISTS {table_fqn} ({fields})"
905
926
  try:
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from contextlib import contextmanager
4
- from typing import Dict, Iterable
4
+ from typing import Any, Dict, Iterable
5
5
 
6
6
  from clickzetta.zettapark.session import Session
7
7
 
@@ -21,6 +21,47 @@ DEFAULT_HINTS: Dict[str, str] = {
21
21
  }
22
22
 
23
23
 
24
+ def normalize_identifier(value: Any) -> str:
25
+ """
26
+ Strips outer quotes/backticks and surrounding whitespace from an identifier.
27
+ Returns an empty string when the identifier is missing.
28
+ """
29
+
30
+ if value is None:
31
+ return ""
32
+ text = str(value).strip()
33
+ if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', '`'}:
34
+ return text[1:-1]
35
+ return text
36
+
37
+
38
+ def quote_identifier(value: Any) -> str:
39
+ """
40
+ Wraps an identifier in backticks, escaping embedded backticks as needed.
41
+ Returns an empty string if the identifier is missing.
42
+ """
43
+
44
+ normalized = normalize_identifier(value)
45
+ if not normalized:
46
+ return ""
47
+ escaped = normalized.replace("`", "``")
48
+ return f"`{escaped}`"
49
+
50
+
51
+ def join_quoted_identifiers(*parts: Any) -> str:
52
+ """
53
+ Joins identifier parts with '.' and ensures each segment is backtick-quoted.
54
+ Empty segments are skipped.
55
+ """
56
+
57
+ quoted_parts = [
58
+ quote_identifier(part)
59
+ for part in parts
60
+ if normalize_identifier(part)
61
+ ]
62
+ return ".".join(part for part in quoted_parts if part)
63
+
64
+
24
65
  def create_fqn_table(fqn_str: str) -> FQNParts:
25
66
  """
26
67
  Splits a fully qualified table name into its ClickZetta components.
@@ -72,7 +113,8 @@ def _apply_session_context(session: Session, *, schema: str, vcluster: str) -> N
72
113
  ("schema", schema),
73
114
  ("vcluster", vcluster),
74
115
  ):
75
- session.sql(f"USE {component.upper()} {value.upper()}")
116
+ identifier = quote_identifier(value)
117
+ session.sql(f"USE {component.upper()} {identifier}")
76
118
 
77
119
 
78
120
  def _iter_non_empty(*pairs: tuple[str, str]) -> Iterable[tuple[str, str]]:
@@ -11,12 +11,34 @@ from sqlglot import Dialect
11
11
  from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
12
12
  OBJECT_DATATYPES,
13
13
  )
14
+ from semantic_model_generator.clickzetta_utils.utils import (
15
+ join_quoted_identifiers,
16
+ normalize_identifier,
17
+ )
14
18
  from semantic_model_generator.protos import semantic_model_pb2
15
19
 
16
20
  _SQLGLOT_CLICKZETTA_KEY = "".join(["snow", "flake"])
17
21
  ClickzettaDialect = Dialect.get_or_raise(_SQLGLOT_CLICKZETTA_KEY)
18
22
 
19
23
  _LOGICAL_TABLE_PREFIX = "__"
24
+ _SQLGLOT_QUOTE_CHAR = '"'
25
+
26
+
27
+ def _prepare_sql_for_parsing(sql: str) -> str:
28
+ """
29
+ Converts backtick-quoted identifiers to double quotes for SQLGlot parsing.
30
+ """
31
+
32
+ return sql.replace("`", _SQLGLOT_QUOTE_CHAR)
33
+
34
+
35
+ def _render_clickzetta_sql(expression: sqlglot.Expression, *, pretty: bool = False) -> str:
36
+ """
37
+ Renders a SQLGlot expression using ClickZetta dialect and rewrites identifiers with backticks.
38
+ """
39
+
40
+ rendered = expression.sql(dialect=ClickzettaDialect, pretty=pretty)
41
+ return rendered.replace(_SQLGLOT_QUOTE_CHAR, "`")
20
42
 
21
43
 
22
44
  def is_logical_table(table_name: str) -> bool:
@@ -33,12 +55,12 @@ def logical_table_name(table: semantic_model_pb2.Table) -> str:
33
55
 
34
56
  def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
35
57
  """Returns fully qualified table name such as my_db.my_schema.my_table"""
36
- fqn = table.table
37
- if len(table.schema) > 0:
38
- fqn = f"{table.schema}.{fqn}"
39
- if len(table.database) > 0:
40
- fqn = f"{table.database}.{fqn}"
41
- return fqn # type: ignore[no-any-return]
58
+ parts = [
59
+ normalize_identifier(component)
60
+ for component in (table.database, table.schema, table.table)
61
+ if component
62
+ ]
63
+ return join_quoted_identifiers(*parts) # type: ignore[no-any-return]
42
64
 
43
65
 
44
66
  def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
@@ -156,8 +178,8 @@ def _generate_cte_for(
156
178
  cte = f"WITH {logical_table_name(table)} AS (\n"
157
179
  cte += "SELECT \n"
158
180
  cte += ",\n".join(expr_columns) + "\n"
159
- cte += f"FROM {fully_qualified_table_name(table.base_table)}"
160
- cte += ")"
181
+ cte += f"FROM {fully_qualified_table_name(table.base_table)}\n"
182
+ cte += ")\n"
161
183
  return cte
162
184
 
163
185
 
@@ -261,13 +283,15 @@ def _convert_to_clickzetta_sql(sql: str) -> str:
261
283
  str: The SQL statement in ClickZetta syntax.
262
284
  """
263
285
  try:
264
- expression = sqlglot.parse_one(sql, dialect=ClickzettaDialect)
286
+ expression = sqlglot.parse_one(
287
+ _prepare_sql_for_parsing(sql), dialect=ClickzettaDialect
288
+ )
265
289
  except Exception as e:
266
290
  raise ValueError(
267
291
  f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
268
292
  )
269
293
 
270
- return expression.sql(dialect=ClickzettaDialect)
294
+ return _render_clickzetta_sql(expression)
271
295
 
272
296
 
273
297
  def generate_select(
@@ -332,12 +356,16 @@ def expand_all_logical_tables_as_ctes(
332
356
  for cte in ctes:
333
357
  new_withs.append(
334
358
  sqlglot.parse_one(
335
- cte, read=ClickzettaDialect, into=sqlglot.expressions.With
359
+ _prepare_sql_for_parsing(cte),
360
+ read=ClickzettaDialect,
361
+ into=sqlglot.expressions.With,
336
362
  )
337
363
  )
338
364
 
339
365
  # Step 3: Prefix the CTEs to the original query.
340
- ast = sqlglot.parse_one(sql_query, read=ClickzettaDialect)
366
+ ast = sqlglot.parse_one(
367
+ _prepare_sql_for_parsing(sql_query), read=ClickzettaDialect
368
+ )
341
369
  with_ = ast.args.get("with")
342
370
  # If the query doesn't have a WITH clause, then generate one.
343
371
  if with_ is None:
@@ -349,7 +377,9 @@ def expand_all_logical_tables_as_ctes(
349
377
  else:
350
378
  new_ctes = [w.expressions[0] for w in new_withs]
351
379
  with_.set("expressions", new_ctes + with_.expressions)
352
- return ast.sql(dialect=ClickzettaDialect, pretty=True) # type: ignore [no-any-return]
380
+ return _render_clickzetta_sql(
381
+ ast, pretty=True
382
+ ) # type: ignore [no-any-return]
353
383
 
354
384
 
355
385
  def context_to_column_format(
@@ -1,6 +1,7 @@
1
1
  import math
2
2
  import os
3
3
  import re
4
+ import time
4
5
  from collections import defaultdict
5
6
  from datetime import datetime
6
7
  from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -17,7 +18,12 @@ from semantic_model_generator.clickzetta_utils.clickzetta_connector import (
17
18
  get_table_representation,
18
19
  get_valid_schemas_tables_columns_df,
19
20
  )
20
- from semantic_model_generator.clickzetta_utils.utils import create_fqn_table
21
+ from semantic_model_generator.clickzetta_utils.utils import (
22
+ create_fqn_table,
23
+ join_quoted_identifiers,
24
+ normalize_identifier,
25
+ quote_identifier,
26
+ )
21
27
  from semantic_model_generator.data_processing import data_types, proto_utils
22
28
  from semantic_model_generator.llm import (
23
29
  DashscopeClient,
@@ -41,6 +47,14 @@ _AUTOGEN_COMMENT_TOKEN = (
41
47
  )
42
48
  _DEFAULT_N_SAMPLE_VALUES_PER_COL = 10
43
49
  _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
50
+ _GENERIC_IDENTIFIER_TOKENS = {
51
+ "ID",
52
+ "NAME",
53
+ "CODE",
54
+ "KEY",
55
+ "VALUE",
56
+ "NUMBER",
57
+ }
44
58
 
45
59
 
46
60
  def _singularize(token: str) -> str:
@@ -90,6 +104,14 @@ def _identifier_tokens(
90
104
  return tokens
91
105
 
92
106
 
107
+ def _is_generic_identifier(name: str) -> bool:
108
+ tokens = [token for token in _identifier_tokens(name) if token]
109
+ if not tokens:
110
+ return True
111
+ normalized_tokens = {token.upper() for token in tokens}
112
+ return normalized_tokens.issubset(_GENERIC_IDENTIFIER_TOKENS)
113
+
114
+
93
115
  def _sanitize_identifier_name(
94
116
  name: str, prefixes_to_drop: Optional[set[str]] = None
95
117
  ) -> str:
@@ -354,19 +376,17 @@ def _format_literal(value: str, base_type: str) -> str:
354
376
 
355
377
  def _format_sql_identifier(name: str) -> str:
356
378
  """
357
- Formats an identifier for SQL (without quoting) by stripping quotes and uppercasing.
379
+ Formats an identifier for SQL by wrapping it in backticks.
358
380
  """
359
- if not name:
360
- return ""
361
- return str(name).replace('"', "").replace("`", "").strip().upper()
381
+ return quote_identifier(name)
362
382
 
363
383
 
364
384
  def _qualified_table_name(fqn: data_types.FQNParts) -> str:
365
385
  """
366
- Builds a fully qualified table name without quoting.
386
+ Builds a fully qualified, backtick-quoted table name.
367
387
  """
368
- parts = [part for part in (fqn.database, fqn.schema_name, fqn.table) if part]
369
- return ".".join(_format_sql_identifier(part) for part in parts if part)
388
+ parts = [normalize_identifier(part) for part in (fqn.database, fqn.schema_name, fqn.table)]
389
+ return join_quoted_identifiers(*(part for part in parts if part))
370
390
 
371
391
 
372
392
  def _levenshtein_distance(s1: str, s2: str) -> int:
@@ -977,6 +997,19 @@ def _calculate_relationship_confidence(
977
997
 
978
998
  confidence_score += name_confidence
979
999
 
1000
+ generic_pair_count = sum(
1001
+ 1
1002
+ for left_col, right_col in column_pairs
1003
+ if _is_generic_identifier(left_col)
1004
+ and _is_generic_identifier(right_col)
1005
+ )
1006
+ if generic_pair_count:
1007
+ penalty = min(0.15 * generic_pair_count, 0.3)
1008
+ confidence_score = max(confidence_score - penalty, 0.0)
1009
+ reasoning_factors.append(
1010
+ f"Generic identifier names detected on both sides (-{penalty:.2f} confidence)"
1011
+ )
1012
+
980
1013
  # Check for foreign key naming patterns
981
1014
  fk_pattern_confidence = 0.0
982
1015
  for left_col, right_col in column_pairs:
@@ -2326,11 +2359,31 @@ def _infer_relationships(
2326
2359
  *,
2327
2360
  session: Optional[Session] = None,
2328
2361
  strict_join_inference: bool = False,
2362
+ status: Optional[Dict[str, bool]] = None,
2363
+ max_relationships: Optional[int] = None,
2364
+ min_confidence: float = 0.2,
2365
+ timeout_seconds: Optional[float] = None,
2329
2366
  ) -> List[semantic_model_pb2.Relationship]:
2367
+ status_dict = status if status is not None else {}
2368
+ if "limited_by_timeout" not in status_dict:
2369
+ status_dict["limited_by_timeout"] = False
2370
+ if "limited_by_max_relationships" not in status_dict:
2371
+ status_dict["limited_by_max_relationships"] = False
2372
+
2330
2373
  relationships: List[semantic_model_pb2.Relationship] = []
2331
2374
  if not raw_tables:
2332
2375
  return relationships
2333
2376
 
2377
+ start_time = time.perf_counter()
2378
+ min_confidence = max(0.0, min(min_confidence, 1.0))
2379
+ limit_reached = False
2380
+
2381
+ def _timed_out() -> bool:
2382
+ return (
2383
+ timeout_seconds is not None
2384
+ and (time.perf_counter() - start_time) >= timeout_seconds
2385
+ )
2386
+
2334
2387
  metadata = {}
2335
2388
  prefix_counter: Dict[str, int] = {}
2336
2389
  for _, raw_table in raw_tables:
@@ -2392,14 +2445,39 @@ def _infer_relationships(
2392
2445
  def _record_pair(
2393
2446
  left_table: str, right_table: str, left_col: str, right_col: str
2394
2447
  ) -> None:
2448
+ nonlocal limit_reached
2449
+ if limit_reached:
2450
+ return
2451
+ if _timed_out():
2452
+ status_dict["limited_by_timeout"] = True
2453
+ limit_reached = True
2454
+ return
2455
+
2395
2456
  key = (left_table, right_table)
2396
2457
  value = (left_col, right_col)
2397
- if value not in pairs.setdefault(key, []):
2398
- pairs[key].append(value)
2458
+ bucket = pairs.setdefault(key, [])
2459
+ if value not in bucket:
2460
+ bucket.append(value)
2461
+ if (
2462
+ max_relationships is not None
2463
+ and len(pairs) >= max_relationships
2464
+ ):
2465
+ status_dict["limited_by_max_relationships"] = True
2466
+ limit_reached = True
2399
2467
 
2400
2468
  table_names = list(metadata.keys())
2401
2469
  for i in range(len(table_names)):
2470
+ if limit_reached or status_dict["limited_by_timeout"]:
2471
+ break
2472
+ if _timed_out():
2473
+ status_dict["limited_by_timeout"] = True
2474
+ break
2402
2475
  for j in range(i + 1, len(table_names)):
2476
+ if limit_reached or status_dict["limited_by_timeout"]:
2477
+ break
2478
+ if _timed_out():
2479
+ status_dict["limited_by_timeout"] = True
2480
+ break
2403
2481
  table_a_name = table_names[i]
2404
2482
  table_b_name = table_names[j]
2405
2483
  table_a = metadata[table_a_name]
@@ -2575,6 +2653,15 @@ def _infer_relationships(
2575
2653
 
2576
2654
  # Build relationships with inferred cardinality
2577
2655
  for (left_table, right_table), column_pairs in pairs.items():
2656
+ if _timed_out():
2657
+ status_dict["limited_by_timeout"] = True
2658
+ break
2659
+ if (
2660
+ max_relationships is not None
2661
+ and len(relationships) >= max_relationships
2662
+ ):
2663
+ status_dict["limited_by_max_relationships"] = True
2664
+ break
2578
2665
  # Infer cardinality based on available metadata
2579
2666
  left_meta = metadata[left_table]
2580
2667
  right_meta = metadata[right_table]
@@ -2777,6 +2864,16 @@ def _infer_relationships(
2777
2864
  for factor in confidence_analysis["reasoning_factors"][:3]: # Top 3 factors
2778
2865
  logger.debug(f" + {factor}")
2779
2866
 
2867
+ if confidence_analysis["confidence_score"] < min_confidence:
2868
+ logger.debug(
2869
+ "Dropping relationship {} -> {} due to low confidence {:.2f} (threshold {:.2f})",
2870
+ left_table,
2871
+ right_table,
2872
+ confidence_analysis["confidence_score"],
2873
+ min_confidence,
2874
+ )
2875
+ continue
2876
+
2780
2877
  # Determine relationship type based on cardinality
2781
2878
  if left_card == "1" and right_card == "1":
2782
2879
  rel_type = semantic_model_pb2.RelationshipType.one_to_one
@@ -2804,16 +2901,27 @@ def _infer_relationships(
2804
2901
  relationships.append(relationship)
2805
2902
 
2806
2903
  # Phase 2: Detect many-to-many relationships through bridge table analysis
2807
- many_to_many_relationships = _detect_many_to_many_relationships(
2808
- raw_tables, metadata, relationships
2809
- )
2810
-
2811
- if many_to_many_relationships:
2812
- relationships.extend(many_to_many_relationships)
2813
- logger.info(
2814
- f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
2904
+ many_to_many_relationships: List[semantic_model_pb2.Relationship] = []
2905
+ if not status_dict["limited_by_timeout"] and (
2906
+ max_relationships is None or len(relationships) < max_relationships
2907
+ ):
2908
+ many_to_many_relationships = _detect_many_to_many_relationships(
2909
+ raw_tables, metadata, relationships
2815
2910
  )
2816
2911
 
2912
+ if many_to_many_relationships and max_relationships is not None:
2913
+ remaining = max_relationships - len(relationships)
2914
+ if remaining <= 0:
2915
+ many_to_many_relationships = []
2916
+ else:
2917
+ many_to_many_relationships = many_to_many_relationships[:remaining]
2918
+
2919
+ if many_to_many_relationships:
2920
+ relationships.extend(many_to_many_relationships)
2921
+ logger.info(
2922
+ f"Detected {len(many_to_many_relationships)} many-to-many relationships via bridge tables"
2923
+ )
2924
+
2817
2925
  logger.info(
2818
2926
  f"Inferred {len(relationships)} total relationships across {len(raw_tables)} tables"
2819
2927
  )
@@ -4,6 +4,7 @@ from .discovery import (
4
4
  RelationshipDiscoveryResult,
5
5
  RelationshipSummary,
6
6
  discover_relationships_from_schema,
7
+ discover_relationships_from_table_definitions,
7
8
  discover_relationships_from_tables,
8
9
  )
9
10
 
@@ -11,5 +12,6 @@ __all__ = [
11
12
  "RelationshipDiscoveryResult",
12
13
  "RelationshipSummary",
13
14
  "discover_relationships_from_schema",
15
+ "discover_relationships_from_table_definitions",
14
16
  "discover_relationships_from_tables",
15
17
  ]