databao-context-engine 0.1.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
databao_context_engine/plugins/databases/clickhouse_introspector.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+from typing import Any, Mapping
+
+import clickhouse_connect
+from pydantic import Field
+
+from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
+from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
+from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
+from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+
+
+class ClickhouseConfigFile(BaseDatabaseConfigFile):
+    type: str = Field(default="databases/clickhouse")
+    connection: dict[str, Any] = Field(
+        description="Connection parameters for the Clickhouse database. It can contain any of the keys supported by the Clickhouse connection library (see https://clickhouse.com/docs/integrations/language-clients/python/driver-api#connection-arguments)"
+    )
+
+
+class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
+    _IGNORED_SCHEMAS = {"information_schema", "system", "INFORMATION_SCHEMA"}
+
+    supports_catalogs = True
+
+    def _connect(self, file_config: ClickhouseConfigFile):
+        connection = file_config.connection
+        if not isinstance(connection, Mapping):
+            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
+
+        return clickhouse_connect.get_client(**connection)
+
+    def _connect_to_catalog(self, file_config: ClickhouseConfigFile, catalog: str):
+        return self._connect(file_config)
+
+    def _get_catalogs(self, connection, file_config: ClickhouseConfigFile) -> list[str]:
+        return ["clickhouse"]
+
+    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
+        return SQLQuery(
+            """
+            SELECT
+                name AS schema_name,
+                'clickhouse' AS catalog_name
+            FROM
+                system.databases
+            """,
+            None,
+        )
+
+    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
+        if not schemas:
+            return []
+
+        schemas_sql = ", ".join(self._quote_literal(s) for s in schemas)
+
+        comps = self._component_queries()
+        results: dict[str, list[dict]] = {cq: [] for cq in comps}
+        for cq, template_sql in comps.items():
+            sql = template_sql.replace("{SCHEMAS}", schemas_sql)
+            results[cq] = self._fetchall_dicts(connection, sql, None)
+
+        return IntrospectionModelBuilder.build_schemas_from_components(
+            schemas=schemas,
+            rels=results.get("relations", []),
+            cols=results.get("columns", []),
+            pk_cols=[],
+            uq_cols=[],
+            checks=[],
+            fk_cols=[],
+            idx_cols=results.get("idx", []),
+        )
+
+    def _component_queries(self) -> dict[str, str]:
+        return {"relations": self._sql_relations(), "columns": self._sql_columns(), "idx": self._sql_indexes()}
+
+    def _sql_relations(self) -> str:
+        return r"""
+            SELECT
+                t.database AS schema_name,
+                t.name AS table_name,
+                multiIf(
+                    t.engine = 'View', 'view',
+                    t.engine = 'MaterializedView', 'materialized_view',
+                    t.engine IN (
+                        'File', 'URL',
+                        'S3', 'S3Queue',
+                        'AzureBlobStorage', 'AzureQueue',
+                        'HDFS',
+                        'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
+                        'JDBC', 'ODBC', 'SQLite',
+                        'Kafka', 'RabbitMQ', 'NATS',
+                        'ExternalDistributed',
+                        'DeltaLake', 'Iceberg', 'Hudi', 'Hive',
+                        'MaterializedPostgreSQL',
+                        'YTsaurus', 'ArrowFlight'
+                    ), 'external_table',
+                    'table'
+                ) AS kind,
+                t.comment AS description
+            FROM
+                system.tables t
+            WHERE
+                t.database IN ({SCHEMAS})
+            ORDER BY
+                t.name
+        """
+
+    def _sql_columns(self) -> str:
+        return r"""
+            SELECT
+                c.database AS schema_name,
+                c.table AS table_name,
+                c.name AS column_name,
+                c.position AS ordinal_position,
+                c.type AS data_type,
+                (c.type LIKE 'Nullable(%') AS is_nullable,
+                c.default_expression AS default_expression,
+                CASE
+                    WHEN c.default_kind IN ('MATERIALIZED','ALIAS') THEN 'computed'
+                END AS generated,
+                c.comment AS description
+            FROM
+                system.columns c
+            WHERE
+                c.database IN ({SCHEMAS})
+            ORDER BY
+                c.table,
+                c.position
+        """
+
+    def _sql_indexes(self) -> str:
+        return r"""
+            SELECT
+                i.database AS schema_name,
+                i.table AS table_name,
+                i.name AS index_name,
+                1 AS position,
+                i.expr AS expr,
+                0 AS is_unique,
+                i.type AS method,
+                NULL AS predicate
+            FROM
+                system.data_skipping_indices i
+            WHERE
+                i.database IN ({SCHEMAS})
+            ORDER BY
+                i.table,
+                i.name
+        """
+
+    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
+        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %s'
+        return SQLQuery(sql, (limit,))
+
+    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
+        res = connection.query(sql, parameters=params) if params else connection.query(sql)
+        cols = [c.lower() for c in res.column_names]
+        return [dict(zip(cols, row)) for row in res.result_rows]
+
+    def _quote_literal(self, value: str) -> str:
+        return "'" + str(value).replace("'", "\\'") + "'"
databao_context_engine/plugins/databases/database_chunker.py
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+
+from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
+from databao_context_engine.plugins.databases.databases_types import (
+    DatabaseColumn,
+    DatabaseIntrospectionResult,
+    DatabaseTable,
+)
+
+
+@dataclass
+class DatabaseTableChunkContent:
+    catalog_name: str
+    schema_name: str
+    table: DatabaseTable
+
+
+@dataclass
+class DatabaseColumnChunkContent:
+    catalog_name: str
+    schema_name: str
+    table_name: str
+    column: DatabaseColumn
+
+
+def build_database_chunks(result: DatabaseIntrospectionResult) -> list[EmbeddableChunk]:
+    chunks = []
+    for catalog in result.catalogs:
+        for schema in catalog.schemas:
+            for table in schema.tables:
+                chunks.append(_create_table_chunk(catalog.name, schema.name, table))
+
+                for column in table.columns:
+                    chunks.append(_create_column_chunk(catalog.name, schema.name, table.name, column))
+
+    return chunks
+
+
+def _create_table_chunk(catalog_name: str, schema_name: str, table: DatabaseTable) -> EmbeddableChunk:
+    return EmbeddableChunk(
+        embeddable_text=_build_table_chunk_text(table),
+        content=DatabaseTableChunkContent(
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            table=table,
+        ),
+    )
+
+
+def _create_column_chunk(
+    catalog_name: str, schema_name: str, table_name: str, column: DatabaseColumn
+) -> EmbeddableChunk:
+    return EmbeddableChunk(
+        embeddable_text=_build_column_chunk_text(table_name, column),
+        content=DatabaseColumnChunkContent(
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            table_name=table_name,
+            column=column,
+        ),
+    )
+
+
+def _build_table_chunk_text(database_table: DatabaseTable) -> str:
+    return f"Table {database_table.name} with columns {','.join([column.name for column in database_table.columns])}"
+
+
+def _build_column_chunk_text(table_name: str, database_object: DatabaseColumn) -> str:
+    return f"Column {database_object.name} in table {table_name}"
databao_context_engine/plugins/databases/databases_types.py
@@ -0,0 +1,114 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Literal
+
+
+class DatasetKind(str, Enum):
+    TABLE = "table"
+    VIEW = "view"
+    MATERIALIZED_VIEW = "materialized_view"
+    EXTERNAL_TABLE = "external_table"
+
+    @classmethod
+    def from_raw(cls, raw: str | None) -> "DatasetKind":
+        default = cls.TABLE
+        if raw is None:
+            return default
+        if isinstance(raw, str):
+            raw = raw.lower()
+        try:
+            return cls(raw)
+        except Exception:
+            return default
+
+
+@dataclass
+class CheckConstraint:
+    name: str | None
+    expression: str
+    validated: bool | None
+
+
+@dataclass
+class KeyConstraint:
+    name: str | None
+    columns: list[str]
+    validated: bool | None
+
+
+@dataclass
+class ForeignKeyColumnMap:
+    from_column: str
+    to_column: str
+
+
+@dataclass
+class ForeignKey:
+    name: str | None
+    mapping: list[ForeignKeyColumnMap]
+    referenced_table: str
+    enforced: bool | None = None
+    validated: bool | None = None
+    on_update: str | None = None
+    on_delete: str | None = None
+    cardinality_inferred: Literal["one_to_one", "many_to_one"] | None = None
+
+
+@dataclass
+class Index:
+    name: str
+    columns: list[str]
+    unique: bool = False
+    method: str | None = None
+    predicate: str | None = None
+
+
+@dataclass
+class DatabaseColumn:
+    name: str
+    type: str
+    nullable: bool
+    description: str | None = None
+    default_expression: str | None = None
+    generated: Literal["identity", "computed"] | None = None
+    checks: list[CheckConstraint] = field(default_factory=list)
+
+
+@dataclass
+class DatabasePartitionInfo:
+    meta: dict[str, Any]
+    partition_tables: list[str]
+
+
+@dataclass
+class DatabaseTable:
+    name: str
+    columns: list[DatabaseColumn]
+    samples: list[dict[str, Any]]
+    partition_info: DatabasePartitionInfo | None = None
+    description: str | None = None
+    kind: DatasetKind = DatasetKind.TABLE
+    primary_key: KeyConstraint | None = None
+    unique_constraints: list[KeyConstraint] = field(default_factory=list)
+    checks: list[CheckConstraint] = field(default_factory=list)
+    indexes: list[Index] = field(default_factory=list)
+    foreign_keys: list[ForeignKey] = field(default_factory=list)
+
+
+@dataclass
+class DatabaseSchema:
+    name: str
+    tables: list[DatabaseTable]
+    description: str | None = None
+
+
+@dataclass
+class DatabaseCatalog:
+    name: str
+    schemas: list[DatabaseSchema]
+    description: str | None = None
+
+
+@dataclass
+class DatabaseIntrospectionResult:
+    catalogs: list[DatabaseCatalog]
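DatasetKind.from_raw in the hunk above (databases_types.py) is a tolerant mapping from raw table-type strings to the enum: it lowercases string input and falls back to TABLE for unknown or missing values. A quick sketch of that behaviour with illustrative inputs:

from databao_context_engine.plugins.databases.databases_types import DatasetKind

print(DatasetKind.from_raw("VIEW"))               # DatasetKind.VIEW (input is lowercased first)
print(DatasetKind.from_raw("materialized_view"))  # DatasetKind.MATERIALIZED_VIEW
print(DatasetKind.from_raw("base table"))         # unknown value, falls back to DatasetKind.TABLE
print(DatasetKind.from_raw(None))                 # None, falls back to DatasetKind.TABLE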
databao_context_engine/plugins/databases/duckdb_introspector.py
@@ -0,0 +1,325 @@
+from __future__ import annotations
+
+import duckdb
+from pydantic import BaseModel, Field
+
+from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
+from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
+from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
+from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+
+
+class DuckDBConfigFile(BaseDatabaseConfigFile):
+    type: str = Field(default="databases/duckdb")
+    connection: DuckDBConnectionConfig
+
+
+class DuckDBConnectionConfig(BaseModel):
+    database: str = Field(description="Path to the DuckDB database file")
+
+
+class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
+    _IGNORED_CATALOGS = {"system", "temp"}
+    _IGNORED_SCHEMAS = {"information_schema", "pg_catalog"}
+    supports_catalogs = True
+
+    def _connect(self, file_config: DuckDBConfigFile):
+        database_path = str(file_config.connection.database)
+        return duckdb.connect(database=database_path)
+
+    def _connect_to_catalog(self, file_config: DuckDBConfigFile, catalog: str):
+        return self._connect(file_config)
+
+    def _get_catalogs(self, connection, file_config: DuckDBConfigFile) -> list[str]:
+        rows = self._fetchall_dicts(connection, "SELECT database_name FROM duckdb_databases();", None)
+        catalogs = [r["database_name"] for r in rows if r.get("database_name")]
+        catalogs_filtered = [c for c in catalogs if c.lower() not in self._IGNORED_CATALOGS]
+        return catalogs_filtered or [self._resolve_pseudo_catalog_name(file_config)]
+
+    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
+        if not catalogs:
+            return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
+        sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY(?)"
+        return SQLQuery(sql, (catalogs,))
+
+    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
+        if not schemas:
+            return []
+
+        comps = self._component_queries()
+        results: dict[str, list[dict]] = {cq: [] for cq in comps}
+
+        for cq, sql in comps.items():
+            results[cq] = self._fetchall_dicts(connection, sql, (schemas,))
+
+        return IntrospectionModelBuilder.build_schemas_from_components(
+            schemas=schemas,
+            rels=results.get("relations", []),
+            cols=results.get("columns", []),
+            pk_cols=results.get("pk", []),
+            uq_cols=results.get("uq", []),
+            checks=results.get("checks", []),
+            fk_cols=results.get("fks", []),
+            idx_cols=results.get("idx", []),
+        )
+
+    def _component_queries(self) -> dict[str, str]:
+        return {
+            "relations": self._sql_relations(),
+            "columns": self._sql_columns(),
+            "pk": self._sql_primary_keys(),
+            "uq": self._sql_unique(),
+            "checks": self._sql_checks(),
+            "fks": self._sql_foreign_keys(),
+            "idx": self._sql_indexes(),
+        }
+
+    def _sql_relations(self) -> str:
+        return r"""
+            SELECT
+                table_schema AS schema_name,
+                table_name,
+                CASE table_type
+                    WHEN 'BASE TABLE' THEN 'table'
+                    WHEN 'VIEW' THEN 'view'
+                    WHEN 'MATERIALIZED VIEW' THEN 'materialized_view'
+                    ELSE lower(table_type)
+                END AS kind,
+                NULL::VARCHAR AS description
+            FROM
+                information_schema.tables
+            WHERE
+                table_schema = ANY(?)
+            ORDER BY
+                table_name;
+        """

+    def _sql_columns(self) -> str:
+        return r"""
+            SELECT
+                c.table_schema AS schema_name,
+                c.table_name,
+                c.column_name,
+                c.ordinal_position AS ordinal_position,
+                c.data_type AS data_type,
+                CASE
+                    WHEN c.is_nullable = 'YES' THEN TRUE
+                    ELSE FALSE
+                END AS is_nullable,
+                c.column_default AS default_expression,
+                NULL::VARCHAR AS generated,
+                NULL::VARCHAR AS description
+            FROM
+                information_schema.columns c
+            WHERE
+                c.table_schema = ANY(?)
+            ORDER BY
+                c.table_schema,
+                c.table_name,
+                c.ordinal_position;
+        """
+
+    def _sql_primary_keys(self) -> str:
+        return r"""
+            WITH d AS (
+                SELECT
+                    *
+                FROM
+                    duckdb_constraints()
+                WHERE
+                    schema_name = ANY(?)
+                    AND constraint_type = 'PRIMARY KEY'
+            ),
+            cols AS (
+                SELECT
+                    d.schema_name,
+                    d.table_name,
+                    d.constraint_name,
+                    r.pos AS position,
+                    d.constraint_column_names[r.pos] AS column_name
+                FROM
+                    d,
+                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
+            )
+            SELECT
+                schema_name,
+                table_name,
+                constraint_name,
+                position,
+                column_name
+            FROM
+                cols
+            ORDER BY
+                schema_name,
+                table_name,
+                constraint_name,
+                position;
+        """
+
+    def _sql_unique(self) -> str:
+        return r"""
+            WITH d AS (
+                SELECT
+                    *
+                FROM
+                    duckdb_constraints()
+                WHERE
+                    schema_name = ANY(?)
+                    AND constraint_type = 'UNIQUE'
+            ),
+            cols AS (
+                SELECT
+                    d.schema_name,
+                    d.table_name,
+                    d.constraint_name,
+                    r.pos AS position,
+                    d.constraint_column_names[r.pos] AS column_name
+                FROM
+                    d,
+                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
+            )
+            SELECT
+                schema_name,
+                table_name,
+                constraint_name,
+                position,
+                column_name
+            FROM
+                cols
+            ORDER BY
+                schema_name,
+                table_name,
+                constraint_name,
+                position;
+        """
+
+    def _sql_checks(self) -> str:
+        return r"""
+            SELECT
+                d.schema_name,
+                d.table_name,
+                d.constraint_name,
+                d.expression AS expression,
+                TRUE AS validated
+            FROM
+                duckdb_constraints() AS d
+            WHERE
+                d.schema_name = ANY(?)
+                AND d.constraint_type = 'CHECK'
+            ORDER BY
+                d.schema_name,
+                d.table_name,
+                d.constraint_name;
+        """
+
+    def _sql_foreign_keys(self) -> str:
+        return r"""
+            WITH d AS (
+                SELECT
+                    *
+                FROM
+                    duckdb_constraints()
+                WHERE
+                    schema_name = ANY(?)
+                    AND constraint_type = 'FOREIGN KEY'
+            ),
+            cols AS (
+                SELECT
+                    d.schema_name,
+                    d.table_name,
+                    d.constraint_name,
+                    r.pos AS position,
+                    d.constraint_column_names[r.pos] AS from_column,
+                    d.referenced_column_names[r.pos] AS to_column
+                FROM
+                    d,
+                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
+            ),
+            ref AS (
+                SELECT
+                    rc.constraint_schema AS schema_name,
+                    rc.constraint_name,
+                    tc.table_schema AS ref_schema,
+                    tc.table_name AS ref_table
+                FROM
+                    information_schema.referential_constraints rc
+                JOIN information_schema.table_constraints tc ON
+                    tc.constraint_schema = rc.unique_constraint_schema
+                    AND tc.constraint_name = rc.unique_constraint_name
+            ),
+            rules AS (
+                SELECT
+                    constraint_schema AS schema_name,
+                    constraint_name,
+                    lower(update_rule) AS on_update,
+                    lower(delete_rule) AS on_delete
+                FROM
+                    information_schema.referential_constraints
+            )
+            SELECT
+                c.schema_name,
+                c.table_name,
+                c.constraint_name,
+                c.position,
+                c.from_column,
+                r.ref_schema,
+                r.ref_table,
+                c.to_column,
+                coalesce(u.on_update, 'no action') AS on_update,
+                coalesce(u.on_delete, 'no action') AS on_delete,
+                TRUE AS enforced,
+                TRUE AS validated
+            FROM
+                cols c JOIN ref r ON r.schema_name = c.schema_name AND r.constraint_name = c.constraint_name
+                LEFT JOIN rules u ON u.schema_name = c.schema_name AND u.constraint_name = c.constraint_name
+            ORDER BY
+                c.schema_name,
+                c.table_name,
+                c.constraint_name,
+                c.position;
+        """
+
+    def _sql_indexes(self) -> str:
+        return r"""
+            WITH idx AS (
+                SELECT
+                    schema_name,
+                    table_name,
+                    index_name,
+                    is_unique,
+                    string_split(trim(BOTH '[]' FROM expressions), ',') AS expr_list
+                FROM
+                    duckdb_indexes()
+                WHERE
+                    schema_name = ANY(?)
+            )
+            SELECT
+                schema_name,
+                table_name,
+                index_name,
+                pos AS position,
+                trim(expr_list[pos]) AS expr,
+                is_unique
+            FROM
+                idx,
+                range(1, length(expr_list) + 1) AS r(pos)
+            ORDER BY
+                schema_name,
+                table_name,
+                index_name,
+                position;
+        """
+
+    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
+        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT ?'
+        return SQLQuery(sql, (limit,))
+
+    def _quote_literal(self, value: str) -> str:
+        return "'" + str(value).replace("'", "''") + "'"
+
+    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
+        cur = connection.cursor()
+        cur.execute(sql, params or [])
+        columns = [desc[0].lower() for desc in cur.description] if cur.description else []
+        rows = cur.fetchall()
+        return [dict(zip(columns, row)) for row in rows]
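The DuckDB introspector above (duckdb_introspector.py) binds schema lists straight into its queries with "= ANY(?)" and normalises every result set to lower-cased dicts in _fetchall_dicts. A minimal sketch of that pattern against an in-memory database; the table is made up, and the real plugin opens the file path from DuckDBConnectionConfig.database instead of ":memory:":

import duckdb

con = duckdb.connect(database=":memory:")
con.execute("CREATE TABLE events (id INTEGER PRIMARY KEY, payload VARCHAR)")

# Same cursor/description/fetchall pattern as DuckDBIntrospector._fetchall_dicts,
# reusing the introspector's "= ANY(?)" list binding for the schema filter.
cur = con.cursor()
cur.execute(
    "SELECT table_name, column_name, data_type "
    "FROM information_schema.columns WHERE table_schema = ANY(?) "
    "ORDER BY ordinal_position",
    (["main"],),
)
columns = [desc[0].lower() for desc in cur.description] if cur.description else []
print([dict(zip(columns, row)) for row in cur.fetchall()])
# [{'table_name': 'events', 'column_name': 'id', ...},
#  {'table_name': 'events', 'column_name': 'payload', ...}]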