databao-context-engine 0.1.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0

databao_context_engine/plugins/databases/clickhouse_introspector.py
@@ -0,0 +1,162 @@
from __future__ import annotations

from typing import Any, Mapping

import clickhouse_connect
from pydantic import Field

from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder


class ClickhouseConfigFile(BaseDatabaseConfigFile):
    type: str = Field(default="databases/clickhouse")
    connection: dict[str, Any] = Field(
        description="Connection parameters for the Clickhouse database. It can contain any of the keys supported by the Clickhouse connection library (see https://clickhouse.com/docs/integrations/language-clients/python/driver-api#connection-arguments)"
    )


class ClickhouseIntrospector(BaseIntrospector[ClickhouseConfigFile]):
    _IGNORED_SCHEMAS = {"information_schema", "system", "INFORMATION_SCHEMA"}

    supports_catalogs = True

    def _connect(self, file_config: ClickhouseConfigFile):
        connection = file_config.connection
        if not isinstance(connection, Mapping):
            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")

        return clickhouse_connect.get_client(**connection)

    def _connect_to_catalog(self, file_config: ClickhouseConfigFile, catalog: str):
        return self._connect(file_config)

    def _get_catalogs(self, connection, file_config: ClickhouseConfigFile) -> list[str]:
        return ["clickhouse"]

    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
        return SQLQuery(
            """
            SELECT
                name AS schema_name,
                'clickhouse' AS catalog_name
            FROM
                system.databases
            """,
            None,
        )

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        if not schemas:
            return []

        schemas_sql = ", ".join(self._quote_literal(s) for s in schemas)

        comps = self._component_queries()
        results: dict[str, list[dict]] = {cq: [] for cq in comps}
        for cq, template_sql in comps.items():
            sql = template_sql.replace("{SCHEMAS}", schemas_sql)
            results[cq] = self._fetchall_dicts(connection, sql, None)

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=schemas,
            rels=results.get("relations", []),
            cols=results.get("columns", []),
            pk_cols=[],
            uq_cols=[],
            checks=[],
            fk_cols=[],
            idx_cols=results.get("idx", []),
        )

    def _component_queries(self) -> dict[str, str]:
        return {"relations": self._sql_relations(), "columns": self._sql_columns(), "idx": self._sql_indexes()}

    def _sql_relations(self) -> str:
        return r"""
            SELECT
                t.database AS schema_name,
                t.name AS table_name,
                multiIf(
                    t.engine = 'View', 'view',
                    t.engine = 'MaterializedView', 'materialized_view',
                    t.engine IN (
                        'File', 'URL',
                        'S3', 'S3Queue',
                        'AzureBlobStorage', 'AzureQueue',
                        'HDFS',
                        'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
                        'JDBC', 'ODBC', 'SQLite',
                        'Kafka', 'RabbitMQ', 'NATS',
                        'ExternalDistributed',
                        'DeltaLake', 'Iceberg', 'Hudi', 'Hive',
                        'MaterializedPostgreSQL',
                        'YTsaurus', 'ArrowFlight'
                    ), 'external_table',
                    'table'
                ) AS kind,
                t.comment AS description
            FROM
                system.tables t
            WHERE
                t.database IN ({SCHEMAS})
            ORDER BY
                t.name
        """

    def _sql_columns(self) -> str:
        return r"""
            SELECT
                c.database AS schema_name,
                c.table AS table_name,
                c.name AS column_name,
                c.position AS ordinal_position,
                c.type AS data_type,
                (c.type LIKE 'Nullable(%') AS is_nullable,
                c.default_expression AS default_expression,
                CASE
                    WHEN c.default_kind IN ('MATERIALIZED','ALIAS') THEN 'computed'
                END AS generated,
                c.comment AS description
            FROM
                system.columns c
            WHERE
                c.database IN ({SCHEMAS})
            ORDER BY
                c.table,
                c.position
        """

    def _sql_indexes(self) -> str:
        return r"""
            SELECT
                i.database AS schema_name,
                i.table AS table_name,
                i.name AS index_name,
                1 AS position,
                i.expr AS expr,
                0 AS is_unique,
                i.type AS method,
                NULL AS predicate
            FROM
                system.data_skipping_indices i
            WHERE
                i.database IN ({SCHEMAS})
            ORDER BY
                i.table,
                i.name
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT %s'
        return SQLQuery(sql, (limit,))

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        res = connection.query(sql, parameters=params) if params else connection.query(sql)
        cols = [c.lower() for c in res.column_names]
        return [dict(zip(cols, row)) for row in res.result_rows]

    def _quote_literal(self, value: str) -> str:
        return "'" + str(value).replace("'", "\\'") + "'"
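
For orientation, the `connection` mapping above is passed straight through to clickhouse_connect. A minimal usage sketch, not part of the package, assuming BaseDatabaseConfigFile requires no further fields; the host, port and username values are placeholders:

# Hypothetical sketch: build a config and open a client the same way
# ClickhouseIntrospector._connect() does (it unpacks the mapping into get_client()).
config = ClickhouseConfigFile(
    connection={"host": "localhost", "port": 8123, "username": "default"},
)
client = clickhouse_connect.get_client(**config.connection)
print(client.query("SELECT name FROM system.databases").result_rows)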

databao_context_engine/plugins/databases/database_chunker.py
@@ -0,0 +1,69 @@
from dataclasses import dataclass

from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
from databao_context_engine.plugins.databases.databases_types import (
    DatabaseColumn,
    DatabaseIntrospectionResult,
    DatabaseTable,
)


@dataclass
class DatabaseTableChunkContent:
    catalog_name: str
    schema_name: str
    table: DatabaseTable


@dataclass
class DatabaseColumnChunkContent:
    catalog_name: str
    schema_name: str
    table_name: str
    column: DatabaseColumn


def build_database_chunks(result: DatabaseIntrospectionResult) -> list[EmbeddableChunk]:
    chunks = []
    for catalog in result.catalogs:
        for schema in catalog.schemas:
            for table in schema.tables:
                chunks.append(_create_table_chunk(catalog.name, schema.name, table))

                for column in table.columns:
                    chunks.append(_create_column_chunk(catalog.name, schema.name, table.name, column))

    return chunks


def _create_table_chunk(catalog_name: str, schema_name: str, table: DatabaseTable) -> EmbeddableChunk:
    return EmbeddableChunk(
        embeddable_text=_build_table_chunk_text(table),
        content=DatabaseTableChunkContent(
            catalog_name=catalog_name,
            schema_name=schema_name,
            table=table,
        ),
    )


def _create_column_chunk(
    catalog_name: str, schema_name: str, table_name: str, column: DatabaseColumn
) -> EmbeddableChunk:
    return EmbeddableChunk(
        embeddable_text=_build_column_chunk_text(table_name, column),
        content=DatabaseColumnChunkContent(
            catalog_name=catalog_name,
            schema_name=schema_name,
            table_name=table_name,
            column=column,
        ),
    )


def _build_table_chunk_text(database_table: DatabaseTable) -> str:
    return f"Table {database_table.name} with columns {','.join([column.name for column in database_table.columns])}"


def _build_column_chunk_text(table_name: str, database_object: DatabaseColumn) -> str:
    return f"Column {database_object.name} in table {table_name}"
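
To illustrate the chunk texts this module produces, a minimal sketch using the dataclasses from databases_types (shown in the next hunk); it assumes EmbeddableChunk exposes its constructor arguments as attributes:

# Sketch: one table with one column yields a table chunk followed by a column chunk.
from databao_context_engine.plugins.databases.databases_types import (
    DatabaseCatalog, DatabaseColumn, DatabaseIntrospectionResult, DatabaseSchema, DatabaseTable,
)

table = DatabaseTable(
    name="orders",
    columns=[DatabaseColumn(name="id", type="INTEGER", nullable=False)],
    samples=[],
)
result = DatabaseIntrospectionResult(
    catalogs=[DatabaseCatalog(name="main", schemas=[DatabaseSchema(name="public", tables=[table])])],
)
chunks = build_database_chunks(result)
# chunks[0].embeddable_text -> "Table orders with columns id"
# chunks[1].embeddable_text -> "Column id in table orders"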

databao_context_engine/plugins/databases/databases_types.py
@@ -0,0 +1,114 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Literal


class DatasetKind(str, Enum):
    TABLE = "table"
    VIEW = "view"
    MATERIALIZED_VIEW = "materialized_view"
    EXTERNAL_TABLE = "external_table"

    @classmethod
    def from_raw(cls, raw: str | None) -> "DatasetKind":
        default = cls.TABLE
        if raw is None:
            return default
        if isinstance(raw, str):
            raw = raw.lower()
        try:
            return cls(raw)
        except Exception:
            return default


@dataclass
class CheckConstraint:
    name: str | None
    expression: str
    validated: bool | None


@dataclass
class KeyConstraint:
    name: str | None
    columns: list[str]
    validated: bool | None


@dataclass
class ForeignKeyColumnMap:
    from_column: str
    to_column: str


@dataclass
class ForeignKey:
    name: str | None
    mapping: list[ForeignKeyColumnMap]
    referenced_table: str
    enforced: bool | None = None
    validated: bool | None = None
    on_update: str | None = None
    on_delete: str | None = None
    cardinality_inferred: Literal["one_to_one", "many_to_one"] | None = None


@dataclass
class Index:
    name: str
    columns: list[str]
    unique: bool = False
    method: str | None = None
    predicate: str | None = None


@dataclass
class DatabaseColumn:
    name: str
    type: str
    nullable: bool
    description: str | None = None
    default_expression: str | None = None
    generated: Literal["identity", "computed"] | None = None
    checks: list[CheckConstraint] = field(default_factory=list)


@dataclass
class DatabasePartitionInfo:
    meta: dict[str, Any]
    partition_tables: list[str]


@dataclass
class DatabaseTable:
    name: str
    columns: list[DatabaseColumn]
    samples: list[dict[str, Any]]
    partition_info: DatabasePartitionInfo | None = None
    description: str | None = None
    kind: DatasetKind = DatasetKind.TABLE
    primary_key: KeyConstraint | None = None
    unique_constraints: list[KeyConstraint] = field(default_factory=list)
    checks: list[CheckConstraint] = field(default_factory=list)
    indexes: list[Index] = field(default_factory=list)
    foreign_keys: list[ForeignKey] = field(default_factory=list)


@dataclass
class DatabaseSchema:
    name: str
    tables: list[DatabaseTable]
    description: str | None = None


@dataclass
class DatabaseCatalog:
    name: str
    schemas: list[DatabaseSchema]
    description: str | None = None


@dataclass
class DatabaseIntrospectionResult:
    catalogs: list[DatabaseCatalog]
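
One behaviour worth noting: DatasetKind.from_raw lower-cases its input and silently falls back to TABLE for missing or unrecognised kinds, as this short sketch shows:

DatasetKind.from_raw("VIEW")               # DatasetKind.VIEW (lower-cased before lookup)
DatasetKind.from_raw("materialized_view")  # DatasetKind.MATERIALIZED_VIEW
DatasetKind.from_raw(None)                 # DatasetKind.TABLE (default)
DatasetKind.from_raw("sequence")           # DatasetKind.TABLE (unknown kind falls back)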

databao_context_engine/plugins/databases/duckdb_introspector.py
@@ -0,0 +1,325 @@
from __future__ import annotations

import duckdb
from pydantic import BaseModel, Field

from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder


class DuckDBConfigFile(BaseDatabaseConfigFile):
    type: str = Field(default="databases/duckdb")
    connection: DuckDBConnectionConfig


class DuckDBConnectionConfig(BaseModel):
    database: str = Field(description="Path to the DuckDB database file")


class DuckDBIntrospector(BaseIntrospector[DuckDBConfigFile]):
    _IGNORED_CATALOGS = {"system", "temp"}
    _IGNORED_SCHEMAS = {"information_schema", "pg_catalog"}
    supports_catalogs = True

    def _connect(self, file_config: DuckDBConfigFile):
        database_path = str(file_config.connection.database)
        return duckdb.connect(database=database_path)

    def _connect_to_catalog(self, file_config: DuckDBConfigFile, catalog: str):
        return self._connect(file_config)

    def _get_catalogs(self, connection, file_config: DuckDBConfigFile) -> list[str]:
        rows = self._fetchall_dicts(connection, "SELECT database_name FROM duckdb_databases();", None)
        catalogs = [r["database_name"] for r in rows if r.get("database_name")]
        catalogs_filtered = [c for c in catalogs if c.lower() not in self._IGNORED_CATALOGS]
        return catalogs_filtered or [self._resolve_pseudo_catalog_name(file_config)]

    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
        if not catalogs:
            return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
        sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY(?)"
        return SQLQuery(sql, (catalogs,))

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        if not schemas:
            return []

        comps = self._component_queries()
        results: dict[str, list[dict]] = {cq: [] for cq in comps}

        for cq, sql in comps.items():
            results[cq] = self._fetchall_dicts(connection, sql, (schemas,))

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=schemas,
            rels=results.get("relations", []),
            cols=results.get("columns", []),
            pk_cols=results.get("pk", []),
            uq_cols=results.get("uq", []),
            checks=results.get("checks", []),
            fk_cols=results.get("fks", []),
            idx_cols=results.get("idx", []),
        )

    def _component_queries(self) -> dict[str, str]:
        return {
            "relations": self._sql_relations(),
            "columns": self._sql_columns(),
            "pk": self._sql_primary_keys(),
            "uq": self._sql_unique(),
            "checks": self._sql_checks(),
            "fks": self._sql_foreign_keys(),
            "idx": self._sql_indexes(),
        }

    def _sql_relations(self) -> str:
        return r"""
            SELECT
                table_schema AS schema_name,
                table_name,
                CASE table_type
                    WHEN 'BASE TABLE' THEN 'table'
                    WHEN 'VIEW' THEN 'view'
                    WHEN 'MATERIALIZED VIEW' THEN 'materialized_view'
                    ELSE lower(table_type)
                END AS kind,
                NULL::VARCHAR AS description
            FROM
                information_schema.tables
            WHERE
                table_schema = ANY(?)
            ORDER BY
                table_name;
        """

    def _sql_columns(self) -> str:
        return r"""
            SELECT
                c.table_schema AS schema_name,
                c.table_name,
                c.column_name,
                c.ordinal_position AS ordinal_position,
                c.data_type AS data_type,
                CASE
                    WHEN c.is_nullable = 'YES' THEN TRUE
                    ELSE FALSE
                END AS is_nullable,
                c.column_default AS default_expression,
                NULL::VARCHAR AS generated,
                NULL::VARCHAR AS description
            FROM
                information_schema.columns c
            WHERE
                c.table_schema = ANY(?)
            ORDER BY
                c.table_schema,
                c.table_name,
                c.ordinal_position;
        """

    def _sql_primary_keys(self) -> str:
        return r"""
            WITH d AS (
                SELECT
                    *
                FROM
                    duckdb_constraints()
                WHERE
                    schema_name = ANY(?)
                    AND constraint_type = 'PRIMARY KEY'
            ),
            cols AS (
                SELECT
                    d.schema_name,
                    d.table_name,
                    d.constraint_name,
                    r.pos AS position,
                    d.constraint_column_names[r.pos] AS column_name
                FROM
                    d,
                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
            )
            SELECT
                schema_name,
                table_name,
                constraint_name,
                position,
                column_name
            FROM
                cols
            ORDER BY
                schema_name,
                table_name,
                constraint_name,
                position;
        """

    def _sql_unique(self) -> str:
        return r"""
            WITH d AS (
                SELECT
                    *
                FROM
                    duckdb_constraints()
                WHERE
                    schema_name = ANY(?)
                    AND constraint_type = 'UNIQUE'
            ),
            cols AS (
                SELECT
                    d.schema_name,
                    d.table_name,
                    d.constraint_name,
                    r.pos AS position,
                    d.constraint_column_names[r.pos] AS column_name
                FROM
                    d,
                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
            )
            SELECT
                schema_name,
                table_name,
                constraint_name,
                position,
                column_name
            FROM
                cols
            ORDER BY
                schema_name,
                table_name,
                constraint_name,
                position;
        """

    def _sql_checks(self) -> str:
        return r"""
            SELECT
                d.schema_name,
                d.table_name,
                d.constraint_name,
                d.expression AS expression,
                TRUE AS validated
            FROM
                duckdb_constraints() AS d
            WHERE
                d.schema_name = ANY(?)
                AND d.constraint_type = 'CHECK'
            ORDER BY
                d.schema_name,
                d.table_name,
                d.constraint_name;
        """

    def _sql_foreign_keys(self) -> str:
        return r"""
            WITH d AS (
                SELECT
                    *
                FROM
                    duckdb_constraints()
                WHERE
                    schema_name = ANY(?)
                    AND constraint_type = 'FOREIGN KEY'
            ),
            cols AS (
                SELECT
                    d.schema_name,
                    d.table_name,
                    d.constraint_name,
                    r.pos AS position,
                    d.constraint_column_names[r.pos] AS from_column,
                    d.referenced_column_names[r.pos] AS to_column
                FROM
                    d,
                    range(1, length(d.constraint_column_names) + 1) AS r(pos)
            ),
            ref AS (
                SELECT
                    rc.constraint_schema AS schema_name,
                    rc.constraint_name,
                    tc.table_schema AS ref_schema,
                    tc.table_name AS ref_table
                FROM
                    information_schema.referential_constraints rc
                    JOIN information_schema.table_constraints tc ON
                        tc.constraint_schema = rc.unique_constraint_schema
                        AND tc.constraint_name = rc.unique_constraint_name
            ),
            rules AS (
                SELECT
                    constraint_schema AS schema_name,
                    constraint_name,
                    lower(update_rule) AS on_update,
                    lower(delete_rule) AS on_delete
                FROM
                    information_schema.referential_constraints
            )
            SELECT
                c.schema_name,
                c.table_name,
                c.constraint_name,
                c.position,
                c.from_column,
                r.ref_schema,
                r.ref_table,
                c.to_column,
                coalesce(u.on_update, 'no action') AS on_update,
                coalesce(u.on_delete, 'no action') AS on_delete,
                TRUE AS enforced,
                TRUE AS validated
            FROM
                cols c JOIN ref r ON r.schema_name = c.schema_name AND r.constraint_name = c.constraint_name
                LEFT JOIN rules u ON u.schema_name = c.schema_name AND u.constraint_name = c.constraint_name
            ORDER BY
                c.schema_name,
                c.table_name,
                c.constraint_name,
                c.position;
        """

    def _sql_indexes(self) -> str:
        return r"""
            WITH idx AS (
                SELECT
                    schema_name,
                    table_name,
                    index_name,
                    is_unique,
                    string_split(trim(BOTH '[]' FROM expressions), ',') AS expr_list
                FROM
                    duckdb_indexes()
                WHERE
                    schema_name = ANY(?)
            )
            SELECT
                schema_name,
                table_name,
                index_name,
                pos AS position,
                trim(expr_list[pos]) AS expr,
                is_unique
            FROM
                idx,
                range(1, length(expr_list) + 1) AS r(pos)
            ORDER BY
                schema_name,
                table_name,
                index_name,
                position;
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT ?'
        return SQLQuery(sql, (limit,))

    def _quote_literal(self, value: str) -> str:
        return "'" + str(value).replace("'", "''") + "'"

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        cur = connection.cursor()
        cur.execute(sql, params or [])
        columns = [desc[0].lower() for desc in cur.description] if cur.description else []
        rows = cur.fetchall()
        return [dict(zip(columns, row)) for row in rows]
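
For orientation, a minimal sketch of wiring up this config, not part of the package; it assumes BaseDatabaseConfigFile requires no further fields and uses a placeholder file path:

# Hypothetical sketch: connect the same way DuckDBIntrospector._connect() does and
# list catalogs via the duckdb_databases() table function queried in _get_catalogs().
config = DuckDBConfigFile(connection=DuckDBConnectionConfig(database="warehouse.duckdb"))
con = duckdb.connect(database=str(config.connection.database))
print(con.execute("SELECT database_name FROM duckdb_databases();").fetchall())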