databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
databao_context_engine/plugins/databases/snowflake_introspector.py
@@ -0,0 +1,287 @@
from __future__ import annotations

from typing import Any, Dict, List, Mapping

import snowflake.connector
from pydantic import Field
from snowflake.connector import DictCursor

from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder


class SnowflakeConfigFile(BaseDatabaseConfigFile):
    type: str = Field(default="databases/snowflake")
    connection: dict[str, Any] = Field(
        description="Connection parameters for Snowflake. It can contain any of the keys supported by the Snowflake connection library"
    )


class SnowflakeIntrospector(BaseIntrospector[SnowflakeConfigFile]):
    _IGNORED_SCHEMAS = {
        "information_schema",
    }
    _IGNORED_CATALOGS = {"STREAMLIT_APPS"}
    supports_catalogs = True

    def _connect(self, file_config: SnowflakeConfigFile):
        connection = file_config.connection
        if not isinstance(connection, Mapping):
            raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
        snowflake.connector.paramstyle = "qmark"
        return snowflake.connector.connect(
            **connection,
        )

    def _connect_to_catalog(self, file_config: SnowflakeConfigFile, catalog: str):
        cfg = dict(file_config.connection or {})
        cfg["database"] = catalog
        return snowflake.connector.connect(**cfg)

    def _get_catalogs(self, connection, file_config: SnowflakeConfigFile) -> list[str]:
        database = file_config.connection.get("database")
        if database:
            return [database]

        rows = self._fetchall_dicts(connection, "SHOW DATABASES", None)
        return [r["name"] for r in rows if r["name"] and r["name"].upper() not in self._IGNORED_CATALOGS]

    def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
        if not catalogs:
            return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
        parts = []
        for catalog in catalogs:
            parts.append(f"SELECT schema_name, catalog_name FROM {catalog}.information_schema.schemata")
        return SQLQuery(" UNION ALL ".join(parts), None)

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        if not schemas:
            return []

        comps = self._component_queries(catalog, schemas)

        statements = [c["sql"].rstrip().rstrip(";") for c in comps]
        batch_sql = ";\n".join(statements)

        results: dict[str, list[dict]] = {
            "relations": [],
            "columns": [],
            "pk": [],
            "fks": [],
            "uq": [],
        }

        with connection.cursor(DictCursor) as cur:
            cur.execute(batch_sql, num_statements=len(statements))

            for ix, comp in enumerate(comps, start=1):
                name = comp["name"]

                rows = self._lower_keys(cur.fetchall()) if cur.description else []

                if name:
                    results[name].extend(rows)

                if ix < len(comps):
                    ok = cur.nextset()
                    if not ok:
                        raise RuntimeError(
                            f"Snowflake multi-statement batch ended early after component #{ix} '{name}'"
                        )

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=schemas,
            rels=results["relations"],
            cols=results["columns"],
            pk_cols=results["pk"],
            uq_cols=results["uq"],
            checks=[],
            fk_cols=results["fks"],
            idx_cols=[],
        )

    def _component_queries(self, catalog: str, schemas: list[str]) -> list[dict]:
        schemas_in = ", ".join(self._quote_literal(s) for s in schemas)

        return [
            {"name": "relations", "sql": self._sql_relations(catalog, schemas_in)},
            {"name": "columns", "sql": self._sql_columns(catalog, schemas_in)},
            {"name": None, "sql": self._sql_pk_show(catalog)},
            {"name": "pk", "sql": self._sql_pk_select(schemas_in)},
            {"name": None, "sql": self._sql_fk_show(catalog)},
            {"name": "fks", "sql": self._sql_fk_select(schemas_in)},
            {"name": None, "sql": self._sql_uq_show(catalog)},
            {"name": "uq", "sql": self._sql_uq_select(schemas_in)},
        ]

    def _sql_relations(self, catalog: str, schemas_in: str) -> str:
        isq = self._qual_is(catalog)

        return f"""
            SELECT
                t.TABLE_SCHEMA AS "schema_name",
                t.TABLE_NAME AS "table_name",
                CASE t.TABLE_TYPE
                    WHEN 'BASE TABLE' THEN 'table'
                    WHEN 'VIEW' THEN 'view'
                    WHEN 'MATERIALIZED VIEW' THEN 'materialized_view'
                    WHEN 'EXTERNAL TABLE' THEN 'external_table'
                    ELSE LOWER(t.TABLE_TYPE)
                END AS "kind",
                t.COMMENT AS "description"
            FROM
                {isq}.TABLES AS t
            WHERE
                t.TABLE_SCHEMA IN ({schemas_in})
            ORDER BY
                t.TABLE_SCHEMA,
                t.TABLE_NAME
        """

    def _sql_columns(self, catalog: str, schemas_in: str) -> str:
        isq = self._qual_is(catalog)

        return f"""
            SELECT
                c.TABLE_SCHEMA AS "schema_name",
                c.TABLE_NAME AS "table_name",
                c.COLUMN_NAME AS "column_name",
                c.ORDINAL_POSITION AS "ordinal_position",
                c.DATA_TYPE AS "data_type",
                IFF(c.IS_NULLABLE = 'YES', TRUE, FALSE) AS "is_nullable",
                c.COLUMN_DEFAULT AS "default_expression",
                IFF(c.IS_IDENTITY = 'YES', 'identity', NULL) AS "generated",
                c.COMMENT AS "description"
            FROM
                {isq}.COLUMNS AS c
            WHERE
                c.TABLE_SCHEMA IN ({schemas_in})
            ORDER BY
                c.TABLE_SCHEMA,
                c.TABLE_NAME,
                c.ORDINAL_POSITION
        """

    def _sql_pk_show(self, catalog: str) -> str:
        return f"""
            SHOW PRIMARY KEYS IN DATABASE {self._quote_ident(catalog)}
        """

    def _sql_pk_select(self, schemas_in: str) -> str:
        return f"""
            SELECT
                "schema_name" AS schema_name,
                "table_name" AS table_name,
                "constraint_name" AS constraint_name,
                "column_name" AS column_name,
                "key_sequence"::INT AS position
            FROM
                TABLE(RESULT_SCAN(LAST_QUERY_ID()))
            WHERE
                "schema_name" IN ({schemas_in})
            ORDER BY
                table_name,
                constraint_name,
                position
        """

    def _sql_fk_show(self, catalog: str) -> str:
        return f"""
            SHOW IMPORTED KEYS IN DATABASE {self._quote_ident(catalog)}
        """

    def _sql_fk_select(self, schemas_in: str) -> str:
        return f"""
            SELECT
                "fk_schema_name" AS "schema_name",
                "fk_table_name" AS "table_name",
                "fk_name" AS "constraint_name",
                "key_sequence"::INT AS "position",
                "fk_column_name" AS "from_column",
                "pk_schema_name" AS "ref_schema",
                "pk_table_name" AS "ref_table",
                "pk_column_name" AS "to_column",
                LOWER("update_rule") AS "on_update",
                LOWER("delete_rule") AS "on_delete",
                NULL::BOOLEAN AS "enforced",
                NULL::BOOLEAN AS "validated"
            FROM
                TABLE(RESULT_SCAN(LAST_QUERY_ID()))
            WHERE
                "fk_schema_name" IN ({schemas_in})
            ORDER BY
                "schema_name",
                "table_name",
                "constraint_name",
                "position"
        """

    def _sql_uq_show(self, catalog: str) -> str:
        return f"""
            SHOW UNIQUE KEYS IN DATABASE {self._quote_ident(catalog)}
        """

    def _sql_uq_select(self, schemas_in: str) -> str:
        return f"""
            SELECT
                "schema_name" AS "schema_name",
                "table_name" AS "table_name",
                "constraint_name" AS "constraint_name",
                "column_name" AS "column_name",
                "key_sequence"::INT AS "position"
            FROM
                TABLE(RESULT_SCAN(LAST_QUERY_ID()))
            WHERE
                "schema_name" IN ({schemas_in})
            ORDER BY
                "schema_name",
                "table_name",
                "constraint_name",
                "position"
        """

    def _sql_checks(self, catalog: str, schemas_in: str) -> str:
        isq = self._qual_is(catalog)

        return f"""
            SELECT
                tc.TABLE_SCHEMA AS "schema_name",
                tc.TABLE_NAME AS "table_name",
                tc.CONSTRAINT_NAME AS "constraint_name",
                NULL::VARCHAR AS "expression",
                TRUE AS "validated"
            FROM
                {isq}.TABLE_CONSTRAINTS AS tc
            WHERE
                tc.TABLE_SCHEMA IN ({schemas_in})
                AND tc.CONSTRAINT_TYPE = 'CHECK'
            ORDER BY
                tc.TABLE_SCHEMA,
                tc.TABLE_NAME,
                tc.CONSTRAINT_NAME
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        sql = f'SELECT * FROM "{schema}"."{table}" LIMIT ?'
        return SQLQuery(sql, (limit,))

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        with connection.cursor(snowflake.connector.DictCursor) as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
            return [{k.lower(): v for k, v in row.items()} for row in rows]

    def _quote_literal(self, value: str) -> str:
        return "'" + str(value).replace("'", "''") + "'"

    def _quote_ident(self, ident: str) -> str:
        return '"' + ident.replace('"', '""') + '"'

    def _qual_is(self, catalog: str) -> str:
        return f"{self._quote_ident(catalog)}.INFORMATION_SCHEMA"

    @staticmethod
    def _lower_keys(rows: List[Dict]) -> List[Dict]:
        return [{k.lower(): v for k, v in row.items()} for row in rows]
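As a rough usage sketch of the config model and introspector above (hypothetical, not part of the package: it assumes BaseDatabaseConfigFile adds no further required fields and that real Snowflake credentials are supplied), the pieces compose like this:

    from databao_context_engine.plugins.databases.snowflake_introspector import (
        SnowflakeConfigFile,
        SnowflakeIntrospector,
    )

    # The connection mapping is forwarded verbatim to snowflake.connector.connect(),
    # so any parameter that library accepts may appear here (values are placeholders).
    config = SnowflakeConfigFile(
        connection={
            "account": "my_account",
            "user": "my_user",
            "password": "***",
            "database": "ANALYTICS",  # when set, _get_catalogs() skips SHOW DATABASES
        }
    )

    introspector = SnowflakeIntrospector()
    conn = introspector._connect(config)
    try:
        catalogs = introspector._get_catalogs(conn, config)          # -> ["ANALYTICS"]
        model = introspector.collect_catalog_model(conn, catalogs[0], ["PUBLIC"])
    finally:
        conn.close()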
databao_context_engine/plugins/duckdb_db_plugin.py
@@ -0,0 +1,12 @@
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
from databao_context_engine.plugins.databases.duckdb_introspector import DuckDBConfigFile, DuckDBIntrospector


class DuckDbPlugin(BaseDatabasePlugin[DuckDBConfigFile]):
    id = "jetbrains/duckdb"
    name = "DuckDB Plugin"
    supported = {"databases/duckdb"}
    config_file_type = DuckDBConfigFile

    def __init__(self):
        super().__init__(DuckDBIntrospector())
databao_context_engine/plugins/mssql_db_plugin.py
@@ -0,0 +1,12 @@
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
from databao_context_engine.plugins.databases.mssql_introspector import MSSQLConfigFile, MSSQLIntrospector


class MSSQLDbPlugin(BaseDatabasePlugin[MSSQLConfigFile]):
    id = "jetbrains/mssql"
    name = "MSSQL DB Plugin"
    supported = {"databases/mssql"}
    config_file_type = MSSQLConfigFile

    def __init__(self):
        super().__init__(MSSQLIntrospector())
databao_context_engine/plugins/mysql_db_plugin.py
@@ -0,0 +1,12 @@
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
from databao_context_engine.plugins.databases.mysql_introspector import MySQLConfigFile, MySQLIntrospector


class MySQLDbPlugin(BaseDatabasePlugin[MySQLConfigFile]):
    id = "jetbrains/mysql"
    name = "MySQL DB Plugin"
    supported = {"databases/mysql"}
    config_file_type = MySQLConfigFile

    def __init__(self):
        super().__init__(MySQLIntrospector())
databao_context_engine/plugins/parquet_plugin.py
@@ -0,0 +1,32 @@
from typing import Any

from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, EmbeddableChunk
from databao_context_engine.plugins.resources.parquet_chunker import build_parquet_chunks
from databao_context_engine.plugins.resources.parquet_introspector import (
    ParquetConfigFile,
    ParquetIntrospector,
    parquet_type,
)


class ParquetPlugin(BuildDatasourcePlugin[ParquetConfigFile]):
    id = "jetbrains/parquet"
    name = "Parquet Plugin"
    config_file_type = ParquetConfigFile

    def __init__(self):
        self._introspector = ParquetIntrospector()

    def supported_types(self) -> set[str]:
        return {parquet_type}

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        return build_parquet_chunks(context)

    def build_context(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> Any:
        introspection_result = self._introspector.introspect(file_config)

        return introspection_result

    def check_connection(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> None:
        self._introspector.check_connection(file_config)
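A minimal sketch of how the plugin's three hooks chain together (hypothetical driver code; in the package this orchestration lives in the build_sources modules, and the file path is a placeholder):

    from databao_context_engine.plugins.parquet_plugin import ParquetPlugin
    from databao_context_engine.plugins.resources.parquet_introspector import ParquetConfigFile

    plugin = ParquetPlugin()
    cfg = ParquetConfigFile(url="/tmp/events.parquet")  # hypothetical local path

    plugin.check_connection("resources/parquet", "events", cfg)          # raises if the resource is unreadable
    context = plugin.build_context("resources/parquet", "events", cfg)   # ParquetIntrospectionResult
    chunks = plugin.divide_context_into_chunks(context)                  # one EmbeddableChunk per column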
databao_context_engine/plugins/plugin_loader.py
@@ -0,0 +1,110 @@
import logging

from databao_context_engine.pluginlib.build_plugin import (
    BuildDatasourcePlugin,
    BuildFilePlugin,
    BuildPlugin,
    DatasourceType,
)

logger = logging.getLogger(__name__)


class DuplicatePluginTypeError(RuntimeError):
    """Raised when two plugins register the same <main>/<sub> plugin key."""


PluginList = dict[DatasourceType, BuildPlugin]


def get_all_available_plugin_types(exclude_file_plugins: bool = False) -> set[DatasourceType]:
    return set(load_plugins(exclude_file_plugins=exclude_file_plugins).keys())


def get_plugin_for_type(datasource_type: DatasourceType) -> BuildPlugin:
    all_plugins = load_plugins()

    if datasource_type not in all_plugins:
        raise ValueError(f"No plugin found for type '{datasource_type.full_type}'")

    return load_plugins()[datasource_type]


def load_plugins(exclude_file_plugins: bool = False) -> PluginList:
    """
    Loads both builtin and external plugins and merges them into one list
    """
    builtin_plugins = _load_builtin_plugins(exclude_file_plugins)
    external_plugins = _load_external_plugins(exclude_file_plugins)
    plugins = merge_plugins(builtin_plugins, external_plugins)

    return plugins


def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
    all_builtin_plugins: list[BuildPlugin] = []

    all_builtin_plugins += _load_builtin_datasource_plugins()

    if not exclude_file_plugins:
        all_builtin_plugins += _load_builtin_file_plugins()

    return all_builtin_plugins


def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
    from databao_context_engine.plugins.unstructured_files_plugin import InternalUnstructuredFilesPlugin

    return [
        InternalUnstructuredFilesPlugin(),
    ]


def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
    """
    Statically register built-in plugins
    """
    from databao_context_engine.plugins.athena_db_plugin import AthenaDbPlugin
    from databao_context_engine.plugins.clickhouse_db_plugin import ClickhouseDbPlugin
    from databao_context_engine.plugins.duckdb_db_plugin import DuckDbPlugin
    from databao_context_engine.plugins.mssql_db_plugin import MSSQLDbPlugin
    from databao_context_engine.plugins.mysql_db_plugin import MySQLDbPlugin
    from databao_context_engine.plugins.parquet_plugin import ParquetPlugin
    from databao_context_engine.plugins.postgresql_db_plugin import PostgresqlDbPlugin
    from databao_context_engine.plugins.snowflake_db_plugin import SnowflakeDbPlugin

    return [
        AthenaDbPlugin(),
        ClickhouseDbPlugin(),
        DuckDbPlugin(),
        MSSQLDbPlugin(),
        MySQLDbPlugin(),
        PostgresqlDbPlugin(),
        SnowflakeDbPlugin(),
        ParquetPlugin(),
    ]


def _load_external_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
    """
    Discover external plugins via entry points
    """
    # TODO: implement external plugin loading
    return []


def merge_plugins(*plugin_lists: list[BuildPlugin]) -> PluginList:
    """
    Merge multiple plugin maps
    """
    registry: PluginList = {}
    for plugins in plugin_lists:
        for plugin in plugins:
            for full_type in plugin.supported_types():
                datasource_type = DatasourceType(full_type=full_type)
                if datasource_type in registry:
                    raise DuplicatePluginTypeError(
                        f"Plugin type '{datasource_type.full_type}' is provided by both {type(registry[datasource_type]).__name__} and {type(plugin).__name__}"
                    )
                registry[datasource_type] = plugin
    return registry
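A short lookup sketch for the loader above (hypothetical; it assumes DatasourceType needs only full_type, as in merge_plugins):

    from databao_context_engine.pluginlib.build_plugin import DatasourceType
    from databao_context_engine.plugins.plugin_loader import get_plugin_for_type, load_plugins

    registry = load_plugins(exclude_file_plugins=True)   # builtin datasource plugins only
    plugin = get_plugin_for_type(DatasourceType(full_type="databases/snowflake"))
    print(plugin.name)                                    # "Snowflake DB Plugin"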
databao_context_engine/plugins/postgresql_db_plugin.py
@@ -0,0 +1,12 @@
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
from databao_context_engine.plugins.databases.postgresql_introspector import PostgresConfigFile, PostgresqlIntrospector


class PostgresqlDbPlugin(BaseDatabasePlugin[PostgresConfigFile]):
    id = "jetbrains/postgres"
    name = "PostgreSQL DB Plugin"
    supported = {"databases/postgres"}
    config_file_type = PostgresConfigFile

    def __init__(self):
        super().__init__(PostgresqlIntrospector())
databao_context_engine/plugins/resources/__init__.py
File without changes
databao_context_engine/plugins/resources/parquet_chunker.py
@@ -0,0 +1,23 @@
from dataclasses import dataclass

from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
from databao_context_engine.plugins.resources.parquet_introspector import ParquetColumn, ParquetIntrospectionResult


@dataclass
class ParquetColumnChunkContent:
    file_name: str
    column: ParquetColumn


def build_parquet_chunks(result: ParquetIntrospectionResult) -> list[EmbeddableChunk]:
    chunks = []
    for file in result.files:
        for column in file.columns:
            chunks.append(
                EmbeddableChunk(
                    embeddable_text=f"Column [name = {column.name}, type = {column.type}, number of values = {column.num_values}] in parquet file {file.name}",
                    content=ParquetColumnChunkContent(file_name=file.name, column=column),
                )
            )
    return chunks
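For illustration only, a hand-built introspection result (hypothetical values) shows the chunk text that build_parquet_chunks produces:

    from databao_context_engine.plugins.resources.parquet_chunker import build_parquet_chunks
    from databao_context_engine.plugins.resources.parquet_introspector import (
        ParquetColumn,
        ParquetFile,
        ParquetIntrospectionResult,
    )

    result = ParquetIntrospectionResult(
        files=[
            ParquetFile(
                name="events.parquet",
                columns=[
                    ParquetColumn(
                        name="user_id", type="INT64", row_groups=1, num_values=1000,
                        stats_min="1", stats_max="1000", stats_null_count=0, stats_distinct_count=None,
                    ),
                ],
            ),
        ]
    )

    chunks = build_parquet_chunks(result)
    print(chunks[0].embeddable_text)
    # Column [name = user_id, type = INT64, number of values = 1000] in parquet file events.parquet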
databao_context_engine/plugins/resources/parquet_introspector.py
@@ -0,0 +1,154 @@
import contextlib
import logging
import uuid
from collections import defaultdict
from dataclasses import dataclass, replace
from urllib.parse import urlparse

import duckdb
from _duckdb import DuckDBPyConnection
from pydantic import BaseModel, Field

from databao_context_engine.pluginlib.config import DuckDBSecret

parquet_type = "resources/parquet"

logger = logging.getLogger(__name__)


class ParquetConfigFile(BaseModel):
    name: str | None = Field(default=None)
    type: str = Field(default=parquet_type)
    url: str = Field(
        default=type,
        description="Parquet resource location. Should be a valid URL or a path to a local file. "
        "Examples: s3://your_bucket/file.parquet, s3://your-bucket/*.parquet, https://some.url/some_file.parquet, ~/path_to/file.parquet",
    )
    duckdb_secret: DuckDBSecret | None = None


@dataclass
class ParquetColumn:
    name: str
    type: str
    row_groups: int
    num_values: int
    stats_min: str
    stats_max: str
    stats_null_count: int | None
    stats_distinct_count: int | None


@dataclass
class ParquetFile:
    name: str
    columns: list[ParquetColumn]


@dataclass
class ParquetIntrospectionResult:
    files: list[ParquetFile]


def generate_create_secret_sql(secret_name, duckdb_secret: DuckDBSecret) -> str:
    parameters = [("type", duckdb_secret.type)] + list(duckdb_secret.properties.items())
    return f"""CREATE SECRET {secret_name} (
        {", ".join([f"{k} {v}" for (k, v) in parameters])}
    );
    """


@contextlib.contextmanager
def _create_secret(conn: DuckDBPyConnection, duckdb_secret: DuckDBSecret):
    secret_name = duckdb_secret.name or "gen_secret_" + str(uuid.uuid4()).replace("-", "_")
    create_secret_sql = generate_create_secret_sql(secret_name, duckdb_secret)
    try:
        logger.debug(f"About to create duckdb secret '{secret_name}' with type {duckdb_secret.type}")
        conn.sql(create_secret_sql)
        yield conn
    finally:
        logger.debug(f"Dropping duckdb secret '{secret_name}'")
        conn.sql(f"DROP SECRET IF EXISTS {secret_name};")


def _resolve_url(file_config: ParquetConfigFile) -> str:
    parquet_url = urlparse(file_config.url)
    if parquet_url.scheme == "file":
        return parquet_url.netloc
    return file_config.url


class ParquetIntrospector:
    @contextlib.contextmanager
    def _connect(self, file_config: ParquetConfigFile):
        duckdb_secret = file_config.duckdb_secret
        with duckdb.connect() as conn:
            if duckdb_secret is not None:
                if duckdb_secret.type == "s3":
                    conn.execute("INSTALL httpfs;")
                    conn.execute("LOAD httpfs;")
                    conn.execute("INSTALL s3;")
                    conn.execute("LOAD s3;")
                with _create_secret(conn, duckdb_secret):
                    yield conn
            else:
                yield conn

    def check_connection(self, file_config: ParquetConfigFile) -> None:
        with self._connect(file_config) as conn:
            with conn.cursor() as cur:
                resolved_url = _resolve_url(file_config)
                cur.execute(f"SELECT * FROM parquet_file_metadata('{resolved_url}') LIMIT 1")
                columns = [desc[0].lower() for desc in cur.description] if cur.description else []
                rows = cur.fetchall()
                parquet_file_metadata = [dict(zip(columns, row)) for row in rows]
                if not parquet_file_metadata:
                    raise ValueError(f"No parquet files found by url {resolved_url}")
                if not parquet_file_metadata or not parquet_file_metadata[0]["file_name"]:
                    raise ValueError("Parquet resource introspection failed")

    def introspect(self, file_config: ParquetConfigFile) -> ParquetIntrospectionResult:
        with self._connect(file_config) as conn:
            with conn.cursor() as cur:
                resolved_url = _resolve_url(file_config)
                cur.execute(f"SELECT * from parquet_metadata('{resolved_url}')")
                cols = [desc[0].lower() for desc in cur.description] if cur.description else []
                rows = cur.fetchall()
                file_metas = [dict(zip(cols, row)) for row in rows]

                columns_per_file: dict[str, dict[int, ParquetColumn]] = defaultdict(defaultdict)
                for file_meta in file_metas:
                    file_name = file_meta["file_name"]
                    column_id = file_meta["column_id"]
                    column_name = file_meta["path_in_schema"]
                    column_type = file_meta.get("type") or ""
                    num_values: int = file_meta["num_values"]
                    stats_min = file_meta.get("stats_min") or ""
                    stats_max = file_meta.get("stats_max") or ""
                    stats_null_count: int | None = file_meta.get("stats_null_count")
                    stats_distinct_count: int | None = file_meta.get("stats_distinct_count")

                    columns: dict[int, ParquetColumn] = columns_per_file[file_name]
                    column: ParquetColumn | None = columns.get(column_id)
                    if column:
                        columns[column_id] = replace(
                            column, num_values=column.num_values + num_values, row_groups=column.row_groups + 1
                        )
                    else:
                        columns[column_id] = ParquetColumn(
                            name=column_name,
                            type=column_type,
                            row_groups=1,
                            num_values=num_values,
                            stats_min=stats_min,
                            stats_max=stats_max,
                            stats_null_count=stats_null_count,
                            stats_distinct_count=stats_distinct_count,
                        )

                return ParquetIntrospectionResult(
                    files=[
                        ParquetFile(file_name, columns=list(columns.values()))
                        for (file_name, columns) in columns_per_file.items()
                    ]
                )
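A direct introspection sketch for the class above (hypothetical path; plain local paths need no duckdb_secret and fall through _resolve_url unchanged):

    from databao_context_engine.plugins.resources.parquet_introspector import (
        ParquetConfigFile,
        ParquetIntrospector,
    )

    cfg = ParquetConfigFile(url="/tmp/events.parquet")   # hypothetical local file
    result = ParquetIntrospector().introspect(cfg)
    for parquet_file in result.files:
        for column in parquet_file.columns:
            print(parquet_file.name, column.name, column.type, column.num_values)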
databao_context_engine/plugins/snowflake_db_plugin.py
@@ -0,0 +1,12 @@
from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
from databao_context_engine.plugins.databases.snowflake_introspector import SnowflakeConfigFile, SnowflakeIntrospector


class SnowflakeDbPlugin(BaseDatabasePlugin[SnowflakeConfigFile]):
    id = "jetbrains/snowflake"
    name = "Snowflake DB Plugin"
    supported = {"databases/snowflake"}
    config_file_type = SnowflakeConfigFile

    def __init__(self):
        super().__init__(SnowflakeIntrospector())