databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +32 -7
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +82 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
- databao_context_engine/cli/add_datasource_config.py +49 -44
- databao_context_engine/cli/commands.py +40 -55
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +127 -0
- databao_context_engine/databao_context_project_manager.py +147 -30
- databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
- databao_context_engine/datasources/datasource_context.py +90 -0
- databao_context_engine/datasources/datasource_discovery.py +143 -0
- databao_context_engine/datasources/types.py +194 -0
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +76 -57
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +26 -30
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +9 -11
- databao_context_engine/plugin_loader.py +110 -0
- databao_context_engine/pluginlib/build_plugin.py +12 -29
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
- databao_context_engine/plugins/databases/base_introspector.py +11 -12
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +58 -52
- databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +14 -15
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
- databao_context_engine/serialization/__init__.py +0 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +3 -5
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.5.dist-info/METADATA +228 -0
- databao_context_engine-0.1.5.dist-info/RECORD +135 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/plugins/databases/athena_introspector.py +0 -101
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/project/datasource_discovery.py +0 -141
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/project/types.py +0 -134
- databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.1.dist-info/METADATA +0 -186
- databao_context_engine-0.1.1.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
- /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
- /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
6
|
+
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
7
|
+
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
8
|
+
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLiteConnectionConfig(BaseModel):
    """Connection settings for a SQLite datasource."""

    database_path: str = Field(description="Path to the SQLite database file")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SQLiteConfigFile(BaseDatabaseConfigFile):
    """Datasource config file for a SQLite database.

    Extends the base database config with a ``type`` that defaults to
    ``"sqlite"`` and a SQLite-specific ``connection`` section.
    """

    type: str = Field(default="sqlite")
    connection: SQLiteConnectionConfig
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SQLiteIntrospector(BaseIntrospector[SQLiteConfigFile]):
    """Introspect a SQLite database file.

    Metadata is read from ``sqlite_master`` joined with SQLite's PRAGMA
    table-valued functions. Catalogs are not supported
    (``supports_catalogs = False``) and every object is reported under the
    single pseudo schema ``main``.
    """

    _IGNORED_SCHEMAS = {"temp", "information_schema"}
    _PSEUDO_SCHEMA = "main"
    supports_catalogs = False

    # Predicate excluding SQLite's internal objects (names starting with
    # "sqlite_") from rows of ``sqlite_master m``. The underscore is escaped
    # because in LIKE a bare "_" matches ANY single character, which would
    # also hide user tables such as "sqlitex" or "sqlite3_cache".
    _USER_OBJECTS_FILTER = "m.name NOT LIKE 'sqlite^_%' ESCAPE '^'"

    def _connect(self, file_config: SQLiteConfigFile, *, catalog: str | None = None):
        """Open a read-only connection to the configured database file.

        Args:
            file_config: Datasource config holding ``connection.database_path``.
            catalog: Unused by this introspector.

        Returns:
            An open ``sqlite3.Connection``.

        Raises:
            sqlite3.OperationalError: If the database file cannot be opened,
                e.g. because it does not exist.
        """
        from pathlib import Path

        database_path = str(file_config.connection.database_path)
        # A plain sqlite3.connect(path) silently creates an empty database when
        # the path is wrong. Opening the file through a read-only URI makes a
        # missing file an error and guarantees introspection never writes.
        read_only_uri = f"{Path(database_path).resolve().as_uri()}?mode=ro"
        conn = sqlite3.connect(read_only_uri, uri=True)
        conn.text_factory = str
        return conn

    def _get_catalogs(self, connection, file_config: SQLiteConfigFile) -> list[str]:
        """Return the single pseudo catalog name resolved from the config."""
        return [self._resolve_pseudo_catalog_name(file_config)]

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        """Run a query and return its rows as dicts keyed by lower-cased column name.

        Args:
            connection: Open SQLite connection.
            sql: SQL text to execute.
            params: Bind parameters, or None when the statement takes none.

        Returns:
            One dict per row; an empty list when the statement produces no
            result set.
        """
        cur = connection.cursor()
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)

            if cur.description is None:
                return []

            # Column names are the same for every row: compute them once.
            # Tuples and sqlite3.Row objects both iterate over their values in
            # column order, so a single zip handles either row type.
            cols = [d[0].lower() for d in cur.description]
            return [dict(zip(cols, row)) for row in cur.fetchall()]
        finally:
            cur.close()

    def _list_schemas_for_catalog(self, connection, catalog: str) -> list[str]:
        """Return the pseudo schema; ``connection`` and ``catalog`` are ignored."""
        return [self._PSEUDO_SCHEMA]

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        """Run every component query and assemble the schema model.

        Args:
            connection: Open SQLite connection.
            catalog: Unused by this introspector.
            schemas: Schemas selected for introspection. Only its emptiness is
                checked; the model is always built for the pseudo schema.

        Returns:
            An empty list when ``schemas`` is empty, otherwise whatever
            ``IntrospectionModelBuilder.build_schemas_from_components`` returns.
        """
        if not schemas:
            return []

        results: dict[str, list[dict]] = {
            name: self._fetchall_dicts(connection, sql, None) for name, sql in self._component_queries().items()
        }

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=[self._PSEUDO_SCHEMA],
            rels=results["relations"],
            cols=results["columns"],
            pk_cols=results["pk"],
            uq_cols=results["uq"],
            checks=[],
            fk_cols=results["fks"],
            idx_cols=results["idx"],
            partitions=[],
        )

    def _component_queries(self) -> dict[str, str]:
        """Return the SQL text of each metadata query, keyed by component name."""
        return {
            "relations": self._sql_relations(),
            "columns": self._sql_columns(),
            "pk": self._sql_primary_keys(),
            "uq": self._sql_unique(),
            "fks": self._sql_foreign_keys(),
            "idx": self._sql_indexes(),
        }

    def _sql_relations(self) -> str:
        """Return the query listing user tables and views (``kind`` is 'table' or 'view')."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                CASE m.type
                    WHEN 'view' THEN 'view'
                    ELSE 'table'
                END AS kind,
                NULL AS description
            FROM
                sqlite_master m
            WHERE
                m.type IN ('table', 'view')
                AND {self._USER_OBJECTS_FILTER}
            ORDER BY
                m.name;
        """

    def _sql_columns(self) -> str:
        """Return the query listing the columns of every user table and view.

        ``is_nullable`` is 0 for primary-key columns and for NOT NULL columns,
        1 otherwise. ``generated`` is 'computed' when ``pragma_table_xinfo``
        reports ``hidden`` as 2 or 3, NULL otherwise.
        """
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                c.name AS column_name,
                (c.cid + 1) AS ordinal_position,
                COALESCE(c.type,'') AS data_type,
                CASE
                    WHEN c.pk > 0 THEN 0
                    WHEN c."notnull" = 0 THEN 1
                    ELSE 0
                END AS is_nullable,
                c.dflt_value AS default_expression,
                CASE
                    WHEN c.hidden IN (2,3) THEN 'computed'
                END AS generated,
                NULL AS description
            FROM
                sqlite_master m
                JOIN pragma_table_xinfo(m.name) c
            WHERE
                m.type IN ('table','view')
                AND {self._USER_OBJECTS_FILTER}
            ORDER BY
                m.name,
                c.cid;
        """

    def _sql_primary_keys(self) -> str:
        """Return the query listing primary-key columns of user tables.

        SQLite does not expose a primary-key constraint name here, so one is
        synthesized as ``pk_<table>``.
        """
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('pk_' || m.name) AS constraint_name,
                c.pk AS position,
                c.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_table_info(m.name) c
            WHERE
                m.type = 'table'
                AND {self._USER_OBJECTS_FILTER}
                AND c.pk > 0
            ORDER BY
                m.name,
                c.pk;
        """

    def _sql_unique(self) -> str:
        """Return the query listing columns of unique indexes whose origin is 'u'."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS constraint_name,
                (ii.seqno + 1) AS position,
                ii.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_info(il.name) ii
            WHERE
                m.type = 'table'
                AND {self._USER_OBJECTS_FILTER}
                AND il."unique" = 1
                AND il.origin = 'u'
            ORDER BY
                m.name,
                il.name,
                ii.seqno;
        """

    def _sql_foreign_keys(self) -> str:
        """Return the query listing foreign-key columns of user tables.

        The constraint name is synthesized as ``fk_<table>_<id>``; ``enforced``
        and ``validated`` are always reported as 1.
        """
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('fk_' || m.name || '_' || fk.id) AS constraint_name,
                (fk.seq + 1) AS position,
                fk."from" AS from_column,
                '{self._PSEUDO_SCHEMA}' AS ref_schema,
                fk."table" AS ref_table,
                fk."to" AS to_column,
                lower(fk.on_update) AS on_update,
                lower(fk.on_delete) AS on_delete,
                1 AS enforced,
                1 AS validated
            FROM sqlite_master m
                JOIN pragma_foreign_key_list(m.name) fk
            WHERE
                m.type = 'table'
                AND {self._USER_OBJECTS_FILTER}
            ORDER BY
                m.name,
                fk.id,
                fk.seq;
        """

    def _sql_indexes(self) -> str:
        """Return the query listing key columns of indexes whose origin is 'c'.

        For partial indexes, ``predicate`` is the text following the first
        occurrence of ``WHERE`` in the index's stored ``CREATE INDEX`` SQL.
        """
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS index_name,
                (ix.seqno + 1) AS position,
                ix.name AS expr,
                il."unique" AS is_unique,
                NULL AS method,
                CASE
                    WHEN il.partial = 1 AND sm.sql IS NOT NULL AND instr(upper(sm.sql), 'WHERE') > 0
                    THEN trim(substr(sm.sql, instr(upper(sm.sql), 'WHERE') + length('WHERE')))
                END AS predicate
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_xinfo(il.name) ix
                LEFT JOIN sqlite_master sm ON sm.type = 'index' AND sm.name = il.name
            WHERE
                m.type='table'
                AND {self._USER_OBJECTS_FILTER}
                AND lower(il.origin) = 'c'
                AND ix.key = 1
            ORDER BY
                m.name,
                il.name,
                ix.seqno;
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        """Return a parameterized query selecting up to ``limit`` rows of ``table``.

        ``catalog`` and ``schema`` are not used: the table name is referenced
        unqualified.
        """
        sql = f"SELECT * FROM {self._quote_ident(table)} LIMIT ?"
        return SQLQuery(sql, (limit,))

    def _quote_ident(self, ident: str) -> str:
        """Return ``ident`` as a double-quoted SQL identifier, doubling embedded double quotes."""
        return '"' + str(ident).replace('"', '""') + '"'
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from databao_context_engine.pluginlib.config import DuckDBSecret
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def generate_create_secret_sql(secret_name, duckdb_secret: DuckDBSecret) -> str:
    """Build the DuckDB ``CREATE SECRET`` statement for a secret definition.

    Every parameter value is rendered as a single-quoted SQL string literal.
    Embedded single quotes are doubled so that a value containing ``'`` (for
    example a password or token) cannot terminate the literal early and break
    or alter the statement.

    Args:
        secret_name: Name of the secret; inserted into the statement as is.
        duckdb_secret: Secret definition whose ``type`` and ``properties``
            become the parameters of the statement.

    Returns:
        The ``CREATE SECRET`` SQL statement.
    """
    parameters = [("type", duckdb_secret.type), *duckdb_secret.properties.items()]
    rendered_parameters = ", ".join(
        "{} '{}'".format(key, str(value).replace("'", "''")) for key, value in parameters
    )
    return f"""CREATE SECRET {secret_name} (
    {rendered_parameters}
);
"""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def fetchall_dicts(cur, sql: str, params=None) -> list[dict[str, Any]]:
    """Execute ``sql`` on ``cur`` and return every row as a dict.

    Args:
        cur: Cursor exposing ``execute``, ``description`` and ``fetchall``.
        sql: SQL text to execute.
        params: Bind parameters; an empty list is passed when falsy.

    Returns:
        One dict per fetched row, keyed by lower-cased column name.
    """
    bind_values = params if params else []
    cur.execute(sql, bind_values)

    # Without a description there are no column names to pair values with.
    if cur.description:
        names = [column[0].lower() for column in cur.description]
    else:
        names = []

    records = []
    for values in cur.fetchall():
        records.append(dict(zip(names, values)))
    return records
|
|
File without changes
|
databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py}
RENAMED
|
@@ -24,7 +24,7 @@ class InternalUnstructuredFilesPlugin(BuildFilePlugin):
|
|
|
24
24
|
self.tokens_overlap = tokens_overlap or self._DEFAULT_TOKENS_OVERLAP
|
|
25
25
|
|
|
26
26
|
def supported_types(self) -> set[str]:
|
|
27
|
-
return
|
|
27
|
+
return self._SUPPORTED_FILES_EXTENSIONS
|
|
28
28
|
|
|
29
29
|
def build_file_context(self, full_type: str, file_name: str, file_buffer: BufferedReader) -> Any:
|
|
30
30
|
file_content = self._read_file(file_buffer)
|
|
@@ -14,31 +14,12 @@ class DuplicatePluginTypeError(RuntimeError):
|
|
|
14
14
|
"""Raised when two plugins register the same <main>/<sub> plugin key."""
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def get_all_available_plugin_types(exclude_file_plugins: bool = False) -> set[DatasourceType]:
|
|
21
|
-
return set(load_plugins(exclude_file_plugins=exclude_file_plugins).keys())
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_plugin_for_type(datasource_type: DatasourceType) -> BuildPlugin:
|
|
25
|
-
all_plugins = load_plugins()
|
|
26
|
-
|
|
27
|
-
if datasource_type not in all_plugins:
|
|
28
|
-
raise ValueError(f"No plugin found for type '{datasource_type.full_type}'")
|
|
29
|
-
|
|
30
|
-
return load_plugins()[datasource_type]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def load_plugins(exclude_file_plugins: bool = False) -> PluginList:
|
|
34
|
-
"""
|
|
35
|
-
Loads both builtin and external plugins and merges them into one list
|
|
36
|
-
"""
|
|
17
|
+
def load_plugins(exclude_file_plugins: bool = False) -> dict[DatasourceType, BuildPlugin]:
|
|
18
|
+
"""Load both builtin and external plugins and merges them into one list."""
|
|
37
19
|
builtin_plugins = _load_builtin_plugins(exclude_file_plugins)
|
|
38
20
|
external_plugins = _load_external_plugins(exclude_file_plugins)
|
|
39
|
-
plugins = merge_plugins(builtin_plugins, external_plugins)
|
|
40
21
|
|
|
41
|
-
return
|
|
22
|
+
return _merge_plugins(builtin_plugins, external_plugins)
|
|
42
23
|
|
|
43
24
|
|
|
44
25
|
def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
|
|
@@ -53,7 +34,7 @@ def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugi
|
|
|
53
34
|
|
|
54
35
|
|
|
55
36
|
def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
|
|
56
|
-
from databao_context_engine.plugins.unstructured_files_plugin import InternalUnstructuredFilesPlugin
|
|
37
|
+
from databao_context_engine.plugins.files.unstructured_files_plugin import InternalUnstructuredFilesPlugin
|
|
57
38
|
|
|
58
39
|
return [
|
|
59
40
|
InternalUnstructuredFilesPlugin(),
|
|
@@ -61,43 +42,68 @@ def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
|
|
|
61
42
|
|
|
62
43
|
|
|
63
44
|
def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
from databao_context_engine.plugins.
|
|
68
|
-
from databao_context_engine.plugins.clickhouse_db_plugin import ClickhouseDbPlugin
|
|
69
|
-
from databao_context_engine.plugins.duckdb_db_plugin import DuckDbPlugin
|
|
70
|
-
from databao_context_engine.plugins.mssql_db_plugin import MSSQLDbPlugin
|
|
71
|
-
from databao_context_engine.plugins.mysql_db_plugin import MySQLDbPlugin
|
|
72
|
-
from databao_context_engine.plugins.parquet_plugin import ParquetPlugin
|
|
73
|
-
from databao_context_engine.plugins.postgresql_db_plugin import PostgresqlDbPlugin
|
|
74
|
-
from databao_context_engine.plugins.snowflake_db_plugin import SnowflakeDbPlugin
|
|
45
|
+
"""Statically register built-in plugins."""
|
|
46
|
+
from databao_context_engine.plugins.databases.duckdb.duckdb_db_plugin import DuckDbPlugin
|
|
47
|
+
from databao_context_engine.plugins.databases.sqlite.sqlite_db_plugin import SQLiteDbPlugin
|
|
48
|
+
from databao_context_engine.plugins.resources.parquet_plugin import ParquetPlugin
|
|
75
49
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
50
|
+
# optional plugins are added to the python environment via extras
|
|
51
|
+
optional_plugins: list[BuildDatasourcePlugin] = []
|
|
52
|
+
try:
|
|
53
|
+
from databao_context_engine.plugins.databases.mssql.mssql_db_plugin import MSSQLDbPlugin
|
|
54
|
+
|
|
55
|
+
optional_plugins = [MSSQLDbPlugin()]
|
|
56
|
+
except ImportError:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
from databao_context_engine.plugins.databases.clickhouse.clickhouse_db_plugin import ClickhouseDbPlugin
|
|
61
|
+
|
|
62
|
+
optional_plugins.append(ClickhouseDbPlugin())
|
|
63
|
+
except ImportError:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
from databao_context_engine.plugins.databases.athena.athena_db_plugin import AthenaDbPlugin
|
|
68
|
+
|
|
69
|
+
optional_plugins.append(AthenaDbPlugin())
|
|
70
|
+
except ImportError:
|
|
71
|
+
pass
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
from databao_context_engine.plugins.databases.snowflake.snowflake_db_plugin import SnowflakeDbPlugin
|
|
75
|
+
|
|
76
|
+
optional_plugins.append(SnowflakeDbPlugin())
|
|
77
|
+
except ImportError:
|
|
78
|
+
pass
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
from databao_context_engine.plugins.databases.mysql.mysql_db_plugin import MySQLDbPlugin
|
|
82
|
+
|
|
83
|
+
optional_plugins.append(MySQLDbPlugin())
|
|
84
|
+
except ImportError:
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
try:
|
|
88
|
+
from databao_context_engine.plugins.databases.postgresql.postgresql_db_plugin import PostgresqlDbPlugin
|
|
89
|
+
|
|
90
|
+
optional_plugins.append(PostgresqlDbPlugin())
|
|
91
|
+
except ImportError:
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
required_plugins: list[BuildDatasourcePlugin] = [DuckDbPlugin(), ParquetPlugin(), SQLiteDbPlugin()]
|
|
95
|
+
return required_plugins + optional_plugins
|
|
86
96
|
|
|
87
97
|
|
|
88
98
|
def _load_external_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
|
|
89
|
-
"""
|
|
90
|
-
Discover external plugins via entry points
|
|
91
|
-
"""
|
|
99
|
+
"""Discover external plugins via entry points."""
|
|
92
100
|
# TODO: implement external plugin loading
|
|
93
101
|
return []
|
|
94
102
|
|
|
95
103
|
|
|
96
|
-
def
|
|
97
|
-
"""
|
|
98
|
-
|
|
99
|
-
"""
|
|
100
|
-
registry: PluginList = {}
|
|
104
|
+
def _merge_plugins(*plugin_lists: list[BuildPlugin]) -> dict[DatasourceType, BuildPlugin]:
|
|
105
|
+
"""Merge multiple plugin maps."""
|
|
106
|
+
registry: dict[DatasourceType, BuildPlugin] = {}
|
|
101
107
|
for plugins in plugin_lists:
|
|
102
108
|
for plugin in plugins:
|
|
103
109
|
for full_type in plugin.supported_types():
|
|
@@ -6,21 +6,21 @@ from dataclasses import dataclass, replace
|
|
|
6
6
|
from urllib.parse import urlparse
|
|
7
7
|
|
|
8
8
|
import duckdb
|
|
9
|
-
from
|
|
9
|
+
from duckdb import DuckDBPyConnection
|
|
10
10
|
from pydantic import BaseModel, Field
|
|
11
11
|
|
|
12
12
|
from databao_context_engine.pluginlib.config import DuckDBSecret
|
|
13
|
+
from databao_context_engine.plugins.duckdb_tools import fetchall_dicts, generate_create_secret_sql
|
|
13
14
|
|
|
14
|
-
parquet_type = "
|
|
15
|
+
parquet_type = "parquet"
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
class ParquetConfigFile(BaseModel):
|
|
20
21
|
name: str | None = Field(default=None)
|
|
21
|
-
type: str = Field(default=
|
|
22
|
+
type: str = Field(default="parquet")
|
|
22
23
|
url: str = Field(
|
|
23
|
-
default=type,
|
|
24
24
|
description="Parquet resource location. Should be a valid URL or a path to a local file. "
|
|
25
25
|
"Examples: s3://your_bucket/file.parquet, s3://your-bucket/*.parquet, https://some.url/some_file.parquet, ~/path_to/file.parquet",
|
|
26
26
|
)
|
|
@@ -50,14 +50,6 @@ class ParquetIntrospectionResult:
|
|
|
50
50
|
files: list[ParquetFile]
|
|
51
51
|
|
|
52
52
|
|
|
53
|
-
def generate_create_secret_sql(secret_name, duckdb_secret: DuckDBSecret) -> str:
|
|
54
|
-
parameters = [("type", duckdb_secret.type)] + list(duckdb_secret.properties.items())
|
|
55
|
-
return f"""CREATE SECRET {secret_name} (
|
|
56
|
-
{", ".join([f"{k} {v}" for (k, v) in parameters])}
|
|
57
|
-
);
|
|
58
|
-
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
53
|
@contextlib.contextmanager
|
|
62
54
|
def _create_secret(conn: DuckDBPyConnection, duckdb_secret: DuckDBSecret):
|
|
63
55
|
secret_name = duckdb_secret.name or "gen_secret_" + str(uuid.uuid4()).replace("-", "_")
|
|
@@ -98,10 +90,9 @@ class ParquetIntrospector:
|
|
|
98
90
|
with self._connect(file_config) as conn:
|
|
99
91
|
with conn.cursor() as cur:
|
|
100
92
|
resolved_url = _resolve_url(file_config)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
parquet_file_metadata = [dict(zip(columns, row)) for row in rows]
|
|
93
|
+
parquet_file_metadata = fetchall_dicts(
|
|
94
|
+
cur, f"SELECT * FROM parquet_file_metadata('{resolved_url}') LIMIT 1"
|
|
95
|
+
)
|
|
105
96
|
if not parquet_file_metadata:
|
|
106
97
|
raise ValueError(f"No parquet files found by url {resolved_url}")
|
|
107
98
|
if not parquet_file_metadata or not parquet_file_metadata[0]["file_name"]:
|
|
@@ -111,10 +102,7 @@ class ParquetIntrospector:
|
|
|
111
102
|
with self._connect(file_config) as conn:
|
|
112
103
|
with conn.cursor() as cur:
|
|
113
104
|
resolved_url = _resolve_url(file_config)
|
|
114
|
-
cur
|
|
115
|
-
cols = [desc[0].lower() for desc in cur.description] if cur.description else []
|
|
116
|
-
rows = cur.fetchall()
|
|
117
|
-
file_metas = [dict(zip(cols, row)) for row in rows]
|
|
105
|
+
file_metas = fetchall_dicts(cur, f"SELECT * from parquet_metadata('{resolved_url}')")
|
|
118
106
|
|
|
119
107
|
columns_per_file: dict[str, dict[int, ParquetColumn]] = defaultdict(defaultdict)
|
|
120
108
|
for file_meta in file_metas:
|
|
@@ -24,9 +24,7 @@ class ParquetPlugin(BuildDatasourcePlugin[ParquetConfigFile]):
|
|
|
24
24
|
return build_parquet_chunks(context)
|
|
25
25
|
|
|
26
26
|
def build_context(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> Any:
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
return introspection_result
|
|
27
|
+
return self._introspector.introspect(file_config)
|
|
30
28
|
|
|
31
29
|
def check_connection(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> None:
|
|
32
30
|
self._introspector.check_connection(file_config)
|
|
@@ -3,29 +3,56 @@ from importlib.metadata import version
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
|
+
from databao_context_engine.plugins.plugin_loader import load_plugins
|
|
6
7
|
from databao_context_engine.project.layout import validate_project_dir
|
|
7
8
|
from databao_context_engine.system.properties import get_dce_path
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
@dataclass(kw_only=True, frozen=True)
|
|
11
12
|
class DceProjectInfo:
|
|
13
|
+
"""Information about a Databao Context Engine project.
|
|
14
|
+
|
|
15
|
+
Attributes:
|
|
16
|
+
project_path: The root directory of the Databao Context Engine project.
|
|
17
|
+
is_initialized: Whether the project has been initialized.
|
|
18
|
+
project_id: The UUID of the project, or None if the project has not been initialized.
|
|
19
|
+
"""
|
|
20
|
+
|
|
12
21
|
project_path: Path
|
|
13
|
-
|
|
22
|
+
is_initialized: bool
|
|
14
23
|
project_id: UUID | None
|
|
15
24
|
|
|
16
25
|
|
|
17
26
|
@dataclass(kw_only=True, frozen=True)
|
|
18
27
|
class DceInfo:
|
|
28
|
+
"""Information about the current Databao Context Engine installation and project.
|
|
29
|
+
|
|
30
|
+
Attributes:
|
|
31
|
+
version: The version of the databao_context_engine package installed on the system.
|
|
32
|
+
dce_path: The path where databao_context_engine stores its global data.
|
|
33
|
+
project_info: Information about the Databao Context Engine project.
|
|
34
|
+
"""
|
|
35
|
+
|
|
19
36
|
version: str
|
|
20
37
|
dce_path: Path
|
|
38
|
+
plugin_ids: list[str]
|
|
21
39
|
|
|
22
40
|
project_info: DceProjectInfo
|
|
23
41
|
|
|
24
42
|
|
|
25
43
|
def get_databao_context_engine_info(project_dir: Path) -> DceInfo:
|
|
44
|
+
"""Return information about the current Databao Context Engine installation and project.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
project_dir: The root directory of the Databao Context Project.
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
A DceInfo instance containing information about the Databao Context Engine installation and project.
|
|
51
|
+
"""
|
|
26
52
|
return DceInfo(
|
|
27
53
|
version=get_dce_version(),
|
|
28
54
|
dce_path=get_dce_path(),
|
|
55
|
+
plugin_ids=[plugin.id for plugin in load_plugins().values()],
|
|
29
56
|
project_info=_get_project_info(project_dir),
|
|
30
57
|
)
|
|
31
58
|
|
|
@@ -35,10 +62,15 @@ def _get_project_info(project_dir: Path) -> DceProjectInfo:
|
|
|
35
62
|
|
|
36
63
|
return DceProjectInfo(
|
|
37
64
|
project_path=project_dir,
|
|
38
|
-
|
|
65
|
+
is_initialized=project_layout is not None,
|
|
39
66
|
project_id=project_layout.read_config_file().project_id if project_layout is not None else None,
|
|
40
67
|
)
|
|
41
68
|
|
|
42
69
|
|
|
43
70
|
def get_dce_version() -> str:
|
|
71
|
+
"""Return the installed version of the databao_context_engine package.
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
The installed version of the databao_context_engine package.
|
|
75
|
+
"""
|
|
44
76
|
return version("databao_context_engine")
|
|
@@ -13,12 +13,21 @@ from databao_context_engine.project.project_config import ProjectConfig
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class InitErrorReason(Enum):
|
|
16
|
+
"""Reasons for which project initialization can fail."""
|
|
17
|
+
|
|
16
18
|
PROJECT_DIR_DOESNT_EXIST = "PROJECT_DIR_DOESNT_EXIST"
|
|
17
19
|
PROJECT_DIR_NOT_DIRECTORY = "PROJECT_DIR_NOT_DIRECTORY"
|
|
18
|
-
|
|
20
|
+
PROJECT_DIR_ALREADY_INITIALIZED = "PROJECT_DIR_ALREADY_INITIALIZED"
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
class InitProjectError(Exception):
|
|
24
|
+
"""Raised when a project can't be initialized.
|
|
25
|
+
|
|
26
|
+
Attributes:
|
|
27
|
+
message: The error message.
|
|
28
|
+
reason: The reason for the initialization failure.
|
|
29
|
+
"""
|
|
30
|
+
|
|
22
31
|
reason: InitErrorReason
|
|
23
32
|
|
|
24
33
|
def __init__(self, reason: InitErrorReason, message: str | None):
|
|
@@ -65,20 +74,20 @@ class _ProjectCreator:
|
|
|
65
74
|
|
|
66
75
|
if self.config_file.is_file() or self.deprecated_config_file.is_file():
|
|
67
76
|
raise InitProjectError(
|
|
68
|
-
message=f"Can't
|
|
69
|
-
reason=InitErrorReason.
|
|
77
|
+
message=f"Can't initialize a Databao Context Engine project in a folder that already contains a config file. [project_dir: {self.project_dir.resolve()}]",
|
|
78
|
+
reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALIZED,
|
|
70
79
|
)
|
|
71
80
|
|
|
72
81
|
if self.src_dir.is_dir():
|
|
73
82
|
raise InitProjectError(
|
|
74
|
-
message=f"Can't
|
|
75
|
-
reason=InitErrorReason.
|
|
83
|
+
message=f"Can't initialize a Databao Context Engine project in a folder that already contains a src directory. [project_dir: {self.project_dir.resolve()}]",
|
|
84
|
+
reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALIZED,
|
|
76
85
|
)
|
|
77
86
|
|
|
78
87
|
if self.examples_dir.is_file():
|
|
79
88
|
raise InitProjectError(
|
|
80
|
-
message=f"Can't
|
|
81
|
-
reason=InitErrorReason.
|
|
89
|
+
message=f"Can't initialize a Databao Context Engine project in a folder that already contains an examples dir. [project_dir: {self.project_dir.resolve()}]",
|
|
90
|
+
reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALIZED,
|
|
82
91
|
)
|
|
83
92
|
|
|
84
93
|
return True
|