databao_context_engine-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0

databao_context_engine/plugins/databases/snowflake_introspector.py
@@ -0,0 +1,287 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Mapping
+
+ import snowflake.connector
+ from pydantic import Field
+ from snowflake.connector import DictCursor
+
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
+ from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
+ from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
+ from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
+
+
+ class SnowflakeConfigFile(BaseDatabaseConfigFile):
+     type: str = Field(default="databases/snowflake")
+     connection: dict[str, Any] = Field(
+         description="Connection parameters for Snowflake. It can contain any of the keys supported by the Snowflake connection library"
+     )
+
+
+ class SnowflakeIntrospector(BaseIntrospector[SnowflakeConfigFile]):
+     _IGNORED_SCHEMAS = {
+         "information_schema",
+     }
+     _IGNORED_CATALOGS = {"STREAMLIT_APPS"}
+     supports_catalogs = True
+
+     def _connect(self, file_config: SnowflakeConfigFile):
+         connection = file_config.connection
+         if not isinstance(connection, Mapping):
+             raise ValueError("Invalid YAML config: 'connection' must be a mapping of connection parameters")
+         snowflake.connector.paramstyle = "qmark"
+         return snowflake.connector.connect(
+             **connection,
+         )
+
+     def _connect_to_catalog(self, file_config: SnowflakeConfigFile, catalog: str):
+         cfg = dict(file_config.connection or {})
+         cfg["database"] = catalog
+         return snowflake.connector.connect(**cfg)
+
+     def _get_catalogs(self, connection, file_config: SnowflakeConfigFile) -> list[str]:
+         database = file_config.connection.get("database")
+         if database:
+             return [database]
+
+         rows = self._fetchall_dicts(connection, "SHOW DATABASES", None)
+         return [r["name"] for r in rows if r["name"] and r["name"].upper() not in self._IGNORED_CATALOGS]
+
+     def _sql_list_schemas(self, catalogs: list[str] | None) -> SQLQuery:
+         if not catalogs:
+             return SQLQuery("SELECT schema_name, catalog_name FROM information_schema.schemata", None)
+         parts = []
+         for catalog in catalogs:
+             parts.append(f"SELECT schema_name, catalog_name FROM {catalog}.information_schema.schemata")
+         return SQLQuery(" UNION ALL ".join(parts), None)
+
+     def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
+         if not schemas:
+             return []
+
+         comps = self._component_queries(catalog, schemas)
+
+         statements = [c["sql"].rstrip().rstrip(";") for c in comps]
+         batch_sql = ";\n".join(statements)
+
+         results: dict[str, list[dict]] = {
+             "relations": [],
+             "columns": [],
+             "pk": [],
+             "fks": [],
+             "uq": [],
+         }
+
+         with connection.cursor(DictCursor) as cur:
+             cur.execute(batch_sql, num_statements=len(statements))
+
+             for ix, comp in enumerate(comps, start=1):
+                 name = comp["name"]
+
+                 rows = self._lower_keys(cur.fetchall()) if cur.description else []
+
+                 if name:
+                     results[name].extend(rows)
+
+                 if ix < len(comps):
+                     ok = cur.nextset()
+                     if not ok:
+                         raise RuntimeError(
+                             f"Snowflake multi-statement batch ended early after component #{ix} '{name}'"
+                         )
+
+         return IntrospectionModelBuilder.build_schemas_from_components(
+             schemas=schemas,
+             rels=results["relations"],
+             cols=results["columns"],
+             pk_cols=results["pk"],
+             uq_cols=results["uq"],
+             checks=[],
+             fk_cols=results["fks"],
+             idx_cols=[],
+         )
+
+     def _component_queries(self, catalog: str, schemas: list[str]) -> list[dict]:
+         schemas_in = ", ".join(self._quote_literal(s) for s in schemas)
+
+         return [
+             {"name": "relations", "sql": self._sql_relations(catalog, schemas_in)},
+             {"name": "columns", "sql": self._sql_columns(catalog, schemas_in)},
+             {"name": None, "sql": self._sql_pk_show(catalog)},
+             {"name": "pk", "sql": self._sql_pk_select(schemas_in)},
+             {"name": None, "sql": self._sql_fk_show(catalog)},
+             {"name": "fks", "sql": self._sql_fk_select(schemas_in)},
+             {"name": None, "sql": self._sql_uq_show(catalog)},
+             {"name": "uq", "sql": self._sql_uq_select(schemas_in)},
+         ]
+
+     def _sql_relations(self, catalog: str, schemas_in: str) -> str:
+         isq = self._qual_is(catalog)
+
+         return f"""
+             SELECT
+                 t.TABLE_SCHEMA AS "schema_name",
+                 t.TABLE_NAME AS "table_name",
+                 CASE t.TABLE_TYPE
+                     WHEN 'BASE TABLE' THEN 'table'
+                     WHEN 'VIEW' THEN 'view'
+                     WHEN 'MATERIALIZED VIEW' THEN 'materialized_view'
+                     WHEN 'EXTERNAL TABLE' THEN 'external_table'
+                     ELSE LOWER(t.TABLE_TYPE)
+                 END AS "kind",
+                 t.COMMENT AS "description"
+             FROM
+                 {isq}.TABLES AS t
+             WHERE
+                 t.TABLE_SCHEMA IN ({schemas_in})
+             ORDER BY
+                 t.TABLE_SCHEMA,
+                 t.TABLE_NAME
+         """
+
+     def _sql_columns(self, catalog: str, schemas_in: str) -> str:
+         isq = self._qual_is(catalog)
+
+         return f"""
+             SELECT
+                 c.TABLE_SCHEMA AS "schema_name",
+                 c.TABLE_NAME AS "table_name",
+                 c.COLUMN_NAME AS "column_name",
+                 c.ORDINAL_POSITION AS "ordinal_position",
+                 c.DATA_TYPE AS "data_type",
+                 IFF(c.IS_NULLABLE = 'YES', TRUE, FALSE) AS "is_nullable",
+                 c.COLUMN_DEFAULT AS "default_expression",
+                 IFF(c.IS_IDENTITY = 'YES', 'identity', NULL) AS "generated",
+                 c.COMMENT AS "description"
+             FROM
+                 {isq}.COLUMNS AS c
+             WHERE
+                 c.TABLE_SCHEMA IN ({schemas_in})
+             ORDER BY
+                 c.TABLE_SCHEMA,
+                 c.TABLE_NAME,
+                 c.ORDINAL_POSITION
+         """
+
+     def _sql_pk_show(self, catalog: str) -> str:
+         return f"""
+             SHOW PRIMARY KEYS IN DATABASE {self._quote_ident(catalog)}
+         """
+
+     def _sql_pk_select(self, schemas_in: str) -> str:
+         return f"""
+             SELECT
+                 "schema_name" AS schema_name,
+                 "table_name" AS table_name,
+                 "constraint_name" AS constraint_name,
+                 "column_name" AS column_name,
+                 "key_sequence"::INT AS position
+             FROM
+                 TABLE(RESULT_SCAN(LAST_QUERY_ID()))
+             WHERE
+                 "schema_name" IN ({schemas_in})
+             ORDER BY
+                 table_name,
+                 constraint_name,
+                 position
+         """
+
+     def _sql_fk_show(self, catalog: str) -> str:
+         return f"""
+             SHOW IMPORTED KEYS IN DATABASE {self._quote_ident(catalog)}
+         """
+
+     def _sql_fk_select(self, schemas_in: str) -> str:
+         return f"""
+             SELECT
+                 "fk_schema_name" AS "schema_name",
+                 "fk_table_name" AS "table_name",
+                 "fk_name" AS "constraint_name",
+                 "key_sequence"::INT AS "position",
+                 "fk_column_name" AS "from_column",
+                 "pk_schema_name" AS "ref_schema",
+                 "pk_table_name" AS "ref_table",
+                 "pk_column_name" AS "to_column",
+                 LOWER("update_rule") AS "on_update",
+                 LOWER("delete_rule") AS "on_delete",
+                 NULL::BOOLEAN AS "enforced",
+                 NULL::BOOLEAN AS "validated"
+             FROM
+                 TABLE(RESULT_SCAN(LAST_QUERY_ID()))
+             WHERE
+                 "fk_schema_name" IN ({schemas_in})
+             ORDER BY
+                 "schema_name",
+                 "table_name",
+                 "constraint_name",
+                 "position"
+         """
+
+     def _sql_uq_show(self, catalog: str) -> str:
+         return f"""
+             SHOW UNIQUE KEYS IN DATABASE {self._quote_ident(catalog)}
+         """
+
+     def _sql_uq_select(self, schemas_in: str) -> str:
+         return f"""
+             SELECT
+                 "schema_name" AS "schema_name",
+                 "table_name" AS "table_name",
+                 "constraint_name" AS "constraint_name",
+                 "column_name" AS "column_name",
+                 "key_sequence"::INT AS "position"
+             FROM
+                 TABLE(RESULT_SCAN(LAST_QUERY_ID()))
+             WHERE
+                 "schema_name" IN ({schemas_in})
+             ORDER BY
+                 "schema_name",
+                 "table_name",
+                 "constraint_name",
+                 "position"
+         """
+
+     def _sql_checks(self, catalog: str, schemas_in: str) -> str:
+         isq = self._qual_is(catalog)
+
+         return f"""
+             SELECT
+                 tc.TABLE_SCHEMA AS "schema_name",
+                 tc.TABLE_NAME AS "table_name",
+                 tc.CONSTRAINT_NAME AS "constraint_name",
+                 NULL::VARCHAR AS "expression",
+                 TRUE AS "validated"
+             FROM
+                 {isq}.TABLE_CONSTRAINTS AS tc
+             WHERE
+                 tc.TABLE_SCHEMA IN ({schemas_in})
+                 AND tc.CONSTRAINT_TYPE = 'CHECK'
+             ORDER BY
+                 tc.TABLE_SCHEMA,
+                 tc.TABLE_NAME,
+                 tc.CONSTRAINT_NAME
+         """
+
+     def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
+         sql = f'SELECT * FROM "{schema}"."{table}" LIMIT ?'
+         return SQLQuery(sql, (limit,))
+
+     def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
+         with connection.cursor(snowflake.connector.DictCursor) as cur:
+             cur.execute(sql, params)
+             rows = cur.fetchall()
+         return [{k.lower(): v for k, v in row.items()} for row in rows]
+
+     def _quote_literal(self, value: str) -> str:
+         return "'" + str(value).replace("'", "''") + "'"
+
+     def _quote_ident(self, ident: str) -> str:
+         return '"' + ident.replace('"', '""') + '"'
+
+     def _qual_is(self, catalog: str) -> str:
+         return f"{self._quote_ident(catalog)}.INFORMATION_SCHEMA"
+
+     @staticmethod
+     def _lower_keys(rows: List[Dict]) -> List[Dict]:
+         return [{k.lower(): v for k, v in row.items()} for row in rows]
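
The PK/FK/UNIQUE components above lean on a Snowflake idiom: SHOW commands cannot be filtered with WHERE, so each SHOW is paired with a SELECT over TABLE(RESULT_SCAN(LAST_QUERY_ID())), which re-queries the SHOW output as an ordinary result set. A minimal standalone sketch of that pairing, outside the package (connection parameters and the database name are placeholders):

import snowflake.connector
from snowflake.connector import DictCursor

conn = snowflake.connector.connect(account="...", user="...", password="...")  # hypothetical credentials
with conn.cursor(DictCursor) as cur:
    # The SHOW output becomes the session's "last query"...
    cur.execute('SHOW PRIMARY KEYS IN DATABASE "ANALYTICS"')
    # ...so RESULT_SCAN(LAST_QUERY_ID()) can filter and reshape it with plain SQL.
    cur.execute(
        """
        SELECT "schema_name", "table_name", "column_name", "key_sequence"::INT AS position
        FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))
        WHERE "schema_name" IN ('PUBLIC')
        ORDER BY "table_name", position
        """
    )
    print(cur.fetchall())

In collect_catalog_model the same trick is batched: all eight statements go out in one num_statements multi-statement request, and cur.nextset() walks the result sets in order, which is why the SHOW components carry name=None and are skipped when collecting rows.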

databao_context_engine/plugins/duckdb_db_plugin.py
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.duckdb_introspector import DuckDBConfigFile, DuckDBIntrospector
+
+
+ class DuckDbPlugin(BaseDatabasePlugin[DuckDBConfigFile]):
+     id = "jetbrains/duckdb"
+     name = "DuckDB Plugin"
+     supported = {"databases/duckdb"}
+     config_file_type = DuckDBConfigFile
+
+     def __init__(self):
+         super().__init__(DuckDBIntrospector())

databao_context_engine/plugins/mssql_db_plugin.py
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.mssql_introspector import MSSQLConfigFile, MSSQLIntrospector
+
+
+ class MSSQLDbPlugin(BaseDatabasePlugin[MSSQLConfigFile]):
+     id = "jetbrains/mssql"
+     name = "MSSQL DB Plugin"
+     supported = {"databases/mssql"}
+     config_file_type = MSSQLConfigFile
+
+     def __init__(self):
+         super().__init__(MSSQLIntrospector())

databao_context_engine/plugins/mysql_db_plugin.py
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.mysql_introspector import MySQLConfigFile, MySQLIntrospector
+
+
+ class MySQLDbPlugin(BaseDatabasePlugin[MySQLConfigFile]):
+     id = "jetbrains/mysql"
+     name = "MySQL DB Plugin"
+     supported = {"databases/mysql"}
+     config_file_type = MySQLConfigFile
+
+     def __init__(self):
+         super().__init__(MySQLIntrospector())

databao_context_engine/plugins/parquet_plugin.py
@@ -0,0 +1,32 @@
+ from typing import Any
+
+ from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, EmbeddableChunk
+ from databao_context_engine.plugins.resources.parquet_chunker import build_parquet_chunks
+ from databao_context_engine.plugins.resources.parquet_introspector import (
+     ParquetConfigFile,
+     ParquetIntrospector,
+     parquet_type,
+ )
+
+
+ class ParquetPlugin(BuildDatasourcePlugin[ParquetConfigFile]):
+     id = "jetbrains/parquet"
+     name = "Parquet Plugin"
+     config_file_type = ParquetConfigFile
+
+     def __init__(self):
+         self._introspector = ParquetIntrospector()
+
+     def supported_types(self) -> set[str]:
+         return {parquet_type}
+
+     def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
+         return build_parquet_chunks(context)
+
+     def build_context(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> Any:
+         introspection_result = self._introspector.introspect(file_config)
+
+         return introspection_result
+
+     def check_connection(self, full_type: str, datasource_name: str, file_config: ParquetConfigFile) -> None:
+         self._introspector.check_connection(file_config)
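
A hedged usage sketch of the plugin surface shown in this hunk: build_context returns the introspection result unchanged, and the chunker flattens it into one chunk per parquet column. The local file path is hypothetical, and a real file must exist there for this to run:

from databao_context_engine.plugins.parquet_plugin import ParquetPlugin
from databao_context_engine.plugins.resources.parquet_introspector import ParquetConfigFile, parquet_type

plugin = ParquetPlugin()
config = ParquetConfigFile(name="example", url="~/data/example.parquet")  # hypothetical path
plugin.check_connection(parquet_type, "example", config)   # raises ValueError if nothing is found
context = plugin.build_context(parquet_type, "example", config)
for chunk in plugin.divide_context_into_chunks(context):
    print(chunk.embeddable_text)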

databao_context_engine/plugins/plugin_loader.py
@@ -0,0 +1,110 @@
+ import logging
+
+ from databao_context_engine.pluginlib.build_plugin import (
+     BuildDatasourcePlugin,
+     BuildFilePlugin,
+     BuildPlugin,
+     DatasourceType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DuplicatePluginTypeError(RuntimeError):
+     """Raised when two plugins register the same <main>/<sub> plugin key."""
+
+
+ PluginList = dict[DatasourceType, BuildPlugin]
+
+
+ def get_all_available_plugin_types(exclude_file_plugins: bool = False) -> set[DatasourceType]:
+     return set(load_plugins(exclude_file_plugins=exclude_file_plugins).keys())
+
+
+ def get_plugin_for_type(datasource_type: DatasourceType) -> BuildPlugin:
+     all_plugins = load_plugins()
+
+     if datasource_type not in all_plugins:
+         raise ValueError(f"No plugin found for type '{datasource_type.full_type}'")
+
+     return all_plugins[datasource_type]
+
+
+ def load_plugins(exclude_file_plugins: bool = False) -> PluginList:
+     """
+     Load both builtin and external plugins and merge them into one registry.
+     """
+     builtin_plugins = _load_builtin_plugins(exclude_file_plugins)
+     external_plugins = _load_external_plugins(exclude_file_plugins)
+     plugins = merge_plugins(builtin_plugins, external_plugins)
+
+     return plugins
+
+
+ def _load_builtin_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
+     all_builtin_plugins: list[BuildPlugin] = []
+
+     all_builtin_plugins += _load_builtin_datasource_plugins()
+
+     if not exclude_file_plugins:
+         all_builtin_plugins += _load_builtin_file_plugins()
+
+     return all_builtin_plugins
+
+
+ def _load_builtin_file_plugins() -> list[BuildFilePlugin]:
+     from databao_context_engine.plugins.unstructured_files_plugin import InternalUnstructuredFilesPlugin
+
+     return [
+         InternalUnstructuredFilesPlugin(),
+     ]
+
+
+ def _load_builtin_datasource_plugins() -> list[BuildDatasourcePlugin]:
+     """
+     Statically register built-in plugins.
+     """
+     from databao_context_engine.plugins.athena_db_plugin import AthenaDbPlugin
+     from databao_context_engine.plugins.clickhouse_db_plugin import ClickhouseDbPlugin
+     from databao_context_engine.plugins.duckdb_db_plugin import DuckDbPlugin
+     from databao_context_engine.plugins.mssql_db_plugin import MSSQLDbPlugin
+     from databao_context_engine.plugins.mysql_db_plugin import MySQLDbPlugin
+     from databao_context_engine.plugins.parquet_plugin import ParquetPlugin
+     from databao_context_engine.plugins.postgresql_db_plugin import PostgresqlDbPlugin
+     from databao_context_engine.plugins.snowflake_db_plugin import SnowflakeDbPlugin
+
+     return [
+         AthenaDbPlugin(),
+         ClickhouseDbPlugin(),
+         DuckDbPlugin(),
+         MSSQLDbPlugin(),
+         MySQLDbPlugin(),
+         PostgresqlDbPlugin(),
+         SnowflakeDbPlugin(),
+         ParquetPlugin(),
+     ]
+
+
+ def _load_external_plugins(exclude_file_plugins: bool = False) -> list[BuildPlugin]:
+     """
+     Discover external plugins via entry points.
+     """
+     # TODO: implement external plugin loading
+     return []
+
+
+ def merge_plugins(*plugin_lists: list[BuildPlugin]) -> PluginList:
+     """
+     Merge multiple plugin lists into a single type-to-plugin registry.
+     """
+     registry: PluginList = {}
+     for plugins in plugin_lists:
+         for plugin in plugins:
+             for full_type in plugin.supported_types():
+                 datasource_type = DatasourceType(full_type=full_type)
+                 if datasource_type in registry:
+                     raise DuplicatePluginTypeError(
+                         f"Plugin type '{datasource_type.full_type}' is provided by both {type(registry[datasource_type]).__name__} and {type(plugin).__name__}"
+                     )
+                 registry[datasource_type] = plugin
+     return registry
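
A small sketch of how this registry is consumed, with names taken from the hunks in this diff (it assumes the optional database drivers behind the built-in plugins are importable):

from databao_context_engine.pluginlib.build_plugin import DatasourceType
from databao_context_engine.plugins.plugin_loader import get_plugin_for_type, load_plugins

registry = load_plugins()
# Keys are DatasourceType wrappers around "<main>/<sub>" strings,
# e.g. "databases/duckdb" and "resources/parquet".
print(sorted(t.full_type for t in registry))

plugin = get_plugin_for_type(DatasourceType(full_type="databases/duckdb"))
print(plugin.name)  # "DuckDB Plugin"

Registering a second plugin that claims an already-taken type raises DuplicatePluginTypeError at merge time, so conflicts surface at load rather than at dispatch.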

databao_context_engine/plugins/postgresql_db_plugin.py
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.postgresql_introspector import PostgresConfigFile, PostgresqlIntrospector
+
+
+ class PostgresqlDbPlugin(BaseDatabasePlugin[PostgresConfigFile]):
+     id = "jetbrains/postgres"
+     name = "PostgreSQL DB Plugin"
+     supported = {"databases/postgres"}
+     config_file_type = PostgresConfigFile
+
+     def __init__(self):
+         super().__init__(PostgresqlIntrospector())

databao_context_engine/plugins/resources/__init__.py (file without changes)

databao_context_engine/plugins/resources/parquet_chunker.py
@@ -0,0 +1,23 @@
+ from dataclasses import dataclass
+
+ from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
+ from databao_context_engine.plugins.resources.parquet_introspector import ParquetColumn, ParquetIntrospectionResult
+
+
+ @dataclass
+ class ParquetColumnChunkContent:
+     file_name: str
+     column: ParquetColumn
+
+
+ def build_parquet_chunks(result: ParquetIntrospectionResult) -> list[EmbeddableChunk]:
+     chunks = []
+     for file in result.files:
+         for column in file.columns:
+             chunks.append(
+                 EmbeddableChunk(
+                     embeddable_text=f"Column [name = {column.name}, type = {column.type}, number of values = {column.num_values}] in parquet file {file.name}",
+                     content=ParquetColumnChunkContent(file_name=file.name, column=column),
+                 )
+             )
+     return chunks
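
For illustration, a hand-built introspection result run through the chunker; all values are made up, but the field names follow the dataclasses defined in parquet_introspector.py below:

from databao_context_engine.plugins.resources.parquet_chunker import build_parquet_chunks
from databao_context_engine.plugins.resources.parquet_introspector import (
    ParquetColumn,
    ParquetFile,
    ParquetIntrospectionResult,
)

# One file with a single column; stats fields mirror parquet_metadata() output.
column = ParquetColumn(
    name="order_id",
    type="INT64",
    row_groups=2,
    num_values=1000,
    stats_min="1",
    stats_max="1000",
    stats_null_count=0,
    stats_distinct_count=1000,
)
result = ParquetIntrospectionResult(files=[ParquetFile(name="orders.parquet", columns=[column])])

chunks = build_parquet_chunks(result)
print(chunks[0].embeddable_text)
# Column [name = order_id, type = INT64, number of values = 1000] in parquet file orders.parquet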

databao_context_engine/plugins/resources/parquet_introspector.py
@@ -0,0 +1,154 @@
+ import contextlib
+ import logging
+ import uuid
+ from collections import defaultdict
+ from dataclasses import dataclass, replace
+ from urllib.parse import urlparse
+
+ import duckdb
+ from duckdb import DuckDBPyConnection
+ from pydantic import BaseModel, Field
+
+ from databao_context_engine.pluginlib.config import DuckDBSecret
+
+ parquet_type = "resources/parquet"
+
+ logger = logging.getLogger(__name__)
+
+
+ class ParquetConfigFile(BaseModel):
+     name: str | None = Field(default=None)
+     type: str = Field(default=parquet_type)
+     url: str = Field(
+         description="Parquet resource location. Should be a valid URL or a path to a local file. "
+         "Examples: s3://your_bucket/file.parquet, s3://your-bucket/*.parquet, https://some.url/some_file.parquet, ~/path_to/file.parquet",
+     )
+     duckdb_secret: DuckDBSecret | None = None
+
+
+ @dataclass
+ class ParquetColumn:
+     name: str
+     type: str
+     row_groups: int
+     num_values: int
+     stats_min: str
+     stats_max: str
+     stats_null_count: int | None
+     stats_distinct_count: int | None
+
+
+ @dataclass
+ class ParquetFile:
+     name: str
+     columns: list[ParquetColumn]
+
+
+ @dataclass
+ class ParquetIntrospectionResult:
+     files: list[ParquetFile]
+
+
+ def generate_create_secret_sql(secret_name, duckdb_secret: DuckDBSecret) -> str:
+     parameters = [("type", duckdb_secret.type)] + list(duckdb_secret.properties.items())
+     return f"""CREATE SECRET {secret_name} (
+         {", ".join([f"{k} {v}" for (k, v) in parameters])}
+     );
+     """
+
+
+ @contextlib.contextmanager
+ def _create_secret(conn: DuckDBPyConnection, duckdb_secret: DuckDBSecret):
+     secret_name = duckdb_secret.name or "gen_secret_" + str(uuid.uuid4()).replace("-", "_")
+     create_secret_sql = generate_create_secret_sql(secret_name, duckdb_secret)
+     try:
+         logger.debug(f"About to create duckdb secret '{secret_name}' with type {duckdb_secret.type}")
+         conn.sql(create_secret_sql)
+         yield conn
+     finally:
+         logger.debug(f"Dropping duckdb secret '{secret_name}'")
+         conn.sql(f"DROP SECRET IF EXISTS {secret_name};")
+
+
+ def _resolve_url(file_config: ParquetConfigFile) -> str:
+     parquet_url = urlparse(file_config.url)
+     if parquet_url.scheme == "file":
+         return parquet_url.netloc
+     return file_config.url
+
+
+ class ParquetIntrospector:
+     @contextlib.contextmanager
+     def _connect(self, file_config: ParquetConfigFile):
+         duckdb_secret = file_config.duckdb_secret
+         with duckdb.connect() as conn:
+             if duckdb_secret is not None:
+                 if duckdb_secret.type == "s3":
+                     conn.execute("INSTALL httpfs;")
+                     conn.execute("LOAD httpfs;")
+                     conn.execute("INSTALL s3;")
+                     conn.execute("LOAD s3;")
+                 with _create_secret(conn, duckdb_secret):
+                     yield conn
+             else:
+                 yield conn
+
+     def check_connection(self, file_config: ParquetConfigFile) -> None:
+         with self._connect(file_config) as conn:
+             with conn.cursor() as cur:
+                 resolved_url = _resolve_url(file_config)
+                 cur.execute(f"SELECT * FROM parquet_file_metadata('{resolved_url}') LIMIT 1")
+                 columns = [desc[0].lower() for desc in cur.description] if cur.description else []
+                 rows = cur.fetchall()
+                 parquet_file_metadata = [dict(zip(columns, row)) for row in rows]
+                 if not parquet_file_metadata:
+                     raise ValueError(f"No parquet files found by url {resolved_url}")
+                 if not parquet_file_metadata[0]["file_name"]:
+                     raise ValueError("Parquet resource introspection failed")
+
+     def introspect(self, file_config: ParquetConfigFile) -> ParquetIntrospectionResult:
+         with self._connect(file_config) as conn:
+             with conn.cursor() as cur:
+                 resolved_url = _resolve_url(file_config)
+                 cur.execute(f"SELECT * FROM parquet_metadata('{resolved_url}')")
+                 cols = [desc[0].lower() for desc in cur.description] if cur.description else []
+                 rows = cur.fetchall()
+                 file_metas = [dict(zip(cols, row)) for row in rows]
+
+         columns_per_file: dict[str, dict[int, ParquetColumn]] = defaultdict(dict)
+         for file_meta in file_metas:
+             file_name = file_meta["file_name"]
+             column_id = file_meta["column_id"]
+             column_name = file_meta["path_in_schema"]
+             column_type = file_meta.get("type") or ""
+             num_values: int = file_meta["num_values"]
+             stats_min = file_meta.get("stats_min") or ""
+             stats_max = file_meta.get("stats_max") or ""
+             stats_null_count: int | None = file_meta.get("stats_null_count")
+             stats_distinct_count: int | None = file_meta.get("stats_distinct_count")
+
+             columns: dict[int, ParquetColumn] = columns_per_file[file_name]
+             column: ParquetColumn | None = columns.get(column_id)
+             if column:
+                 columns[column_id] = replace(
+                     column, num_values=column.num_values + num_values, row_groups=column.row_groups + 1
+                 )
+             else:
+                 columns[column_id] = ParquetColumn(
+                     name=column_name,
+                     type=column_type,
+                     row_groups=1,
+                     num_values=num_values,
+                     stats_min=stats_min,
+                     stats_max=stats_max,
+                     stats_null_count=stats_null_count,
+                     stats_distinct_count=stats_distinct_count,
+                 )
+
+         return ParquetIntrospectionResult(
+             files=[
+                 ParquetFile(file_name, columns=list(columns.values()))
+                 for (file_name, columns) in columns_per_file.items()
+             ]
+         )
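
The secret handling deserves a note: for remote URLs, credentials are materialised as a DuckDB secret scoped to the session and dropped in the finally block, so nothing persists past the introspection run. A sketch of the DDL generate_create_secret_sql emits; the DuckDBSecret constructor arguments are an assumption (only its name/type/properties attributes are visible in this diff), and the key values are placeholders:

from databao_context_engine.pluginlib.config import DuckDBSecret
from databao_context_engine.plugins.resources.parquet_introspector import generate_create_secret_sql

# Hypothetical secret; note the caller supplies any quoting the values need.
secret = DuckDBSecret(name="s3_example", type="s3", properties={"KEY_ID": "'abc'", "SECRET": "'xyz'"})
print(generate_create_secret_sql(secret.name, secret))
# Prints, modulo whitespace:
# CREATE SECRET s3_example (
#     type s3, KEY_ID 'abc', SECRET 'xyz'
# );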

databao_context_engine/plugins/snowflake_db_plugin.py
@@ -0,0 +1,12 @@
+ from databao_context_engine.plugins.base_db_plugin import BaseDatabasePlugin
+ from databao_context_engine.plugins.databases.snowflake_introspector import SnowflakeConfigFile, SnowflakeIntrospector
+
+
+ class SnowflakeDbPlugin(BaseDatabasePlugin[SnowflakeConfigFile]):
+     id = "jetbrains/snowflake"
+     name = "Snowflake DB Plugin"
+     supported = {"databases/snowflake"}
+     config_file_type = SnowflakeConfigFile
+
+     def __init__(self):
+         super().__init__(SnowflakeIntrospector())
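
Tying this last hunk back to the introspector at the top of the diff, a hedged end-to-end sketch: the connection mapping is forwarded verbatim to snowflake.connector.connect, and the check_connection signature is mirrored from ParquetPlugin above (it is an assumption that BaseDatabasePlugin exposes the same shape). All credential values are placeholders:

from databao_context_engine.plugins.databases.snowflake_introspector import SnowflakeConfigFile
from databao_context_engine.plugins.snowflake_db_plugin import SnowflakeDbPlugin

plugin = SnowflakeDbPlugin()
config = SnowflakeConfigFile(
    # Any key accepted by snowflake.connector.connect may appear here;
    # pinning "database" restricts introspection to that single catalog.
    connection={"account": "my_account", "user": "me", "password": "...", "database": "ANALYTICS"}
)
plugin.check_connection("databases/snowflake", "my_snowflake", config)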