databao-context-engine 0.1.4.dev1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. databao_context_engine/__init__.py +14 -1
  2. databao_context_engine/build_sources/build_runner.py +7 -7
  3. databao_context_engine/build_sources/build_wiring.py +8 -10
  4. databao_context_engine/build_sources/plugin_execution.py +9 -12
  5. databao_context_engine/cli/add_datasource_config.py +9 -30
  6. databao_context_engine/cli/commands.py +29 -13
  7. databao_context_engine/databao_context_engine.py +3 -13
  8. databao_context_engine/databao_context_project_manager.py +56 -29
  9. databao_context_engine/datasources/check_config.py +13 -16
  10. databao_context_engine/datasources/datasource_context.py +21 -24
  11. databao_context_engine/datasources/datasource_discovery.py +45 -44
  12. databao_context_engine/datasources/types.py +53 -42
  13. databao_context_engine/generate_configs_schemas.py +4 -5
  14. databao_context_engine/introspection/property_extract.py +52 -47
  15. databao_context_engine/llm/__init__.py +10 -0
  16. databao_context_engine/llm/api.py +57 -0
  17. databao_context_engine/llm/descriptions/ollama.py +1 -3
  18. databao_context_engine/llm/factory.py +5 -2
  19. databao_context_engine/llm/install.py +13 -10
  20. databao_context_engine/llm/runtime.py +3 -5
  21. databao_context_engine/mcp/mcp_server.py +1 -3
  22. databao_context_engine/plugin_loader.py +6 -7
  23. databao_context_engine/pluginlib/build_plugin.py +0 -33
  24. databao_context_engine/plugins/databases/athena/__init__.py +0 -0
  25. databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
  26. databao_context_engine/plugins/databases/{athena_introspector.py → athena/athena_introspector.py} +2 -5
  27. databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +1 -3
  28. databao_context_engine/plugins/databases/base_introspector.py +11 -14
  29. databao_context_engine/plugins/databases/clickhouse/__init__.py +0 -0
  30. databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
  31. databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +2 -5
  32. databao_context_engine/plugins/databases/duckdb/__init__.py +0 -0
  33. databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
  34. databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +4 -7
  35. databao_context_engine/plugins/databases/mssql/__init__.py +0 -0
  36. databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
  37. databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +9 -10
  38. databao_context_engine/plugins/databases/mysql/__init__.py +0 -0
  39. databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
  40. databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +8 -8
  41. databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
  42. databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
  43. databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +9 -16
  44. databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
  45. databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
  46. databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +8 -9
  47. databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
  48. databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
  49. databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
  50. databao_context_engine/plugins/dbt/__init__.py +0 -0
  51. databao_context_engine/plugins/dbt/dbt_chunker.py +47 -0
  52. databao_context_engine/plugins/dbt/dbt_context_extractor.py +106 -0
  53. databao_context_engine/plugins/dbt/dbt_plugin.py +25 -0
  54. databao_context_engine/plugins/dbt/types.py +44 -0
  55. databao_context_engine/plugins/dbt/types_artifacts.py +58 -0
  56. databao_context_engine/plugins/files/__init__.py +0 -0
  57. databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
  58. databao_context_engine/plugins/plugin_loader.py +13 -15
  59. databao_context_engine/plugins/resources/parquet_introspector.py +1 -1
  60. databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
  61. databao_context_engine/project/layout.py +12 -13
  62. databao_context_engine/retrieve_embeddings/retrieve_runner.py +3 -16
  63. databao_context_engine/retrieve_embeddings/retrieve_service.py +13 -6
  64. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +4 -7
  65. databao_context_engine/serialization/yaml.py +5 -5
  66. databao_context_engine/storage/migrate.py +1 -1
  67. databao_context_engine/storage/repositories/vector_search_repository.py +18 -6
  68. databao_context_engine-0.1.6.dist-info/METADATA +228 -0
  69. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/RECORD +71 -55
  70. databao_context_engine/datasources/add_config.py +0 -34
  71. databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
  72. databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
  73. databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
  74. databao_context_engine/retrieve_embeddings/export_results.py +0 -12
  75. databao_context_engine-0.1.4.dev1.dist-info/METADATA +0 -75
  76. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/WHEEL +0 -0
  77. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/entry_points.txt +0 -0
@@ -5,7 +5,7 @@ import asyncpg
5
5
  from pydantic import BaseModel, Field
6
6
 
7
7
  from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
8
- from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
8
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
9
9
  from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
10
10
  from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
11
11
  from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
@@ -81,12 +81,14 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
81
81
  if self.supports_catalogs:
82
82
  sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY($1)"
83
83
  return SQLQuery(sql, (catalogs,))
84
- else:
85
- sql = "SELECT schema_name FROM information_schema.schemata"
86
- return SQLQuery(sql, None)
87
84
 
88
- def _connect(self, file_config: PostgresConfigFile):
85
+ sql = "SELECT schema_name FROM information_schema.schemata"
86
+ return SQLQuery(sql, None)
87
+
88
+ def _connect(self, file_config: PostgresConfigFile, *, catalog: str | None = None):
89
89
  kwargs = self._create_connection_kwargs(file_config.connection)
90
+ if catalog:
91
+ kwargs["database"] = catalog
90
92
  return _SyncAsyncpgConnection(kwargs)
91
93
 
92
94
  def _fetchall_dicts(self, connection: _SyncAsyncpgConnection, sql: str, params) -> list[dict]:
@@ -97,13 +99,7 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
97
99
  if database is not None:
98
100
  return [database]
99
101
 
100
- rows = connection.fetch_scalar_values("SELECT datname FROM pg_catalog.pg_database WHERE datistemplate = false")
101
- return rows
102
-
103
- def _connect_to_catalog(self, file_config: PostgresConfigFile, catalog: str):
104
- cfg = file_config.model_copy(deep=True)
105
- cfg.connection.database = catalog
106
- return self._connect(cfg)
102
+ return connection.fetch_scalar_values("SELECT datname FROM pg_catalog.pg_database WHERE datistemplate = false")
107
103
 
108
104
  def collect_catalog_model(
109
105
  self, connection: _SyncAsyncpgConnection, catalog: str, schemas: list[str]
@@ -408,10 +404,7 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
408
404
  }
409
405
  connection_parts.update(connection_config.additional_properties)
410
406
 
411
- connection_string = " ".join(
412
- f"{k}={_escape_pg_value(str(v))}" for k, v in connection_parts.items() if v is not None
413
- )
414
- return connection_string
407
+ return " ".join(f"{k}={_escape_pg_value(str(v))}" for k, v in connection_parts.items() if v is not None)
415
408
 
416
409
  def _create_connection_kwargs(self, connection_config: PostgresConnectionProperties) -> dict[str, Any]:
417
410
  kwargs: dict[str, Any] = {
@@ -0,0 +1,15 @@
1
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
2
+ from databao_context_engine.plugins.databases.snowflake.snowflake_introspector import (
3
+ SnowflakeConfigFile,
4
+ SnowflakeIntrospector,
5
+ )
6
+
7
+
8
class SnowflakeDbPlugin(BaseDatabasePlugin[SnowflakeConfigFile]):
    """Database plugin that binds the Snowflake introspector to the shared base plugin."""

    id = "jetbrains/snowflake"
    name = "Snowflake DB Plugin"
    supported = {"snowflake"}
    config_file_type = SnowflakeConfigFile

    def __init__(self):
        """Initialise the base plugin with a freshly created ``SnowflakeIntrospector``."""
        introspector = SnowflakeIntrospector()
        super().__init__(introspector)
@@ -7,7 +7,7 @@ from pydantic import BaseModel, Field
7
7
  from snowflake.connector import DictCursor
8
8
 
9
9
  from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
10
- from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
10
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
11
11
  from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
12
12
  from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
13
13
  from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
@@ -20,7 +20,7 @@ class SnowflakePasswordAuth(BaseModel):
20
20
  class SnowflakeKeyPairAuth(BaseModel):
21
21
  private_key_file: str | None = None
22
22
  private_key_file_pwd: str | None = None
23
- private_key: Annotated[str, ConfigPropertyAnnotation(secret=True)]
23
+ private_key: Annotated[str | None, ConfigPropertyAnnotation(secret=True)] = None
24
24
 
25
25
 
26
26
  class SnowflakeSSOAuth(BaseModel):
@@ -61,18 +61,17 @@ class SnowflakeIntrospector(BaseIntrospector[SnowflakeConfigFile]):
61
61
  _IGNORED_CATALOGS = {"STREAMLIT_APPS"}
62
62
  supports_catalogs = True
63
63
 
64
- def _connect(self, file_config: SnowflakeConfigFile):
64
+ def _connect(self, file_config: SnowflakeConfigFile, *, catalog: str | None = None):
65
65
  connection = file_config.connection
66
66
  snowflake.connector.paramstyle = "qmark"
67
+ connection_kwargs = connection.to_snowflake_kwargs()
68
+ if catalog:
69
+ connection_kwargs["database"] = catalog
70
+
67
71
  return snowflake.connector.connect(
68
- **connection.to_snowflake_kwargs(),
72
+ **connection_kwargs,
69
73
  )
70
74
 
71
- def _connect_to_catalog(self, file_config: SnowflakeConfigFile, catalog: str):
72
- cfg = dict(file_config.connection or {})
73
- cfg["database"] = catalog
74
- return snowflake.connector.connect(**cfg)
75
-
76
75
  def _get_catalogs(self, connection, file_config: SnowflakeConfigFile) -> list[str]:
77
76
  database = file_config.connection.database
78
77
  if database:
@@ -0,0 +1,12 @@
1
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
2
+ from databao_context_engine.plugins.databases.sqlite.sqlite_introspector import SQLiteConfigFile, SQLiteIntrospector
3
+
4
+
5
class SQLiteDbPlugin(BaseDatabasePlugin[SQLiteConfigFile]):
    """Database plugin that binds the SQLite introspector to the shared base plugin."""

    id = "jetbrains/sqlite"
    name = "SQLite Plugin"
    supported = {"sqlite"}
    config_file_type = SQLiteConfigFile

    def __init__(self):
        """Initialise the base plugin with a freshly created ``SQLiteIntrospector``."""
        introspector = SQLiteIntrospector()
        super().__init__(introspector)
@@ -0,0 +1,241 @@
1
+ import sqlite3
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
6
+ from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
7
+ from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
8
+ from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
9
+
10
+
11
# Connection settings of a SQLite datasource: the only thing required is the database file location.
# (Documented with comments rather than a docstring so the model's generated schema is unaffected.)
class SQLiteConnectionConfig(BaseModel):
    # Stored as a plain string; SQLiteIntrospector._connect passes it to sqlite3.connect().
    database_path: str = Field(description="Path to the SQLite database file")
13
+
14
+
15
# Datasource config file for SQLite: the base database config plus SQLite connection settings.
class SQLiteConfigFile(BaseDatabaseConfigFile):
    # Datasource type tag; defaults to "sqlite".
    type: str = Field(default="sqlite")
    # Where to find the database file.
    connection: SQLiteConnectionConfig
18
+
19
+
20
class SQLiteIntrospector(BaseIntrospector[SQLiteConfigFile]):
    """Introspector for SQLite database files.

    SQLite is reported without catalog support, and every object is exposed
    under the single pseudo schema ``_PSEUDO_SCHEMA`` ("main"). All metadata
    is read from ``sqlite_master`` joined with SQLite's ``pragma_*``
    table-valued functions.
    """

    _IGNORED_SCHEMAS = {"temp", "information_schema"}
    _PSEUDO_SCHEMA = "main"
    supports_catalogs = False

    def _connect(self, file_config: SQLiteConfigFile, *, catalog: str | None = None):
        """Open a ``sqlite3`` connection to the configured database file.

        ``catalog`` is accepted for signature compatibility and is not used.
        """
        # NOTE(review): sqlite3.connect() creates the file when it does not exist, so a
        # mistyped path presumably yields an empty database instead of an error - confirm
        # whether that is acceptable for connection checks.
        database_path = str(file_config.connection.database_path)
        conn = sqlite3.connect(database_path)
        conn.text_factory = str
        return conn

    def _get_catalogs(self, connection, file_config: SQLiteConfigFile) -> list[str]:
        """Return the single pseudo catalog name resolved from the config file."""
        return [self._resolve_pseudo_catalog_name(file_config)]

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        """Execute ``sql`` and return every row as a dict keyed by lower-cased column name.

        Args:
            connection: An open ``sqlite3`` connection.
            sql: The statement to execute.
            params: Bind parameters, or ``None`` to execute without parameters.

        Returns:
            One dict per row; an empty list when the statement produces no result set.
        """
        cur = connection.cursor()
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)

            if cur.description is None:
                return []

            # Column names are the same for every row, so compute them once.
            cols = [d[0].lower() for d in cur.description]
            out: list[dict] = []
            for r in cur.fetchall():
                if isinstance(r, sqlite3.Row):
                    out.append({k.lower(): r[k] for k in r.keys()})
                else:
                    out.append(dict(zip(cols, r)))
            return out
        finally:
            # Release the cursor even when execution or fetching fails.
            cur.close()

    def _list_schemas_for_catalog(self, connection, catalog: str) -> list[str]:
        """Return the pseudo schema as the only schema of any catalog."""
        return [self._PSEUDO_SCHEMA]

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        """Run all component queries and assemble the schema model.

        Returns an empty list when ``schemas`` is empty. Otherwise the model is
        always built for the pseudo schema only; check constraints and
        partitions are passed as empty lists.
        """
        if not schemas:
            return []

        results: dict[str, list[dict]] = {
            name: self._fetchall_dicts(connection, sql, None) or []
            for name, sql in self._component_queries().items()
        }

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=[self._PSEUDO_SCHEMA],
            rels=results.get("relations", []),
            cols=results.get("columns", []),
            pk_cols=results.get("pk", []),
            uq_cols=results.get("uq", []),
            checks=[],
            fk_cols=results.get("fks", []),
            idx_cols=results.get("idx", []),
            partitions=[],
        )

    def _component_queries(self) -> dict[str, str]:
        """Map each model component name to the SQL that fetches it."""
        return {
            "relations": self._sql_relations(),
            "columns": self._sql_columns(),
            "pk": self._sql_primary_keys(),
            "uq": self._sql_unique(),
            "fks": self._sql_foreign_keys(),
            "idx": self._sql_indexes(),
        }

    def _sql_relations(self) -> str:
        """SQL listing tables and views, excluding names starting with ``sqlite_``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                CASE m.type
                    WHEN 'view' THEN 'view'
                    ELSE 'table'
                END AS kind,
                NULL AS description
            FROM
                sqlite_master m
            WHERE
                m.type IN ('table', 'view')
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name;
        """

    def _sql_columns(self) -> str:
        """SQL listing the columns of every table and view via ``pragma_table_xinfo``."""
        # Primary-key columns are reported as not nullable regardless of their NOT NULL flag.
        # NOTE(review): hidden values 2 and 3 presumably denote generated columns - confirm
        # against the SQLite table_xinfo documentation.
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                c.name AS column_name,
                (c.cid + 1) AS ordinal_position,
                COALESCE(c.type,'') AS data_type,
                CASE
                    WHEN c.pk > 0 THEN 0
                    WHEN c."notnull" = 0 THEN 1
                    ELSE 0
                END AS is_nullable,
                c.dflt_value AS default_expression,
                CASE
                    WHEN c.hidden IN (2,3) THEN 'computed'
                END AS generated,
                NULL AS description
            FROM
                sqlite_master m
                JOIN pragma_table_xinfo(m.name) c
            WHERE
                m.type IN ('table','view')
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name,
                c.cid;
        """

    def _sql_primary_keys(self) -> str:
        """SQL listing primary-key columns; the constraint name is synthesised as ``pk_<table>``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('pk_' || m.name) AS constraint_name,
                c.pk AS position,
                c.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_table_info(m.name) c
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
                AND c.pk > 0
            ORDER BY
                m.name,
                c.pk;
        """

    def _sql_unique(self) -> str:
        """SQL listing the columns of unique indexes whose origin is ``'u'``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS constraint_name,
                (ii.seqno + 1) AS position,
                ii.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_info(il.name) ii
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
                AND il."unique" = 1
                AND il.origin = 'u'
            ORDER BY
                m.name,
                il.name,
                ii.seqno;
        """

    def _sql_foreign_keys(self) -> str:
        """SQL listing foreign keys; the constraint name is synthesised as ``fk_<table>_<id>``."""
        # The referenced schema uses the same pseudo schema constant as every other query
        # instead of a separately hard-coded literal.
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('fk_' || m.name || '_' || fk.id) AS constraint_name,
                (fk.seq + 1) AS position,
                fk."from" AS from_column,
                '{self._PSEUDO_SCHEMA}' AS ref_schema,
                fk."table" AS ref_table,
                fk."to" AS to_column,
                lower(fk.on_update) AS on_update,
                lower(fk.on_delete) AS on_delete,
                1 AS enforced,
                1 AS validated
            FROM sqlite_master m
                JOIN pragma_foreign_key_list(m.name) fk
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name,
                fk.id,
                fk.seq;
        """

    def _sql_indexes(self) -> str:
        """SQL listing key columns of indexes whose origin is ``'c'``.

        For partial indexes the predicate is taken as the text following the
        first ``WHERE`` found in the index's stored SQL.
        """
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS index_name,
                (ix.seqno + 1) AS position,
                ix.name AS expr,
                il."unique" AS is_unique,
                NULL AS method,
                CASE
                    WHEN il.partial = 1 AND sm.sql IS NOT NULL AND instr(upper(sm.sql), 'WHERE') > 0
                    THEN trim(substr(sm.sql, instr(upper(sm.sql), 'WHERE') + length('WHERE')))
                END AS predicate
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_xinfo(il.name) ix
                LEFT JOIN sqlite_master sm ON sm.type = 'index' AND sm.name = il.name
            WHERE
                m.type='table'
                AND m.name NOT LIKE 'sqlite_%'
                AND lower(il.origin) = 'c'
                AND ix.key = 1
            ORDER BY
                m.name,
                il.name,
                ix.seqno;
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        """Build a parameterised query returning at most ``limit`` rows of ``table``.

        ``catalog`` and ``schema`` are not used; only the quoted table name appears in the SQL.
        """
        sql = f"SELECT * FROM {self._quote_ident(table)} LIMIT ?"
        return SQLQuery(sql, (limit,))

    def _quote_ident(self, ident: str) -> str:
        """Wrap ``ident`` in double quotes, doubling any embedded double quote."""
        return '"' + str(ident).replace('"', '""') + '"'
File without changes
@@ -0,0 +1,47 @@
1
+ from dataclasses import dataclass
2
+
3
+ from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
4
+ from databao_context_engine.plugins.dbt.types import DbtColumn, DbtContext, DbtModel
5
+
6
+
7
@dataclass
class DbtColumnChunkContent:
    """Content attached to a column chunk: the column plus the location of its owning model."""

    # Database, schema and name are copied from the owning DbtModel by _create_column_chunk.
    database_name: str
    schema_name: str
    model_name: str
    column: DbtColumn
13
+
14
+
15
def build_dbt_chunks(context: DbtContext) -> list[EmbeddableChunk]:
    """Flatten a dbt context into chunks: each model's chunk followed by one chunk per column."""
    result: list[EmbeddableChunk] = []

    for dbt_model in context.models:
        result.append(_create_model_chunk(dbt_model))
        result.extend(_create_column_chunk(dbt_model, dbt_column) for dbt_column in dbt_model.columns)

    return result
25
+
26
+
27
def _create_model_chunk(model: DbtModel) -> EmbeddableChunk:
    """Wrap a model in a chunk whose content is the model itself."""
    text = _build_model_chunk_text(model)
    return EmbeddableChunk(embeddable_text=text, content=model)
29
+
30
+
31
def _build_model_chunk_text(model: DbtModel) -> str:
    """Return the sentence embedded for a model: its name, database, schema and unique id."""
    # TODO: Use description and potentially other infos?
    text = f"Model {model.name} in database {model.database} and schema {model.schema}, with unique id {model.id}"
    return text
34
+
35
+
36
def _create_column_chunk(model: DbtModel, column: DbtColumn) -> EmbeddableChunk:
    """Wrap a column in a chunk that also records where its owning model lives."""
    chunk_content = DbtColumnChunkContent(
        database_name=model.database,
        schema_name=model.schema,
        model_name=model.name,
        column=column,
    )
    chunk_text = _build_column_chunk_text(model, column)
    return EmbeddableChunk(embeddable_text=chunk_text, content=chunk_content)
43
+
44
+
45
def _build_column_chunk_text(model: DbtModel, column: DbtColumn) -> str:
    """Return the sentence embedded for a column: its name and the unique id of its model."""
    # TODO: Use description and potentially other infos?
    text = f"Column {column.name} in model {model.id}"
    return text
@@ -0,0 +1,106 @@
1
+ from pathlib import Path
2
+
3
+ from databao_context_engine.plugins.dbt.types import (
4
+ DbtColumn,
5
+ DbtConfigFile,
6
+ DbtContext,
7
+ DbtMaterialization,
8
+ DbtModel,
9
+ )
10
+ from databao_context_engine.plugins.dbt.types_artifacts import (
11
+ DbtArtifacts,
12
+ DbtCatalog,
13
+ DbtCatalogColumn,
14
+ DbtCatalogNode,
15
+ DbtManifest,
16
+ DbtManifestColumn,
17
+ DbtManifestModel,
18
+ )
19
+
20
+
21
def check_connection(config_file: DbtConfigFile) -> None:
    """Check the datasource by loading its dbt artifacts and discarding the result.

    Any error raised while reading the artifacts propagates to the caller.
    """
    target_folder = config_file.dbt_target_folder_path.expanduser()
    _read_dbt_artifacts(target_folder)
23
+
24
+
25
def extract_context(config_file: DbtConfigFile) -> DbtContext:
    """Read the dbt artifacts from the configured target folder and convert them to a context."""
    target_folder = config_file.dbt_target_folder_path.expanduser()
    return _extract_context_from_artifacts(_read_dbt_artifacts(target_folder))
29
+
30
+
31
def _read_dbt_artifacts(dbt_target_folder_path: Path) -> DbtArtifacts:
    """Load ``manifest.json`` and, when present, ``catalog.json`` from a dbt target folder.

    Args:
        dbt_target_folder_path: Folder expected to contain the dbt artifacts.

    Returns:
        The parsed manifest together with the parsed catalog, or ``None`` for
        the catalog when ``catalog.json`` does not exist.

    Raises:
        ValueError: If the path is not a directory or ``manifest.json`` is missing.
    """
    if not dbt_target_folder_path.is_dir():
        raise ValueError(f'Invalid "dbt_target_folder_path": not a directory ({dbt_target_folder_path})')

    # TODO: Check the manifest schema version?
    manifest_file = dbt_target_folder_path.joinpath("manifest.json")
    if not manifest_file.is_file():
        raise ValueError(f'Invalid "dbt_target_folder_path": missing manifest.json file ({manifest_file})')

    # Decode explicitly as UTF-8: without an encoding, read_text() falls back to the
    # platform locale encoding, which can mis-decode non-ASCII JSON content.
    manifest = DbtManifest.model_validate_json(manifest_file.read_text(encoding="utf-8"))

    catalog_file = dbt_target_folder_path.joinpath("catalog.json")
    catalog = (
        DbtCatalog.model_validate_json(catalog_file.read_text(encoding="utf-8")) if catalog_file.is_file() else None
    )

    return DbtArtifacts(manifest=manifest, catalog=catalog)
46
+
47
+
48
def _extract_context_from_artifacts(artifacts: DbtArtifacts) -> DbtContext:
    """Convert every manifest node that is a model into a ``DbtModel``.

    Each model is paired with the catalog node sharing its unique id, when a
    catalog is available; other manifest node kinds are skipped.
    """
    catalog_nodes = artifacts.catalog.nodes if artifacts.catalog else {}

    # TODO: Extract the stages? Or at least the "highest-level" models (= marts?)
    # TODO: Extract the constraints
    # TODO: Organize the models by schemas? Or by stages?
    models = []
    for node in artifacts.manifest.nodes.values():
        if not isinstance(node, DbtManifestModel):
            continue
        models.append(_manifest_model_to_dbt_model(node, catalog_nodes.get(node.unique_id)))

    return DbtContext(models=models)
66
+
67
+
68
def _manifest_model_to_dbt_model(manifest_model: DbtManifestModel, catalog_node: DbtCatalogNode | None) -> DbtModel:
    """Build a ``DbtModel`` from a manifest model, enriched by its catalog node when given."""
    catalog_columns = catalog_node.columns if catalog_node else {}

    # Catalog columns are matched to manifest columns by column name.
    columns = [
        _manifest_column_to_dbt_column(column, catalog_columns.get(column.name))
        for column in manifest_model.columns.values()
    ]

    materialized = manifest_model.config.materialized if manifest_model.config else None

    if manifest_model.depends_on:
        depends_on_nodes = manifest_model.depends_on.get("nodes", [])
    else:
        depends_on_nodes = []

    return DbtModel(
        id=manifest_model.unique_id,
        name=manifest_model.name,
        database=manifest_model.database,
        schema=manifest_model.schema_,
        description=manifest_model.description,
        columns=columns,
        materialization=_manifest_materialization_to_dbt_materializaton(materialized),
        primary_key=manifest_model.primary_key,
        depends_on_nodes=depends_on_nodes,
    )
87
+
88
+
89
def _manifest_column_to_dbt_column(
    manifest_column: DbtManifestColumn, catalog_column: DbtCatalogColumn | None
) -> DbtColumn:
    """Build a ``DbtColumn``; the catalog type wins over the manifest ``data_type`` when available."""
    if catalog_column:
        column_type = catalog_column.type
    else:
        column_type = manifest_column.data_type

    return DbtColumn(
        name=manifest_column.name,
        description=manifest_column.description,
        type=column_type,
    )
97
+
98
+
99
def _manifest_materialization_to_dbt_materializaton(materialized: str | None) -> DbtMaterialization | None:
    """Map a manifest materialization string to the enum; ``None`` or unknown values give ``None``."""
    if materialized is not None:
        try:
            return DbtMaterialization(materialized)
        except ValueError:
            # Not one of the enum's values: fall through and report no materialization.
            pass

    return None
@@ -0,0 +1,25 @@
1
+ from typing import Any
2
+
3
+ from databao_context_engine import BuildDatasourcePlugin
4
+ from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
5
+ from databao_context_engine.plugins.dbt.dbt_chunker import build_dbt_chunks
6
+ from databao_context_engine.plugins.dbt.dbt_context_extractor import check_connection, extract_context
7
+ from databao_context_engine.plugins.dbt.types import DbtConfigFile
8
+
9
+
10
class DbtPlugin(BuildDatasourcePlugin[DbtConfigFile]):
    """Datasource plugin for the "dbt" type, delegating to the dbt extractor and chunker modules."""

    id = "jetbrains/dbt"
    name = "Dbt Plugin"
    config_file_type = DbtConfigFile

    def supported_types(self) -> set[str]:
        """Return the datasource types handled by this plugin."""
        return {"dbt"}

    def build_context(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> Any:
        """Extract the dbt context described by ``file_config``; the other arguments are unused."""
        context = extract_context(file_config)
        return context

    def check_connection(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> None:
        """Run the module-level ``check_connection`` on ``file_config``; the other arguments are unused."""
        check_connection(file_config)

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        """Split a dbt context into embeddable chunks."""
        chunks = build_dbt_chunks(context)
        return chunks
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
# Datasource config file for dbt projects.
# (Documented with comments rather than a docstring so the model's generated schema is unaffected.)
class DbtConfigFile(BaseModel):
    # Optional datasource name.
    name: str | None = Field(default=None)
    # Datasource type tag; defaults to "dbt".
    type: str = Field(default="dbt")
    # Folder holding the dbt artifacts; the extractor calls expanduser() on it and requires
    # a manifest.json inside it.
    dbt_target_folder_path: Path
12
+
13
+
14
class DbtMaterialization(str, Enum):
    """Materializations recognised for a dbt model; members are also plain strings."""

    TABLE = "table"
    VIEW = "view"

    def __str__(self):
        """Return the raw value (e.g. ``"table"``) instead of the qualified member name."""
        return self.value
20
+
21
+
22
@dataclass(kw_only=True)
class DbtColumn:
    """A column of a dbt model; all fields must be passed by keyword."""

    name: str
    # Taken from the catalog column when one exists, otherwise from the manifest data_type.
    type: str | None = None
    description: str | None = None
27
+
28
+
29
@dataclass(kw_only=True)
class DbtModel:
    """A dbt model as exposed in the extracted context; all fields must be passed by keyword."""

    # The manifest unique_id of the model.
    id: str
    name: str
    database: str
    schema: str
    columns: list[DbtColumn]
    description: str | None = None
    # None when the manifest gives no materialization or one the enum does not list.
    materialization: DbtMaterialization | None = None
    primary_key: list[str] | None = None
    # Required even though it follows defaulted fields; kw_only=True makes that legal.
    depends_on_nodes: list[str]
40
+
41
+
42
@dataclass(kw_only=True)
class DbtContext:
    """The context extracted from a dbt project: the list of its models."""

    models: list[DbtModel]
@@ -0,0 +1,58 @@
1
+ from dataclasses import dataclass
2
+ from typing import Annotated, Literal
3
+
4
+ from pydantic import BaseModel, Discriminator, Field
5
+
6
+
7
# The part of a manifest node's "config" object that is parsed here.
class DbtManifestNodeConfig(BaseModel):
    # Raw materialization string; the extractor later maps it to DbtMaterialization.
    materialized: str
9
+
10
+
11
# A column entry of a manifest model.
class DbtManifestColumn(BaseModel):
    name: str
    description: str | None = None
    # Used as the column type only when the catalog has no entry for this column.
    data_type: str | None = None
15
+
16
+
17
# Manifest node whose resource_type is "model"; the only node kind the extractor converts.
class DbtManifestModel(BaseModel):
    # Discriminator value selecting this class within DbtManifestNode.
    resource_type: Literal["model"]
    unique_id: str
    name: str
    database: str
    # Populated from the "schema" key of the manifest via the alias.
    schema_: str = Field(alias="schema")
    description: str | None = None
    config: DbtManifestNodeConfig | None = None
    # The extractor matches these columns to catalog columns by their name.
    columns: dict[str, DbtManifestColumn]
    # The extractor reads the "nodes" entry of this mapping.
    depends_on: dict[str, list[str]] | None = None
    primary_key: list[str] | None = None
28
+
29
+
30
# Manifest node of any listed non-model kind; only its resource_type is declared and the
# extractor skips these nodes.
class DbtManifestOtherNode(BaseModel):
    resource_type: Literal["seed", "analysis", "test", "operation", "sql_operation", "snapshot"]
32
+
33
+
34
# Union of the manifest node classes, with "resource_type" deciding which class is used.
DbtManifestNode = Annotated[DbtManifestModel | DbtManifestOtherNode, Discriminator("resource_type")]
35
+
36
+
37
# The part of manifest.json that is parsed here: its "nodes" mapping.
class DbtManifest(BaseModel):
    # NOTE(review): keys are presumably the nodes' unique ids - confirm against the dbt manifest schema.
    nodes: dict[str, DbtManifestNode]
39
+
40
+
41
# A column entry of a catalog node.
class DbtCatalogColumn(BaseModel):
    name: str
    # Preferred over the manifest data_type when building a DbtColumn.
    type: str
44
+
45
+
46
# A node of catalog.json; the extractor uses only its columns.
class DbtCatalogNode(BaseModel):
    unique_id: str | None = None
    # Looked up by the manifest column's name.
    columns: dict[str, DbtCatalogColumn]
49
+
50
+
51
# The part of catalog.json that is parsed here: its "nodes" mapping.
class DbtCatalog(BaseModel):
    # Looked up by the manifest model's unique_id.
    nodes: dict[str, DbtCatalogNode]
53
+
54
+
55
@dataclass(kw_only=True)
class DbtArtifacts:
    """The parsed dbt artifacts read from a target folder."""

    manifest: DbtManifest
    # None when the target folder has no catalog.json.
    catalog: DbtCatalog | None
File without changes
@@ -24,7 +24,7 @@ class InternalUnstructuredFilesPlugin(BuildFilePlugin):
24
24
  self.tokens_overlap = tokens_overlap or self._DEFAULT_TOKENS_OVERLAP
25
25
 
26
26
  def supported_types(self) -> set[str]:
27
- return {f"files/{extension}" for extension in self._SUPPORTED_FILES_EXTENSIONS}
27
+ return self._SUPPORTED_FILES_EXTENSIONS
28
28
 
29
29
  def build_file_context(self, full_type: str, file_name: str, file_buffer: BufferedReader) -> Any:
30
30
  file_content = self._read_file(file_buffer)