databao-context-engine 0.1.4.dev1__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +14 -1
- databao_context_engine/build_sources/build_runner.py +7 -7
- databao_context_engine/build_sources/build_wiring.py +8 -10
- databao_context_engine/build_sources/plugin_execution.py +9 -12
- databao_context_engine/cli/add_datasource_config.py +9 -30
- databao_context_engine/cli/commands.py +29 -13
- databao_context_engine/databao_context_engine.py +3 -13
- databao_context_engine/databao_context_project_manager.py +56 -29
- databao_context_engine/datasources/check_config.py +13 -16
- databao_context_engine/datasources/datasource_context.py +21 -24
- databao_context_engine/datasources/datasource_discovery.py +45 -44
- databao_context_engine/datasources/types.py +53 -42
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/introspection/property_extract.py +52 -47
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +13 -10
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/mcp/mcp_server.py +1 -3
- databao_context_engine/plugin_loader.py +6 -7
- databao_context_engine/pluginlib/build_plugin.py +0 -33
- databao_context_engine/plugins/databases/athena/__init__.py +0 -0
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{athena_introspector.py → athena/athena_introspector.py} +2 -5
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +1 -3
- databao_context_engine/plugins/databases/base_introspector.py +11 -14
- databao_context_engine/plugins/databases/clickhouse/__init__.py +0 -0
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +2 -5
- databao_context_engine/plugins/databases/duckdb/__init__.py +0 -0
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +4 -7
- databao_context_engine/plugins/databases/mssql/__init__.py +0 -0
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +9 -10
- databao_context_engine/plugins/databases/mysql/__init__.py +0 -0
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +8 -8
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +9 -16
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +8 -9
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/dbt/__init__.py +0 -0
- databao_context_engine/plugins/dbt/dbt_chunker.py +47 -0
- databao_context_engine/plugins/dbt/dbt_context_extractor.py +106 -0
- databao_context_engine/plugins/dbt/dbt_plugin.py +25 -0
- databao_context_engine/plugins/dbt/types.py +44 -0
- databao_context_engine/plugins/dbt/types_artifacts.py +58 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +13 -15
- databao_context_engine/plugins/resources/parquet_introspector.py +1 -1
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/layout.py +12 -13
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +3 -16
- databao_context_engine/retrieve_embeddings/retrieve_service.py +13 -6
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +4 -7
- databao_context_engine/serialization/yaml.py +5 -5
- databao_context_engine/storage/migrate.py +1 -1
- databao_context_engine/storage/repositories/vector_search_repository.py +18 -6
- databao_context_engine-0.1.6.dist-info/METADATA +228 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/RECORD +71 -55
- databao_context_engine/datasources/add_config.py +0 -34
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/retrieve_embeddings/export_results.py +0 -12
- databao_context_engine-0.1.4.dev1.dist-info/METADATA +0 -75
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/WHEEL +0 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/entry_points.txt +0 -0
|
@@ -5,7 +5,7 @@ import asyncpg
|
|
|
5
5
|
from pydantic import BaseModel, Field
|
|
6
6
|
|
|
7
7
|
from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
|
|
8
|
-
from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
|
|
8
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
9
9
|
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
10
10
|
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
11
11
|
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
@@ -81,12 +81,14 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
|
|
|
81
81
|
if self.supports_catalogs:
|
|
82
82
|
sql = "SELECT catalog_name, schema_name FROM information_schema.schemata WHERE catalog_name = ANY($1)"
|
|
83
83
|
return SQLQuery(sql, (catalogs,))
|
|
84
|
-
else:
|
|
85
|
-
sql = "SELECT schema_name FROM information_schema.schemata"
|
|
86
|
-
return SQLQuery(sql, None)
|
|
87
84
|
|
|
88
|
-
|
|
85
|
+
sql = "SELECT schema_name FROM information_schema.schemata"
|
|
86
|
+
return SQLQuery(sql, None)
|
|
87
|
+
|
|
88
|
+
def _connect(self, file_config: PostgresConfigFile, *, catalog: str | None = None):
|
|
89
89
|
kwargs = self._create_connection_kwargs(file_config.connection)
|
|
90
|
+
if catalog:
|
|
91
|
+
kwargs["database"] = catalog
|
|
90
92
|
return _SyncAsyncpgConnection(kwargs)
|
|
91
93
|
|
|
92
94
|
def _fetchall_dicts(self, connection: _SyncAsyncpgConnection, sql: str, params) -> list[dict]:
|
|
@@ -97,13 +99,7 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
|
|
|
97
99
|
if database is not None:
|
|
98
100
|
return [database]
|
|
99
101
|
|
|
100
|
-
|
|
101
|
-
return rows
|
|
102
|
-
|
|
103
|
-
def _connect_to_catalog(self, file_config: PostgresConfigFile, catalog: str):
|
|
104
|
-
cfg = file_config.model_copy(deep=True)
|
|
105
|
-
cfg.connection.database = catalog
|
|
106
|
-
return self._connect(cfg)
|
|
102
|
+
return connection.fetch_scalar_values("SELECT datname FROM pg_catalog.pg_database WHERE datistemplate = false")
|
|
107
103
|
|
|
108
104
|
def collect_catalog_model(
|
|
109
105
|
self, connection: _SyncAsyncpgConnection, catalog: str, schemas: list[str]
|
|
@@ -408,10 +404,7 @@ class PostgresqlIntrospector(BaseIntrospector[PostgresConfigFile]):
|
|
|
408
404
|
}
|
|
409
405
|
connection_parts.update(connection_config.additional_properties)
|
|
410
406
|
|
|
411
|
-
|
|
412
|
-
f"{k}={_escape_pg_value(str(v))}" for k, v in connection_parts.items() if v is not None
|
|
413
|
-
)
|
|
414
|
-
return connection_string
|
|
407
|
+
return " ".join(f"{k}={_escape_pg_value(str(v))}" for k, v in connection_parts.items() if v is not None)
|
|
415
408
|
|
|
416
409
|
def _create_connection_kwargs(self, connection_config: PostgresConnectionProperties) -> dict[str, Any]:
|
|
417
410
|
kwargs: dict[str, Any] = {
|
|
File without changes
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
|
|
2
|
+
from databao_context_engine.plugins.databases.snowflake.snowflake_introspector import (
|
|
3
|
+
SnowflakeConfigFile,
|
|
4
|
+
SnowflakeIntrospector,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SnowflakeDbPlugin(BaseDatabasePlugin[SnowflakeConfigFile]):
    """Database plugin for Snowflake datasources.

    All introspection work is delegated to a ``SnowflakeIntrospector`` that is
    handed to the base plugin on construction.
    """

    # Plugin metadata.
    id = "jetbrains/snowflake"
    name = "Snowflake DB Plugin"
    supported = {"snowflake"}
    config_file_type = SnowflakeConfigFile

    def __init__(self):
        """Create the plugin with a fresh Snowflake introspector."""
        introspector = SnowflakeIntrospector()
        super().__init__(introspector)
|
|
@@ -7,7 +7,7 @@ from pydantic import BaseModel, Field
|
|
|
7
7
|
from snowflake.connector import DictCursor
|
|
8
8
|
|
|
9
9
|
from databao_context_engine.pluginlib.config import ConfigPropertyAnnotation
|
|
10
|
-
from databao_context_engine.plugins.base_db_plugin import BaseDatabaseConfigFile
|
|
10
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
11
11
|
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
12
12
|
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
13
13
|
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
@@ -20,7 +20,7 @@ class SnowflakePasswordAuth(BaseModel):
|
|
|
20
20
|
class SnowflakeKeyPairAuth(BaseModel):
|
|
21
21
|
private_key_file: str | None = None
|
|
22
22
|
private_key_file_pwd: str | None = None
|
|
23
|
-
private_key: Annotated[str, ConfigPropertyAnnotation(secret=True)]
|
|
23
|
+
private_key: Annotated[str | None, ConfigPropertyAnnotation(secret=True)] = None
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class SnowflakeSSOAuth(BaseModel):
|
|
@@ -61,18 +61,17 @@ class SnowflakeIntrospector(BaseIntrospector[SnowflakeConfigFile]):
|
|
|
61
61
|
_IGNORED_CATALOGS = {"STREAMLIT_APPS"}
|
|
62
62
|
supports_catalogs = True
|
|
63
63
|
|
|
64
|
-
def _connect(self, file_config: SnowflakeConfigFile):
|
|
64
|
+
def _connect(self, file_config: SnowflakeConfigFile, *, catalog: str | None = None):
|
|
65
65
|
connection = file_config.connection
|
|
66
66
|
snowflake.connector.paramstyle = "qmark"
|
|
67
|
+
connection_kwargs = connection.to_snowflake_kwargs()
|
|
68
|
+
if catalog:
|
|
69
|
+
connection_kwargs["database"] = catalog
|
|
70
|
+
|
|
67
71
|
return snowflake.connector.connect(
|
|
68
|
-
**
|
|
72
|
+
**connection_kwargs,
|
|
69
73
|
)
|
|
70
74
|
|
|
71
|
-
def _connect_to_catalog(self, file_config: SnowflakeConfigFile, catalog: str):
|
|
72
|
-
cfg = dict(file_config.connection or {})
|
|
73
|
-
cfg["database"] = catalog
|
|
74
|
-
return snowflake.connector.connect(**cfg)
|
|
75
|
-
|
|
76
75
|
def _get_catalogs(self, connection, file_config: SnowflakeConfigFile) -> list[str]:
|
|
77
76
|
database = file_config.connection.database
|
|
78
77
|
if database:
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabasePlugin
|
|
2
|
+
from databao_context_engine.plugins.databases.sqlite.sqlite_introspector import SQLiteConfigFile, SQLiteIntrospector
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SQLiteDbPlugin(BaseDatabasePlugin[SQLiteConfigFile]):
    """Database plugin for SQLite datasources.

    All introspection work is delegated to a ``SQLiteIntrospector`` that is
    handed to the base plugin on construction.
    """

    # Plugin metadata.
    id = "jetbrains/sqlite"
    name = "SQLite Plugin"
    supported = {"sqlite"}
    config_file_type = SQLiteConfigFile

    def __init__(self):
        """Create the plugin with a fresh SQLite introspector."""
        introspector = SQLiteIntrospector()
        super().__init__(introspector)
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.plugins.databases.base_db_plugin import BaseDatabaseConfigFile
|
|
6
|
+
from databao_context_engine.plugins.databases.base_introspector import BaseIntrospector, SQLQuery
|
|
7
|
+
from databao_context_engine.plugins.databases.databases_types import DatabaseSchema
|
|
8
|
+
from databao_context_engine.plugins.databases.introspection_model_builder import IntrospectionModelBuilder
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLiteConnectionConfig(BaseModel):
    # Connection settings of a SQLite datasource: only the database file location.
    # (Documented with comments rather than a docstring so the generated
    # pydantic JSON schema is left untouched.)
    database_path: str = Field(description="Path to the SQLite database file")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SQLiteConfigFile(BaseDatabaseConfigFile):
    # Datasource configuration file model for SQLite.
    # `type` defaults to "sqlite"; `connection` carries the database file path.
    type: str = Field(default="sqlite")
    connection: SQLiteConnectionConfig
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SQLiteIntrospector(BaseIntrospector[SQLiteConfigFile]):
    """Introspector for SQLite database files.

    Catalogs are not supported (``supports_catalogs = False``) and every object
    is reported under the single pseudo-schema ``main``. Metadata is read from
    ``sqlite_master`` joined with SQLite's table-valued PRAGMA functions.
    """

    _IGNORED_SCHEMAS = {"temp", "information_schema"}
    _PSEUDO_SCHEMA = "main"
    supports_catalogs = False

    def _connect(self, file_config: SQLiteConfigFile, *, catalog: str | None = None):
        """Open a connection to the configured SQLite database file.

        Args:
            file_config: Datasource configuration holding ``connection.database_path``.
            catalog: Accepted for signature compatibility; not used by this method.

        Returns:
            An open ``sqlite3.Connection`` whose ``text_factory`` is ``str``.

        Raises:
            sqlite3.OperationalError: If the configured path is not an existing file.
        """
        # Function-scope import so the file's import block does not need to change.
        from pathlib import Path

        database_path = str(file_config.connection.database_path)

        # sqlite3.connect() creates an empty database when the file is missing,
        # which would turn a mistyped path into a "successful" empty introspection.
        # Fail with the same exception type SQLite itself uses for unopenable paths.
        if database_path != ":memory:" and not Path(database_path).is_file():
            raise sqlite3.OperationalError(f"unable to open database file: {database_path}")

        conn = sqlite3.connect(database_path)
        conn.text_factory = str
        return conn

    def _get_catalogs(self, connection, file_config: SQLiteConfigFile) -> list[str]:
        """Return the single pseudo-catalog name resolved from the configuration."""
        return [self._resolve_pseudo_catalog_name(file_config)]

    def _fetchall_dicts(self, connection, sql: str, params) -> list[dict]:
        """Run ``sql`` and return every row as a dict keyed by lower-cased column name.

        Args:
            connection: An open SQLite connection.
            sql: The statement to execute.
            params: Bind parameters, or ``None`` to execute without parameters.

        Returns:
            One dict per row; an empty list when the statement yields no result set.
        """
        cur = connection.cursor()
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)

            if cur.description is None:
                return []

            rows = cur.fetchall()
            # Column names are the same for every row: compute them once.
            cols = [d[0].lower() for d in cur.description]
            return [
                {k.lower(): r[k] for k in r.keys()} if isinstance(r, sqlite3.Row) else dict(zip(cols, r))
                for r in rows
            ]
        finally:
            # Release the cursor even when execute()/fetchall() raises.
            cur.close()

    def _list_schemas_for_catalog(self, connection, catalog: str) -> list[str]:
        """Return the only schema this introspector reports: the pseudo-schema."""
        return [self._PSEUDO_SCHEMA]

    def collect_catalog_model(self, connection, catalog: str, schemas: list[str]) -> list[DatabaseSchema] | None:
        """Collect relations, columns, keys and indexes and build the schema model.

        Args:
            connection: An open SQLite connection.
            catalog: Catalog name; not used by the queries.
            schemas: Requested schemas; only checked for emptiness.

        Returns:
            An empty list when ``schemas`` is empty, otherwise the result of
            ``IntrospectionModelBuilder.build_schemas_from_components`` for the
            pseudo-schema.
        """
        if not schemas:
            return []

        results: dict[str, list[dict]] = {
            name: self._fetchall_dicts(connection, sql, None) or []
            for name, sql in self._component_queries().items()
        }

        return IntrospectionModelBuilder.build_schemas_from_components(
            schemas=[self._PSEUDO_SCHEMA],
            rels=results.get("relations", []),
            cols=results.get("columns", []),
            pk_cols=results.get("pk", []),
            uq_cols=results.get("uq", []),
            checks=[],
            fk_cols=results.get("fks", []),
            idx_cols=results.get("idx", []),
            partitions=[],
        )

    def _component_queries(self) -> dict[str, str]:
        """Map each metadata component name to the SQL that collects it."""
        return {
            "relations": self._sql_relations(),
            "columns": self._sql_columns(),
            "pk": self._sql_primary_keys(),
            "uq": self._sql_unique(),
            "fks": self._sql_foreign_keys(),
            "idx": self._sql_indexes(),
        }

    def _sql_relations(self) -> str:
        """SQL listing user tables and views (names starting with ``sqlite_`` are excluded)."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                CASE m.type
                    WHEN 'view' THEN 'view'
                    ELSE 'table'
                END AS kind,
                NULL AS description
            FROM
                sqlite_master m
            WHERE
                m.type IN ('table', 'view')
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name;
        """

    def _sql_columns(self) -> str:
        """SQL listing the columns of every user table and view via ``pragma_table_xinfo``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                c.name AS column_name,
                (c.cid + 1) AS ordinal_position,
                COALESCE(c.type,'') AS data_type,
                CASE
                    WHEN c.pk > 0 THEN 0
                    WHEN c."notnull" = 0 THEN 1
                    ELSE 0
                END AS is_nullable,
                c.dflt_value AS default_expression,
                CASE
                    WHEN c.hidden IN (2,3) THEN 'computed'
                END AS generated,
                NULL AS description
            FROM
                sqlite_master m
                JOIN pragma_table_xinfo(m.name) c
            WHERE
                m.type IN ('table','view')
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name,
                c.cid;
        """

    def _sql_primary_keys(self) -> str:
        """SQL listing primary-key columns; the constraint name is synthesized as ``pk_<table>``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('pk_' || m.name) AS constraint_name,
                c.pk AS position,
                c.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_table_info(m.name) c
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
                AND c.pk > 0
            ORDER BY
                m.name,
                c.pk;
        """

    def _sql_unique(self) -> str:
        """SQL listing the columns of unique indexes whose origin is ``'u'``."""
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS constraint_name,
                (ii.seqno + 1) AS position,
                ii.name AS column_name
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_info(il.name) ii
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
                AND il."unique" = 1
                AND il.origin = 'u'
            ORDER BY
                m.name,
                il.name,
                ii.seqno;
        """

    def _sql_foreign_keys(self) -> str:
        """SQL listing foreign-key columns; the constraint name is synthesized as ``fk_<table>_<id>``."""
        # The referenced schema uses _PSEUDO_SCHEMA like every other query here,
        # instead of a separately hard-coded 'main'.
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                ('fk_' || m.name || '_' || fk.id) AS constraint_name,
                (fk.seq + 1) AS position,
                fk."from" AS from_column,
                '{self._PSEUDO_SCHEMA}' AS ref_schema,
                fk."table" AS ref_table,
                fk."to" AS to_column,
                lower(fk.on_update) AS on_update,
                lower(fk.on_delete) AS on_delete,
                1 AS enforced,
                1 AS validated
            FROM sqlite_master m
            JOIN pragma_foreign_key_list(m.name) fk
            WHERE
                m.type = 'table'
                AND m.name NOT LIKE 'sqlite_%'
            ORDER BY
                m.name,
                fk.id,
                fk.seq;
        """

    def _sql_indexes(self) -> str:
        """SQL listing key columns of indexes whose origin is ``'c'``.

        For partial indexes the predicate is taken as the text after the first
        ``WHERE`` in the index's stored SQL.
        """
        # NOTE(review): the first 'WHERE' match could fall inside an identifier
        # that contains that word — confirm whether this matters in practice.
        return f"""
            SELECT
                '{self._PSEUDO_SCHEMA}' AS schema_name,
                m.name AS table_name,
                il.name AS index_name,
                (ix.seqno + 1) AS position,
                ix.name AS expr,
                il."unique" AS is_unique,
                NULL AS method,
                CASE
                    WHEN il.partial = 1 AND sm.sql IS NOT NULL AND instr(upper(sm.sql), 'WHERE') > 0
                    THEN trim(substr(sm.sql, instr(upper(sm.sql), 'WHERE') + length('WHERE')))
                END AS predicate
            FROM
                sqlite_master m
                JOIN pragma_index_list(m.name) il
                JOIN pragma_index_xinfo(il.name) ix
                LEFT JOIN sqlite_master sm ON sm.type = 'index' AND sm.name = il.name
            WHERE
                m.type='table'
                AND m.name NOT LIKE 'sqlite_%'
                AND lower(il.origin) = 'c'
                AND ix.key = 1
            ORDER BY
                m.name,
                il.name,
                ix.seqno;
        """

    def _sql_sample_rows(self, catalog: str, schema: str, table: str, limit: int) -> SQLQuery:
        """Build the parameterized query that samples up to ``limit`` rows of ``table``.

        ``catalog`` and ``schema`` are not used; the table name is quoted as an
        identifier and the limit is passed as a bind parameter.
        """
        sql = f"SELECT * FROM {self._quote_ident(table)} LIMIT ?"
        return SQLQuery(sql, (limit,))

    def _quote_ident(self, ident: str) -> str:
        """Wrap ``ident`` in double quotes, doubling any embedded double quote."""
        return '"' + str(ident).replace('"', '""') + '"'
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
|
|
4
|
+
from databao_context_engine.plugins.dbt.types import DbtColumn, DbtContext, DbtModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class DbtColumnChunkContent:
    """Content attached to the embeddable chunk of a single dbt column.

    Carries the column together with the database, schema and model names of
    the model it belongs to.
    """

    database_name: str
    schema_name: str
    model_name: str
    column: DbtColumn
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_dbt_chunks(context: DbtContext) -> list[EmbeddableChunk]:
    """Turn a dbt context into embeddable chunks.

    For every model, one model chunk is emitted, immediately followed by one
    chunk per column of that model.
    """
    result = []

    for dbt_model in context.models:
        result.append(_create_model_chunk(dbt_model))
        result.extend(_create_column_chunk(dbt_model, dbt_column) for dbt_column in dbt_model.columns)

    return result
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _create_model_chunk(model: DbtModel) -> EmbeddableChunk:
    """Build the chunk for a model: its descriptive text plus the model itself as content."""
    text = _build_model_chunk_text(model)
    return EmbeddableChunk(embeddable_text=text, content=model)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _build_model_chunk_text(model: DbtModel) -> str:
|
|
32
|
+
# TODO: Use description and potentially other infos?
|
|
33
|
+
return f"Model {model.name} in database {model.database} and schema {model.schema}, with unique id {model.id}"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _create_column_chunk(model: DbtModel, column: DbtColumn) -> EmbeddableChunk:
    """Build the chunk for one column of a model.

    The chunk content records the column along with the database, schema and
    name of the owning model.
    """
    chunk_content = DbtColumnChunkContent(
        database_name=model.database,
        schema_name=model.schema,
        model_name=model.name,
        column=column,
    )
    chunk_text = _build_column_chunk_text(model, column)
    return EmbeddableChunk(embeddable_text=chunk_text, content=chunk_content)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _build_column_chunk_text(model: DbtModel, column: DbtColumn) -> str:
|
|
46
|
+
# TODO: Use description and potentially other infos?
|
|
47
|
+
return f"Column {column.name} in model {model.id}"
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from databao_context_engine.plugins.dbt.types import (
|
|
4
|
+
DbtColumn,
|
|
5
|
+
DbtConfigFile,
|
|
6
|
+
DbtContext,
|
|
7
|
+
DbtMaterialization,
|
|
8
|
+
DbtModel,
|
|
9
|
+
)
|
|
10
|
+
from databao_context_engine.plugins.dbt.types_artifacts import (
|
|
11
|
+
DbtArtifacts,
|
|
12
|
+
DbtCatalog,
|
|
13
|
+
DbtCatalogColumn,
|
|
14
|
+
DbtCatalogNode,
|
|
15
|
+
DbtManifest,
|
|
16
|
+
DbtManifestColumn,
|
|
17
|
+
DbtManifestModel,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def check_connection(config_file: DbtConfigFile) -> None:
    """Validate the datasource by reading its dbt artifacts.

    The artifacts are read and discarded; any error raised while reading them
    propagates to the caller.
    """
    target_folder = config_file.dbt_target_folder_path.expanduser()
    _read_dbt_artifacts(target_folder)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_context(config_file: DbtConfigFile) -> DbtContext:
    """Read the dbt artifacts from the configured target folder and convert them to a ``DbtContext``."""
    target_folder = config_file.dbt_target_folder_path.expanduser()
    return _extract_context_from_artifacts(_read_dbt_artifacts(target_folder))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _read_dbt_artifacts(dbt_target_folder_path: Path) -> DbtArtifacts:
    """Load ``manifest.json`` (required) and ``catalog.json`` (optional) from a dbt target folder.

    Args:
        dbt_target_folder_path: Folder expected to contain the dbt artifacts.

    Returns:
        The parsed manifest together with the parsed catalog, or ``None`` for
        the catalog when ``catalog.json`` is not present.

    Raises:
        ValueError: If the folder is not a directory or ``manifest.json`` is missing.
    """
    if not dbt_target_folder_path.is_dir():
        raise ValueError(f'Invalid "dbt_target_folder_path": not a directory ({dbt_target_folder_path})')

    # TODO: Check the manifest schema version?
    manifest_file = dbt_target_folder_path.joinpath("manifest.json")
    if not manifest_file.is_file():
        raise ValueError(f'Invalid "dbt_target_folder_path": missing manifest.json file ({manifest_file})')

    # JSON is UTF-8; decode explicitly instead of relying on the locale's
    # default encoding, which differs between platforms.
    manifest = DbtManifest.model_validate_json(manifest_file.read_text(encoding="utf-8"))

    catalog_file = dbt_target_folder_path.joinpath("catalog.json")
    catalog = (
        DbtCatalog.model_validate_json(catalog_file.read_text(encoding="utf-8")) if catalog_file.is_file() else None
    )

    return DbtArtifacts(manifest=manifest, catalog=catalog)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _extract_context_from_artifacts(artifacts: DbtArtifacts) -> DbtContext:
    """Convert parsed dbt artifacts into a ``DbtContext``.

    Only manifest nodes that are ``DbtManifestModel`` instances are kept. Each
    one is paired with the catalog node that has the same unique id, when a
    catalog is available and contains it.
    """
    catalog_nodes = artifacts.catalog.nodes if artifacts.catalog else {}

    # TODO: Extract the stages? Or at least the "highest-level" models (= marts?)
    # TODO: Extract the constraints
    # TODO: Organize the models by schemas? Or by stages?
    models = []
    for node in artifacts.manifest.nodes.values():
        if not isinstance(node, DbtManifestModel):
            continue
        models.append(_manifest_model_to_dbt_model(node, catalog_nodes.get(node.unique_id, None)))

    return DbtContext(models=models)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _manifest_model_to_dbt_model(manifest_model: DbtManifestModel, catalog_node: DbtCatalogNode | None) -> DbtModel:
    """Convert a manifest model, optionally enriched by its catalog node, into a ``DbtModel``.

    Catalog columns are matched to manifest columns by the manifest column's name.
    """
    catalog_columns = catalog_node.columns if catalog_node else {}

    columns = [
        _manifest_column_to_dbt_column(manifest_column, catalog_columns.get(manifest_column.name))
        for manifest_column in manifest_model.columns.values()
    ]

    # A missing config means there is no materialization to translate.
    materialized = manifest_model.config.materialized if manifest_model.config else None

    # A missing "depends_on" section, or one without "nodes", yields no dependencies.
    depends_on_nodes = manifest_model.depends_on.get("nodes", []) if manifest_model.depends_on else []

    return DbtModel(
        id=manifest_model.unique_id,
        name=manifest_model.name,
        database=manifest_model.database,
        schema=manifest_model.schema_,
        description=manifest_model.description,
        columns=columns,
        materialization=_manifest_materialization_to_dbt_materializaton(materialized),
        primary_key=manifest_model.primary_key,
        depends_on_nodes=depends_on_nodes,
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _manifest_column_to_dbt_column(
    manifest_column: DbtManifestColumn, catalog_column: DbtCatalogColumn | None
) -> DbtColumn:
    """Convert a manifest column into a ``DbtColumn``.

    The type comes from the catalog column when one is given, otherwise from
    the manifest column's ``data_type``.
    """
    if catalog_column:
        column_type = catalog_column.type
    else:
        column_type = manifest_column.data_type

    return DbtColumn(
        name=manifest_column.name,
        description=manifest_column.description,
        type=column_type,
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _manifest_materialization_to_dbt_materializaton(materialized: str | None) -> DbtMaterialization | None:
    """Translate a manifest ``materialized`` value into a ``DbtMaterialization``.

    Returns ``None`` when no value is given or when the value is not a member
    of ``DbtMaterialization``.
    """
    if materialized is None:
        return None

    try:
        materialization = DbtMaterialization(materialized)
    except ValueError:
        # Not a value the enum knows about.
        return None
    else:
        return materialization
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from databao_context_engine import BuildDatasourcePlugin
|
|
4
|
+
from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
|
|
5
|
+
from databao_context_engine.plugins.dbt.dbt_chunker import build_dbt_chunks
|
|
6
|
+
from databao_context_engine.plugins.dbt.dbt_context_extractor import check_connection, extract_context
|
|
7
|
+
from databao_context_engine.plugins.dbt.types import DbtConfigFile
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DbtPlugin(BuildDatasourcePlugin[DbtConfigFile]):
    """Datasource plugin for dbt projects.

    Each operation delegates to the module-level helpers imported from the dbt
    context extractor and chunker.
    """

    # Plugin metadata.
    id = "jetbrains/dbt"
    name = "Dbt Plugin"
    config_file_type = DbtConfigFile

    def supported_types(self) -> set[str]:
        """Return the datasource types handled by this plugin."""
        return {"dbt"}

    def build_context(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> Any:
        """Extract the dbt context described by ``file_config``."""
        context = extract_context(file_config)
        return context

    def check_connection(self, full_type: str, datasource_name: str, file_config: DbtConfigFile) -> None:
        """Run the extractor's connection check for ``file_config``."""
        # Inside the method body this name resolves to the imported
        # module-level function, not to this method.
        check_connection(file_config)

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        """Split a previously built context into embeddable chunks."""
        chunks = build_dbt_chunks(context)
        return chunks
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DbtConfigFile(BaseModel):
    # Datasource configuration file model for dbt.
    # (Documented with comments rather than a docstring so the generated
    # pydantic JSON schema is left untouched.)

    # Optional datasource name.
    name: str | None = Field(default=None)
    # Datasource type; defaults to "dbt".
    type: str = Field(default="dbt")
    # Folder holding the dbt artifacts.
    dbt_target_folder_path: Path
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DbtMaterialization(str, Enum):
    """Materializations recognised for a dbt model."""

    TABLE = "table"
    VIEW = "view"

    def __str__(self):
        # Render as the plain value ("table" / "view").
        value = self.value
        return value
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(kw_only=True)
|
|
23
|
+
class DbtColumn:
|
|
24
|
+
name: str
|
|
25
|
+
type: str | None = None
|
|
26
|
+
description: str | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(kw_only=True)
|
|
30
|
+
class DbtModel:
|
|
31
|
+
id: str
|
|
32
|
+
name: str
|
|
33
|
+
database: str
|
|
34
|
+
schema: str
|
|
35
|
+
columns: list[DbtColumn]
|
|
36
|
+
description: str | None = None
|
|
37
|
+
materialization: DbtMaterialization | None = None
|
|
38
|
+
primary_key: list[str] | None = None
|
|
39
|
+
depends_on_nodes: list[str]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(kw_only=True)
|
|
43
|
+
class DbtContext:
|
|
44
|
+
models: list[DbtModel]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Annotated, Literal
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Discriminator, Field
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DbtManifestNodeConfig(BaseModel):
    # Subset of a manifest node's "config" section that is read here.
    materialized: str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DbtManifestColumn(BaseModel):
    # A column entry of a manifest model; only the name is required.
    name: str
    description: str | None = None
    data_type: str | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DbtManifestModel(BaseModel):
    # A manifest node whose resource_type is "model".
    resource_type: Literal["model"]
    unique_id: str
    name: str
    # NOTE(review): if dbt can emit a null "database" for some adapters, this
    # required string would reject the whole manifest — confirm against the
    # manifest schema.
    database: str
    # Stored as `schema_`, populated from the JSON key "schema" via the alias.
    schema_: str = Field(alias="schema")
    description: str | None = None
    config: DbtManifestNodeConfig | None = None
    # Columns keyed by string (presumably the column name — verify).
    columns: dict[str, DbtManifestColumn]
    depends_on: dict[str, list[str]] | None = None
    primary_key: list[str] | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DbtManifestOtherNode(BaseModel):
    # Any manifest node that is not a model; only its resource_type is parsed.
    # NOTE(review): a resource_type outside this list will not validate —
    # confirm the list covers every node type dbt can emit.
    resource_type: Literal["seed", "analysis", "test", "operation", "sql_operation", "snapshot"]


# Manifest nodes are told apart by their "resource_type" field.
DbtManifestNode = Annotated[DbtManifestModel | DbtManifestOtherNode, Discriminator("resource_type")]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DbtManifest(BaseModel):
    # Parsed manifest.json: only the "nodes" mapping is modelled.
    nodes: dict[str, DbtManifestNode]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DbtCatalogColumn(BaseModel):
    # A column entry of a catalog node: its name and type.
    name: str
    type: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DbtCatalogNode(BaseModel):
    # A node of catalog.json: an optional unique id and its columns.
    unique_id: str | None = None
    columns: dict[str, DbtCatalogColumn]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DbtCatalog(BaseModel):
    # Parsed catalog.json: only the "nodes" mapping is modelled.
    nodes: dict[str, DbtCatalogNode]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass(kw_only=True)
|
|
56
|
+
class DbtArtifacts:
|
|
57
|
+
manifest: DbtManifest
|
|
58
|
+
catalog: DbtCatalog | None
|
|
File without changes
|
databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py}
RENAMED
|
@@ -24,7 +24,7 @@ class InternalUnstructuredFilesPlugin(BuildFilePlugin):
|
|
|
24
24
|
self.tokens_overlap = tokens_overlap or self._DEFAULT_TOKENS_OVERLAP
|
|
25
25
|
|
|
26
26
|
def supported_types(self) -> set[str]:
|
|
27
|
-
return
|
|
27
|
+
return self._SUPPORTED_FILES_EXTENSIONS
|
|
28
28
|
|
|
29
29
|
def build_file_context(self, full_type: str, file_name: str, file_buffer: BufferedReader) -> Any:
|
|
30
30
|
file_content = self._read_file(file_buffer)
|