databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from typing import Optional, Sequence, Tuple
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
from _duckdb import ConstraintException
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.services.table_name_policy import TableNamePolicy
|
|
7
|
+
from databao_context_engine.storage.exceptions.exceptions import IntegrityError
|
|
8
|
+
from databao_context_engine.storage.models import EmbeddingDTO
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddingRepository:
    """CRUD access to a per-model embedding shard table keyed by chunk_id.

    Every method takes the shard's *table_name* explicitly; the name is
    validated through TableNamePolicy before being interpolated into the SQL,
    since DuckDB cannot parameterise identifiers.
    """

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def create(
        self,
        *,
        table_name: str,
        chunk_id: int,
        vec: Sequence[float],
    ) -> EmbeddingDTO:
        """Insert an embedding row and return it as a DTO.

        Raises:
            IntegrityError: when a constraint (e.g. duplicate chunk_id) is violated.
            RuntimeError: if the INSERT ... RETURNING yields no row.
        """
        try:
            TableNamePolicy.validate_table_name(table_name=table_name)
            row = self._conn.execute(
                f"""
                INSERT INTO
                    {table_name} (chunk_id, vec)
                VALUES
                    (?, ?)
                RETURNING
                    *
                """,
                # Normalise the vector to a list for the driver, consistent
                # with update().
                [chunk_id, list(vec)],
            ).fetchone()
            if row is None:
                raise RuntimeError("Embedding creation returned no object")
            return self._row_to_dto(row)
        # Catch the exception re-exported by the public duckdb package rather
        # than relying on an import from the private `_duckdb` module.
        except duckdb.ConstraintException as e:
            raise IntegrityError from e

    def get(self, *, table_name: str, chunk_id: int) -> Optional[EmbeddingDTO]:
        """Return the embedding for *chunk_id*, or None when absent."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        row = self._conn.execute(
            f"""
            SELECT
                *
            FROM
                {table_name}
            WHERE
                chunk_id = ?
            """,
            [chunk_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def update(
        self,
        *,
        table_name: str,
        chunk_id: int,
        vec: Sequence[float],
    ) -> Optional[EmbeddingDTO]:
        """Replace the vector stored for *chunk_id*.

        Returns the updated row, or None when no row with *chunk_id* exists.
        """
        TableNamePolicy.validate_table_name(table_name=table_name)
        self._conn.execute(
            f"""
            UPDATE
                {table_name}
            SET
                vec = ?
            WHERE
                chunk_id = ?
            """,
            [list(vec), chunk_id],
        )
        return self.get(table_name=table_name, chunk_id=chunk_id)

    def delete(self, *, table_name: str, chunk_id: int) -> int:
        """Delete the embedding for *chunk_id*; return the rows removed (0 or 1)."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        row = self._conn.execute(
            f"""
            DELETE FROM
                {table_name}
            WHERE
                chunk_id = ?
            RETURNING
                chunk_id
            """,
            [chunk_id],
        ).fetchone()
        return 1 if row else 0

    def list(self, table_name: str) -> list[EmbeddingDTO]:
        """Return every embedding in the shard, highest chunk_id first."""
        TableNamePolicy.validate_table_name(table_name=table_name)
        rows = self._conn.execute(
            f"""
            SELECT
                *
            FROM
                {table_name}
            ORDER BY
                chunk_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(r) for r in rows]

    @staticmethod
    def _row_to_dto(row: Tuple) -> EmbeddingDTO:
        """Map a (chunk_id, vec, created_at) row tuple onto an EmbeddingDTO."""
        chunk_id, vec, created_at = row
        return EmbeddingDTO(
            chunk_id=int(chunk_id),
            # DuckDB may return the vector as a non-list sequence; copy only
            # when needed.
            vec=list(vec) if not isinstance(vec, list) else vec,
            created_at=created_at,
        )
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from _duckdb import DuckDBPyConnection
|
|
2
|
+
|
|
3
|
+
from databao_context_engine.services.run_name_policy import RunNamePolicy
|
|
4
|
+
from databao_context_engine.storage.repositories.chunk_repository import ChunkRepository
|
|
5
|
+
from databao_context_engine.storage.repositories.datasource_run_repository import DatasourceRunRepository
|
|
6
|
+
from databao_context_engine.storage.repositories.embedding_model_registry_repository import (
|
|
7
|
+
EmbeddingModelRegistryRepository,
|
|
8
|
+
)
|
|
9
|
+
from databao_context_engine.storage.repositories.embedding_repository import EmbeddingRepository
|
|
10
|
+
from databao_context_engine.storage.repositories.run_repository import RunRepository
|
|
11
|
+
from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchRepository
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def create_run_repository(conn: DuckDBPyConnection) -> RunRepository:
    """Build a RunRepository wired with the default run-name policy."""
    # NOTE(review): the module imports DuckDBPyConnection from the private
    # `_duckdb` module; the public `duckdb.DuckDBPyConnection` is preferable.
    policy = RunNamePolicy()
    return RunRepository(conn, run_name_policy=policy)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_datasource_run_repository(conn: DuckDBPyConnection) -> DatasourceRunRepository:
    """Build a DatasourceRunRepository bound to *conn*."""
    repository = DatasourceRunRepository(conn)
    return repository
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_chunk_repository(conn: DuckDBPyConnection) -> ChunkRepository:
    """Build a ChunkRepository bound to *conn*."""
    repository = ChunkRepository(conn)
    return repository
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def create_embedding_repository(conn: DuckDBPyConnection) -> EmbeddingRepository:
    """Build an EmbeddingRepository bound to *conn*."""
    repository = EmbeddingRepository(conn)
    return repository
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def create_registry_repository(conn: DuckDBPyConnection) -> EmbeddingModelRegistryRepository:
    """Build an EmbeddingModelRegistryRepository bound to *conn*."""
    repository = EmbeddingModelRegistryRepository(conn)
    return repository
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def create_vector_search_repository(conn: DuckDBPyConnection) -> VectorSearchRepository:
    """Build a VectorSearchRepository bound to *conn*."""
    repository = VectorSearchRepository(conn)
    return repository
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import duckdb
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.services.run_name_policy import RunNamePolicy
|
|
7
|
+
from databao_context_engine.storage.models import RunDTO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RunRepository:
    """CRUD access to the `run` table — one row per build run of a project."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, run_name_policy: RunNamePolicy):
        self._conn = conn
        # Policy that derives a human-readable run name from the start time.
        self._run_name_policy = run_name_policy

    def create(
        self, *, project_id: str, dce_version: Optional[str] = None, started_at: datetime | None = None
    ) -> RunDTO:
        """Insert a new run row and return it.

        *started_at* defaults to now; the run name is derived from it via the
        run-name policy.

        Raises:
            RuntimeError: if the INSERT ... RETURNING yields no row.
        """
        if started_at is None:
            # NOTE(review): naive local time — confirm whether UTC is intended.
            started_at = datetime.now()
        run_name = self._run_name_policy.build(run_started_at=started_at)

        # NOTE(review): the DB column is named `nemory_version` but stores the
        # DCE version (see _row_to_dto) — looks like a leftover from a rename.
        row = self._conn.execute(
            """
            INSERT INTO
                run (project_id, nemory_version, started_at, run_name)
            VALUES
                (?, ?, ?, ?)
            RETURNING
                *
            """,
            [project_id, dce_version, started_at, run_name],
        ).fetchone()
        if row is None:
            raise RuntimeError("Run creation returned no object")
        return self._row_to_dto(row)

    def get(self, run_id: int) -> Optional[RunDTO]:
        """Return the run with *run_id*, or None when absent."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run_id = ?
            """,
            [run_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def get_by_run_name(self, *, project_id: str, run_name: str) -> RunDTO | None:
        """Return the run with *run_name* inside *project_id*, or None."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run.project_id = ? AND run_name = ?
            """,
            [project_id, run_name],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def get_latest_run_for_project(self, project_id: str) -> RunDTO | None:
        """Return the most recently started run of *project_id*, or None."""
        row = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            WHERE
                run.project_id = ?
            ORDER BY run.started_at DESC
            LIMIT 1
            """,
            [project_id],
        ).fetchone()
        return self._row_to_dto(row) if row else None

    def update(
        self,
        run_id: int,
        *,
        project_id: Optional[str] = None,
        ended_at: Optional[datetime] = None,
        dce_version: Optional[str] = None,
    ) -> Optional[RunDTO]:
        """Partially update a run; only the keyword arguments given are changed.

        Returns the (re-fetched) row, or None when *run_id* does not exist.
        """
        # SET fragments are fixed literals below, so the f-string interpolation
        # of `sets` cannot inject user-controlled SQL; values go via params.
        sets: list[str] = []
        params: list[Any] = []

        if project_id is not None:
            sets.append("project_id = ?")
            params.append(project_id)
        if ended_at is not None:
            sets.append("ended_at = ?")
            params.append(ended_at)
        if dce_version is not None:
            sets.append("nemory_version = ?")
            params.append(dce_version)

        # Nothing to change: behave as a plain read.
        if not sets:
            return self.get(run_id)

        params.append(run_id)
        self._conn.execute(
            f"""
            UPDATE
                run
            SET
                {", ".join(sets)}
            WHERE
                run_id = ?
            """,
            params,
        )

        return self.get(run_id)

    def delete(self, run_id: int) -> int:
        """Delete the run with *run_id*; return the rows removed (0 or 1)."""
        row = self._conn.execute(
            """
            DELETE FROM
                run
            WHERE
                run_id = ?
            RETURNING
                run_id
            """,
            [run_id],
        ).fetchone()
        return 1 if row else 0

    def list(self) -> list[RunDTO]:
        """Return every run, highest run_id first."""
        rows = self._conn.execute(
            """
            SELECT
                *
            FROM
                run
            ORDER BY
                run_id DESC
            """
        ).fetchall()
        return [self._row_to_dto(r) for r in rows]

    @staticmethod
    def _row_to_dto(row: tuple) -> RunDTO:
        """Map a `run` table row (column order as created by the schema) to a RunDTO."""
        run_id, project_id, started_at, ended_at, dce_version, run_name = row
        return RunDTO(
            run_id=int(run_id),
            run_name=run_name,
            project_id=str(project_id),
            started_at=started_at,
            ended_at=ended_at,
            nemory_version=dce_version,
        )
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import duckdb
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
7
|
+
from databao_context_engine.project.types import DatasourceId
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(kw_only=True, frozen=True)
class VectorSearchResult:
    """One similarity-search hit joined with its originating datasource."""

    # Text to show to the caller (query falls back to embeddable_text when unset).
    display_text: str
    # The text that was embedded for this chunk.
    embeddable_text: str
    # Cosine distance between the query vector and the chunk's embedding (lower = closer).
    cosine_distance: float
    # Full type of the datasource the chunk came from.
    datasource_type: DatasourceType
    # Identifier of the datasource the chunk came from.
    datasource_id: DatasourceId
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class VectorSearchRepository:
    """Read-only cosine-similarity search over an embedding shard table."""

    # Rows whose cosine distance is >= this threshold are excluded.
    _DEFAULT_DISTANCE_THRESHOLD = 0.75

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        self._conn = conn

    def get_display_texts_by_similarity(
        self, *, table_name: str, run_id: int, retrieve_vec: Sequence[float], dimension: int, limit: int
    ) -> list[VectorSearchResult]:
        """
        Read only similarity search on a specific embedding shard table.
        Returns the display text for the closest matches in a given run
        """
        # NOTE(review): unlike the other repositories, table_name is NOT passed
        # through TableNamePolicy.validate_table_name before interpolation —
        # confirm callers only supply trusted shard names.
        # The query vector is CAST to FLOAT[dimension] so DuckDB's
        # array_cosine_distance can compare it against the stored vectors;
        # it is therefore bound twice (SELECT and ORDER BY).
        rows = self._conn.execute(
            f"""
            SELECT
                COALESCE(c.display_text, c.embeddable_text) AS display_text,
                c.embeddable_text AS embeddable_text,
                array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) AS cosine_distance,
                dr.full_type,
                dr.source_id,
            FROM
                {table_name} e
                JOIN chunk c ON e.chunk_id = c.chunk_id
                JOIN datasource_run dr ON c.datasource_run_id = dr.datasource_run_id
            WHERE
                dr.run_id = ?
                AND cosine_distance < ?
            ORDER BY
                array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) ASC
            LIMIT ?
            """,
            [list(retrieve_vec), run_id, self._DEFAULT_DISTANCE_THRESHOLD, list(retrieve_vec), limit],
        ).fetchall()

        return [
            VectorSearchResult(
                display_text=row[0],
                embeddable_text=row[1],
                cosine_distance=row[2],
                datasource_type=DatasourceType(full_type=row[3]),
                datasource_id=DatasourceId.from_string_repr(row[4]),
            )
            for row in rows
        ]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@contextmanager
def transaction(conn: duckdb.DuckDBPyConnection):
    """Run the enclosed statements in a single DuckDB transaction.

    Commits when the block exits normally; rolls back and re-raises when the
    block raises.
    """
    conn.execute("BEGIN")
    try:
        yield
        # COMMIT stays inside the try on purpose: if the commit itself fails,
        # the except clause still issues a ROLLBACK before re-raising.
        conn.execute("COMMIT")
    except Exception:
        conn.execute("ROLLBACK")
        raise
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# Deliberately private: consumers call get_dce_path() instead, and tests
# substitute this value directly.
_dce_path = Path(os.environ.get("DATABAO_CONTEXT_ENGINE_PATH") or "~/.dce").expanduser().resolve()


def get_dce_path() -> Path:
    """Return the root directory holding DCE state.

    Defaults to ``~/.dce``; overridable via the DATABAO_CONTEXT_ENGINE_PATH
    environment variable (resolved once at import time).
    """
    return _dce_path


def get_db_path() -> Path:
    """Return the location of the DuckDB database file under the DCE root."""
    return get_dce_path().joinpath("dce.duckdb")
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from jinja2.sandbox import SandboxedEnvironment
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DceTemplateError(Exception):
    """Base class for errors raised while rendering DCE templates."""

    pass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UnknownEnvVarTemplateError(DceTemplateError):
    """Raised when a template references an unset env var and no default is given."""

    pass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def render_template(source: str) -> str:
    """Render *source* as a sandboxed Jinja2 template.

    The template is given a single callable, ``env_var``, for resolving
    environment variables (with an optional default).
    """
    template = SandboxedEnvironment().from_string(source=str(source))
    return template.render(env_var=resolve_env_var)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def resolve_env_var(env_var: str, default: str | None = None) -> str:
|
|
21
|
+
if env_var in os.environ:
|
|
22
|
+
return os.environ[env_var]
|
|
23
|
+
|
|
24
|
+
if default is not None:
|
|
25
|
+
return default
|
|
26
|
+
|
|
27
|
+
raise UnknownEnvVarTemplateError(
|
|
28
|
+
f"Error in template. The environment variable {env_var} is missing and no default was provided"
|
|
29
|
+
)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: databao-context-engine
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Automatically generates governed semantic context from databases, documents, and BI tools for use with any LLM
|
|
5
|
+
Requires-Dist: click>=8.3.0
|
|
6
|
+
Requires-Dist: duckdb>=1.4.3
|
|
7
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
8
|
+
Requires-Dist: requests>=2.32.5
|
|
9
|
+
Requires-Dist: pymysql>=1.1.2
|
|
10
|
+
Requires-Dist: clickhouse-connect>=0.10.0
|
|
11
|
+
Requires-Dist: mcp>=1.23.3
|
|
12
|
+
Requires-Dist: pyathena>=3.22.0
|
|
13
|
+
Requires-Dist: snowflake-connector-python>=4.1.0
|
|
14
|
+
Requires-Dist: mssql-python>=1.0.0
|
|
15
|
+
Requires-Dist: pydantic>=2.12.4
|
|
16
|
+
Requires-Dist: jinja2>=3.1.6
|
|
17
|
+
Requires-Dist: asyncpg>=0.31.0
|
|
18
|
+
Requires-Dist: asyncio>=4.0.0
|
|
19
|
+
Requires-Dist: asyncpg-stubs>=0.31.1
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
[](https://confluence.jetbrains.com/display/ALL/JetBrains+on+GitHub)
|
|
24
|
+
[](https://github.com/JetBrains/databao-context-engine/blob/main/LICENSE)
|
|
25
|
+
|
|
26
|
+
[//]: # ([](https://pypi.org/project/databao-context-engine))
|
|
27
|
+
|
|
28
|
+
[//]: # ([](https://pypi.org/project/databao-context-engine/))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
<h1 align="center">Databao Context Engine</h1>
|
|
32
|
+
<p align="center">
|
|
33
|
+
<b>Semantic context for your LLMs — generated automatically.</b><br/>
|
|
34
|
+
No more copying schemas. No manual documentation. Just accurate answers.
|
|
35
|
+
</p>
|
|
36
|
+
<p align="center">
|
|
37
|
+
<a href="https://databao.app">Website</a>
|
|
38
|
+
|
|
39
|
+
[//]: # (•)
|
|
40
|
+
|
|
41
|
+
[//]: # ( <a href="#quickstart">Quickstart</a> •)
|
|
42
|
+
|
|
43
|
+
[//]: # ( <a href="#supported-data-sources">Data Sources</a> •)
|
|
44
|
+
|
|
45
|
+
[//]: # ( <a href="#contributing">Contributing</a>)
|
|
46
|
+
</p>
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## What is Databao Context Engine?
|
|
51
|
+
|
|
52
|
+
Databao Context Engine **automatically generates governed semantic context** from your databases, BI tools, documents, and spreadsheets.
|
|
53
|
+
|
|
54
|
+
Integrate it with any LLM to deliver **accurate, context-aware answers** — without copying schemas or writing documentation by hand.
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
Your data sources → Context Engine → Unified semantic graph → Any LLM
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Why choose Databao Context Engine?
|
|
61
|
+
|
|
62
|
+
| Feature | What it means for you |
|
|
63
|
+
|----------------------------|----------------------------------------------------------------|
|
|
64
|
+
| **Auto-generated context** | Extracts schemas, relationships, and semantics automatically |
|
|
65
|
+
| **Runs locally** | Your data never leaves your environment |
|
|
66
|
+
| **MCP integration** | Works with Claude Desktop, Cursor, and any MCP-compatible tool |
|
|
67
|
+
| **Multiple sources** | Databases, dbt projects, spreadsheets, documents |
|
|
68
|
+
| **Built-in benchmarks** | Measure and improve context quality over time |
|
|
69
|
+
| **LLM agnostic** | OpenAI, Anthropic, Ollama, Gemini — use any model |
|
|
70
|
+
| **Governed & versioned** | Track, version, and share context across your team |
|
|
71
|
+
| **Dynamic or static** | Serve context via MCP server or export as artifact |
|
|
72
|
+
|
|
73
|
+
# Prerequisites
|
|
74
|
+
|
|
75
|
+
This README assumes you will use `uv` as your package manager.
|
|
76
|
+
|
|
77
|
+
You can install it following the instructions [here](https://docs.astral.sh/uv/getting-started/installation/)
|
|
78
|
+
|
|
79
|
+
If you are going to push to the repository, please make sure to install git pre-commit hooks by running
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run pre-commit install
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
# How to run?
|
|
86
|
+
|
|
87
|
+
You can run it with:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
uv run dce info
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Not providing the `info` subcommand or using the `--help` flag will show the help screen for the command.
|
|
94
|
+
|
|
95
|
+
## Using the dce command directly
|
|
96
|
+
|
|
97
|
+
To be able to use the `dce` command directly (without using `uv run` or `python`) there are two options.
|
|
98
|
+
|
|
99
|
+
### Installing dce locally
|
|
100
|
+
|
|
101
|
+
For that one needs to:
|
|
102
|
+
|
|
103
|
+
1. Build the project by running:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
uv build
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
2. Install the project on your machine by running:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
uv tool install -e .
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
This second step will install the `dce` script on your machine and add it into your path.
|
|
116
|
+
|
|
117
|
+
### Create dce alias using nix
|
|
118
|
+
|
|
119
|
+
This method will simply create a new shell environment with `dce` alias. For that one needs to install `nix` package
|
|
120
|
+
manager (https://nixos.org/download/). After that one could simply run in the project root
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
$ nix-shell
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
which is a short version of `$ nix-shell shell.nix`.
|
|
127
|
+
|
|
128
|
+
Alternatively, one could specify the path to the project repository
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
$ nix-shell {path_to_dce_repository}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
After that, you can then directly use:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
dce --help
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Note: when we actually release our built Python package, users that don't use `uv` will still be able to install the CLI
|
|
141
|
+
by using `pipx install` instead.
|
|
142
|
+
|
|
143
|
+
# Running Mypy
|
|
144
|
+
|
|
145
|
+
[mypy](https://mypy.readthedocs.io/en/stable/getting_started.html) has been added to the project for type checking.
|
|
146
|
+
|
|
147
|
+
You can run it with the following:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
uv run mypy src --exclude "test_*" --exclude dist
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
NB: the above runs type checking on all files within the `src` directory, excluding all test files.
|
|
154
|
+
|
|
155
|
+
# Running tests
|
|
156
|
+
|
|
157
|
+
You can run the tests with:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
uv run pytest
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
(there is currently one test succeeding and one test failing in the project)
|
|
164
|
+
|
|
165
|
+
# Generating JSON Schemas for our plugin's config files
|
|
166
|
+
|
|
167
|
+
To be able to build a datasource, each plugin requires a yaml config file that describes how to connect to the
|
|
168
|
+
datasource,
|
|
169
|
+
as well as other information needed to customise the plugin.
|
|
170
|
+
|
|
171
|
+
To document what each config file should look like, we can generate a JSON schema describing the fields allowed in that
|
|
172
|
+
file.
|
|
173
|
+
|
|
174
|
+
You can generate all JSON schemas for all plugins by running:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
uv run generate_configs_schemas
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Some options can be provided to the command to choose which plugins to include or exclude from the generation.
|
|
181
|
+
To see the options available, you can refer to the help:
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
uv run generate_configs_schemas --help
|
|
185
|
+
```
|
|
186
|
+
|