databao-context-engine 0.1.2__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +18 -6
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +27 -23
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +84 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +3 -7
- databao_context_engine/cli/add_datasource_config.py +41 -15
- databao_context_engine/cli/commands.py +12 -43
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +137 -0
- databao_context_engine/databao_context_project_manager.py +96 -6
- databao_context_engine/datasources/add_config.py +34 -0
- databao_context_engine/{datasource_config → datasources}/check_config.py +18 -7
- databao_context_engine/datasources/datasource_context.py +93 -0
- databao_context_engine/{project → datasources}/datasource_discovery.py +17 -16
- databao_context_engine/{project → datasources}/types.py +64 -15
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +67 -53
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/install.py +13 -20
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +10 -10
- databao_context_engine/plugin_loader.py +111 -0
- databao_context_engine/pluginlib/build_plugin.py +25 -9
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/base_db_plugin.py +5 -2
- databao_context_engine/plugins/databases/athena_introspector.py +85 -22
- databao_context_engine/plugins/databases/base_introspector.py +5 -3
- databao_context_engine/plugins/databases/clickhouse_introspector.py +22 -11
- databao_context_engine/plugins/databases/duckdb_introspector.py +3 -5
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/databases/mssql_introspector.py +26 -17
- databao_context_engine/plugins/databases/mysql_introspector.py +23 -12
- databao_context_engine/plugins/databases/postgresql_introspector.py +2 -2
- databao_context_engine/plugins/databases/snowflake_introspector.py +43 -10
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/plugin_loader.py +43 -42
- databao_context_engine/plugins/resources/parquet_introspector.py +7 -19
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +3 -3
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/{internal/export_results.py → export_results.py} +2 -2
- databao_context_engine/retrieve_embeddings/{internal/retrieve_runner.py → retrieve_runner.py} +5 -9
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +3 -17
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +49 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +1 -1
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +2 -4
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +8 -13
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.4.dev1.dist-info/METADATA +75 -0
- databao_context_engine-0.1.4.dev1.dist-info/RECORD +125 -0
- {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.2.dist-info/METADATA +0 -187
- databao_context_engine-0.1.2.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → serialization}/__init__.py +0 -0
- {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/entry_points.txt +0 -0
|
@@ -5,7 +5,7 @@ from typing import cast
|
|
|
5
5
|
from databao_context_engine.llm.descriptions.provider import DescriptionProvider
|
|
6
6
|
from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
|
|
7
7
|
from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
|
|
8
|
-
from databao_context_engine.
|
|
8
|
+
from databao_context_engine.serialization.yaml import to_yaml_string
|
|
9
9
|
from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
|
|
10
10
|
from databao_context_engine.services.models import ChunkEmbedding
|
|
11
11
|
from databao_context_engine.services.persistence_service import PersistenceService
|
|
@@ -14,9 +14,22 @@ logger = logging.getLogger(__name__)
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class ChunkEmbeddingMode(Enum):
|
|
17
|
+
"""Mode controlling how chunks are embedded."""
|
|
18
|
+
|
|
17
19
|
EMBEDDABLE_TEXT_ONLY = "EMBEDDABLE_TEXT_ONLY"
|
|
20
|
+
"""
|
|
21
|
+
The embedding is generated only from the string defined by the plugin as embeddable for a chunk.
|
|
22
|
+
"""
|
|
23
|
+
|
|
18
24
|
GENERATED_DESCRIPTION_ONLY = "GENERATED_DESCRIPTION_ONLY"
|
|
25
|
+
"""
|
|
26
|
+
The embedding is generated only from a description of the chunk generated by an LLM.
|
|
27
|
+
"""
|
|
28
|
+
|
|
19
29
|
EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION = "EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION"
|
|
30
|
+
"""
|
|
31
|
+
The embedding is generated from both the embeddable string of the chunk and the description of the chunk generated by an LLM.
|
|
32
|
+
"""
|
|
20
33
|
|
|
21
34
|
def should_generate_description(self) -> bool:
|
|
22
35
|
return self in (
|
|
@@ -44,26 +57,24 @@ class ChunkEmbeddingService:
|
|
|
44
57
|
if self._chunk_embedding_mode.should_generate_description() and description_provider is None:
|
|
45
58
|
raise ValueError("A DescriptionProvider must be provided when generating descriptions")
|
|
46
59
|
|
|
47
|
-
def embed_chunks(self, *,
|
|
48
|
-
"""
|
|
49
|
-
Turn plugin chunks into persisted chunks and embeddings
|
|
60
|
+
def embed_chunks(self, *, chunks: list[EmbeddableChunk], result: str, full_type: str, datasource_id: str) -> None:
|
|
61
|
+
"""Turn plugin chunks into persisted chunks and embeddings.
|
|
50
62
|
|
|
51
63
|
Flow:
|
|
52
|
-
1) Embed each chunk into an embedded vector
|
|
53
|
-
2) Get or create embedding table for the appropriate model and embedding dimensions
|
|
54
|
-
3) Persist chunks and embeddings vectors in a single transaction
|
|
64
|
+
1) Embed each chunk into an embedded vector.
|
|
65
|
+
2) Get or create embedding table for the appropriate model and embedding dimensions.
|
|
66
|
+
3) Persist chunks and embeddings vectors in a single transaction.
|
|
55
67
|
"""
|
|
56
|
-
|
|
57
68
|
if not chunks:
|
|
58
69
|
return
|
|
59
70
|
|
|
60
71
|
logger.debug(
|
|
61
|
-
f"Embedding {len(chunks)} chunks for datasource
|
|
72
|
+
f"Embedding {len(chunks)} chunks for datasource {datasource_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
|
|
62
73
|
)
|
|
63
74
|
|
|
64
75
|
enriched_embeddings: list[ChunkEmbedding] = []
|
|
65
76
|
for chunk in chunks:
|
|
66
|
-
chunk_display_text = to_yaml_string(chunk.content)
|
|
77
|
+
chunk_display_text = chunk.content if isinstance(chunk.content, str) else to_yaml_string(chunk.content)
|
|
67
78
|
|
|
68
79
|
generated_description = ""
|
|
69
80
|
match self._chunk_embedding_mode:
|
|
@@ -98,7 +109,8 @@ class ChunkEmbeddingService:
|
|
|
98
109
|
)
|
|
99
110
|
|
|
100
111
|
self._persistence_service.write_chunks_and_embeddings(
|
|
101
|
-
datasource_run_id=datasource_run_id,
|
|
102
112
|
chunk_embeddings=enriched_embeddings,
|
|
103
113
|
table_name=table_name,
|
|
114
|
+
full_type=full_type,
|
|
115
|
+
datasource_id=datasource_id,
|
|
104
116
|
)
|
|
@@ -1,20 +1,15 @@
|
|
|
1
|
-
from
|
|
1
|
+
from duckdb import DuckDBPyConnection
|
|
2
2
|
|
|
3
|
-
from databao_context_engine.build_sources.internal.build_service import BuildService
|
|
4
3
|
from databao_context_engine.llm.descriptions.provider import DescriptionProvider
|
|
5
4
|
from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
|
|
6
|
-
from databao_context_engine.retrieve_embeddings.internal.retrieve_service import RetrieveService
|
|
7
5
|
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode, ChunkEmbeddingService
|
|
8
6
|
from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
|
|
9
7
|
from databao_context_engine.services.persistence_service import PersistenceService
|
|
10
8
|
from databao_context_engine.services.table_name_policy import TableNamePolicy
|
|
11
9
|
from databao_context_engine.storage.repositories.factories import (
|
|
12
10
|
create_chunk_repository,
|
|
13
|
-
create_datasource_run_repository,
|
|
14
11
|
create_embedding_repository,
|
|
15
12
|
create_registry_repository,
|
|
16
|
-
create_run_repository,
|
|
17
|
-
create_vector_search_repository,
|
|
18
13
|
)
|
|
19
14
|
|
|
20
15
|
|
|
@@ -46,43 +41,3 @@ def create_chunk_embedding_service(
|
|
|
46
41
|
description_provider=description_provider,
|
|
47
42
|
chunk_embedding_mode=chunk_embedding_mode,
|
|
48
43
|
)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def create_build_service(
|
|
52
|
-
conn: DuckDBPyConnection,
|
|
53
|
-
*,
|
|
54
|
-
embedding_provider: EmbeddingProvider,
|
|
55
|
-
description_provider: DescriptionProvider | None,
|
|
56
|
-
chunk_embedding_mode: ChunkEmbeddingMode,
|
|
57
|
-
) -> BuildService:
|
|
58
|
-
run_repo = create_run_repository(conn)
|
|
59
|
-
datasource_run_repo = create_datasource_run_repository(conn)
|
|
60
|
-
chunk_embedding_service = create_chunk_embedding_service(
|
|
61
|
-
conn,
|
|
62
|
-
embedding_provider=embedding_provider,
|
|
63
|
-
description_provider=description_provider,
|
|
64
|
-
chunk_embedding_mode=chunk_embedding_mode,
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
return BuildService(
|
|
68
|
-
run_repo=run_repo,
|
|
69
|
-
datasource_run_repo=datasource_run_repo,
|
|
70
|
-
chunk_embedding_service=chunk_embedding_service,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def create_retrieve_service(
|
|
75
|
-
conn: DuckDBPyConnection,
|
|
76
|
-
*,
|
|
77
|
-
embedding_provider: EmbeddingProvider,
|
|
78
|
-
) -> RetrieveService:
|
|
79
|
-
run_repo = create_run_repository(conn)
|
|
80
|
-
vector_search_repo = create_vector_search_repository(conn)
|
|
81
|
-
shard_resolver = create_shard_resolver(conn)
|
|
82
|
-
|
|
83
|
-
return RetrieveService(
|
|
84
|
-
run_repo=run_repo,
|
|
85
|
-
vector_search_repo=vector_search_repo,
|
|
86
|
-
shard_resolver=shard_resolver,
|
|
87
|
-
provider=embedding_provider,
|
|
88
|
-
)
|
|
@@ -24,11 +24,13 @@ class PersistenceService:
|
|
|
24
24
|
self._dim = dim
|
|
25
25
|
|
|
26
26
|
def write_chunks_and_embeddings(
|
|
27
|
-
self, *,
|
|
27
|
+
self, *, chunk_embeddings: list[ChunkEmbedding], table_name: str, full_type: str, datasource_id: str
|
|
28
28
|
):
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
"""Atomically persist chunks and their vectors.
|
|
30
|
+
|
|
31
|
+
Raises:
|
|
32
|
+
ValueError: If chunk_embeddings is an empty list.
|
|
33
|
+
|
|
32
34
|
"""
|
|
33
35
|
if not chunk_embeddings:
|
|
34
36
|
raise ValueError("chunk_embeddings must be a non-empty list")
|
|
@@ -36,21 +38,19 @@ class PersistenceService:
|
|
|
36
38
|
with transaction(self._conn):
|
|
37
39
|
for chunk_embedding in chunk_embeddings:
|
|
38
40
|
chunk_dto = self.create_chunk(
|
|
39
|
-
|
|
41
|
+
full_type=full_type,
|
|
42
|
+
datasource_id=datasource_id,
|
|
40
43
|
embeddable_text=chunk_embedding.chunk.embeddable_text,
|
|
41
44
|
display_text=chunk_embedding.display_text,
|
|
42
|
-
generated_description=chunk_embedding.generated_description,
|
|
43
45
|
)
|
|
44
46
|
self.create_embedding(table_name=table_name, chunk_id=chunk_dto.chunk_id, vec=chunk_embedding.vec)
|
|
45
47
|
|
|
46
|
-
def create_chunk(
|
|
47
|
-
self, *, datasource_run_id: int, embeddable_text: str, display_text: str, generated_description: str
|
|
48
|
-
) -> ChunkDTO:
|
|
48
|
+
def create_chunk(self, *, full_type: str, datasource_id: str, embeddable_text: str, display_text: str) -> ChunkDTO:
|
|
49
49
|
return self._chunk_repo.create(
|
|
50
|
-
|
|
50
|
+
full_type=full_type,
|
|
51
|
+
datasource_id=datasource_id,
|
|
51
52
|
embeddable_text=embeddable_text,
|
|
52
53
|
display_text=display_text,
|
|
53
|
-
generated_description=generated_description,
|
|
54
54
|
)
|
|
55
55
|
|
|
56
56
|
def create_embedding(self, *, table_name: str, chunk_id: int, vec: Sequence[float]):
|
|
@@ -1,24 +1,28 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from contextlib import contextmanager
|
|
3
3
|
from pathlib import Path
|
|
4
|
+
from typing import Iterator
|
|
4
5
|
|
|
5
6
|
import duckdb
|
|
6
|
-
|
|
7
|
-
from databao_context_engine.system.properties import get_db_path
|
|
7
|
+
from duckdb import DuckDBPyConnection
|
|
8
8
|
|
|
9
9
|
logger = logging.getLogger(__name__)
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@contextmanager
|
|
13
|
-
def open_duckdb_connection(db_path: str | Path
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def open_duckdb_connection(db_path: str | Path) -> Iterator[DuckDBPyConnection]:
|
|
14
|
+
"""Open a DuckDB connection with vector search enabled and close it on exit.
|
|
15
|
+
|
|
16
|
+
It also loads the vss extension and enables HNSW experimental persistence on the DuckDB.
|
|
17
17
|
|
|
18
18
|
Usage:
|
|
19
19
|
with open_duckdb_connection() as conn:
|
|
20
|
+
|
|
21
|
+
Yields:
|
|
22
|
+
The opened DuckDB connection.
|
|
23
|
+
|
|
20
24
|
"""
|
|
21
|
-
path = str(db_path
|
|
25
|
+
path = str(db_path)
|
|
22
26
|
conn = duckdb.connect(path)
|
|
23
27
|
logger.debug(f"Connected to DuckDB database at {path}")
|
|
24
28
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
class RepositoryError(Exception):
|
|
2
|
-
"""Base exception for repository errors"""
|
|
2
|
+
"""Base exception for repository errors."""
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class IntegrityError(RepositoryError):
|
|
6
|
-
"""Raised when a DB constraint is violated"""
|
|
6
|
+
"""Raised when a DB constraint is violated."""
|
|
@@ -8,12 +8,10 @@ from typing import LiteralString
|
|
|
8
8
|
|
|
9
9
|
import duckdb
|
|
10
10
|
|
|
11
|
-
from databao_context_engine.system.properties import get_db_path
|
|
12
|
-
|
|
13
11
|
logger = logging.getLogger(__name__)
|
|
14
12
|
|
|
15
13
|
|
|
16
|
-
def migrate(db_path: str | Path
|
|
14
|
+
def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
|
|
17
15
|
if migration_files is None:
|
|
18
16
|
migration_files = [
|
|
19
17
|
migration
|
|
@@ -21,7 +19,7 @@ def migrate(db_path: str | Path | None = None, migration_files: list[Path] | Non
|
|
|
21
19
|
if isinstance(migration, Path) and ".sql" == migration.suffix
|
|
22
20
|
]
|
|
23
21
|
|
|
24
|
-
db = Path(db_path
|
|
22
|
+
db = Path(db_path).expanduser().resolve()
|
|
25
23
|
db.parent.mkdir(parents=True, exist_ok=True)
|
|
26
24
|
logger.debug("Running migrations on database: %s", db)
|
|
27
25
|
|
|
@@ -2,36 +2,15 @@ INSTALL vss;
|
|
|
2
2
|
LOAD vss;
|
|
3
3
|
SET hnsw_enable_experimental_persistence = true;
|
|
4
4
|
|
|
5
|
-
CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
|
|
6
|
-
CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
|
|
7
5
|
CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;
|
|
8
6
|
|
|
9
|
-
CREATE TABLE IF NOT EXISTS run (
|
|
10
|
-
run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
|
|
11
|
-
project_id TEXT NOT NULL,
|
|
12
|
-
started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
13
|
-
ended_at TIMESTAMP,
|
|
14
|
-
nemory_version TEXT,
|
|
15
|
-
run_name TEXT NOT NULL,
|
|
16
|
-
);
|
|
17
|
-
|
|
18
|
-
CREATE TABLE IF NOT EXISTS datasource_run (
|
|
19
|
-
datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
|
|
20
|
-
run_id BIGINT NOT NULL REFERENCES run(run_id),
|
|
21
|
-
plugin TEXT NOT NULL,
|
|
22
|
-
source_id TEXT NOT NULL,
|
|
23
|
-
storage_directory TEXT NOT NULL,
|
|
24
|
-
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
25
|
-
full_type TEXT NOT NULL,
|
|
26
|
-
);
|
|
27
|
-
|
|
28
7
|
CREATE TABLE IF NOT EXISTS chunk (
|
|
29
|
-
chunk_id
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
8
|
+
chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
|
|
9
|
+
full_type TEXT NOT NULL,
|
|
10
|
+
datasource_id TEXT NOT NULL,
|
|
11
|
+
embeddable_text TEXT NOT NULL,
|
|
12
|
+
display_text TEXT,
|
|
13
|
+
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
35
14
|
);
|
|
36
15
|
|
|
37
16
|
CREATE TABLE IF NOT EXISTS embedding_model_registry (
|
|
@@ -57,7 +36,3 @@ OR IGNORE INTO
|
|
|
57
36
|
embedding_model_registry(embedder, model_id, dim, table_name)
|
|
58
37
|
VALUES
|
|
59
38
|
('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');
|
|
60
|
-
|
|
61
|
-
CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
|
|
62
|
-
CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
|
|
63
|
-
CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);
|
|
@@ -4,34 +4,13 @@ from datetime import datetime
|
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
@dataclass(frozen=True)
|
|
8
|
-
class RunDTO:
|
|
9
|
-
run_id: int
|
|
10
|
-
run_name: str
|
|
11
|
-
project_id: str
|
|
12
|
-
started_at: datetime
|
|
13
|
-
ended_at: Optional[datetime]
|
|
14
|
-
nemory_version: str
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass(frozen=True)
|
|
18
|
-
class DatasourceRunDTO:
|
|
19
|
-
datasource_run_id: int
|
|
20
|
-
run_id: int
|
|
21
|
-
plugin: str
|
|
22
|
-
full_type: str
|
|
23
|
-
source_id: str
|
|
24
|
-
storage_directory: str
|
|
25
|
-
created_at: datetime
|
|
26
|
-
|
|
27
|
-
|
|
28
7
|
@dataclass(frozen=True)
|
|
29
8
|
class ChunkDTO:
|
|
30
9
|
chunk_id: int
|
|
31
|
-
|
|
10
|
+
full_type: str
|
|
11
|
+
datasource_id: str
|
|
32
12
|
embeddable_text: str
|
|
33
13
|
display_text: Optional[str]
|
|
34
|
-
generated_description: str
|
|
35
14
|
created_at: datetime
|
|
36
15
|
|
|
37
16
|
|
|
@@ -14,22 +14,22 @@ class ChunkRepository:
|
|
|
14
14
|
def create(
|
|
15
15
|
self,
|
|
16
16
|
*,
|
|
17
|
-
|
|
17
|
+
full_type: str,
|
|
18
|
+
datasource_id: str,
|
|
18
19
|
embeddable_text: str,
|
|
19
20
|
display_text: Optional[str],
|
|
20
|
-
generated_description: str,
|
|
21
21
|
) -> ChunkDTO:
|
|
22
22
|
try:
|
|
23
23
|
row = self._conn.execute(
|
|
24
24
|
"""
|
|
25
25
|
INSERT INTO
|
|
26
|
-
chunk(
|
|
26
|
+
chunk(full_type, datasource_id, embeddable_text, display_text)
|
|
27
27
|
VALUES
|
|
28
28
|
(?, ?, ?, ?)
|
|
29
29
|
RETURNING
|
|
30
30
|
*
|
|
31
31
|
""",
|
|
32
|
-
[
|
|
32
|
+
[full_type, datasource_id, embeddable_text, display_text],
|
|
33
33
|
).fetchone()
|
|
34
34
|
if row is None:
|
|
35
35
|
raise RuntimeError("chunk creation returned no object")
|
|
@@ -55,22 +55,26 @@ class ChunkRepository:
|
|
|
55
55
|
self,
|
|
56
56
|
chunk_id: int,
|
|
57
57
|
*,
|
|
58
|
+
full_type: Optional[str] = None,
|
|
59
|
+
datasource_id: Optional[str] = None,
|
|
58
60
|
embeddable_text: Optional[str] = None,
|
|
59
61
|
display_text: Optional[str] = None,
|
|
60
|
-
generated_description: Optional[str] = None,
|
|
61
62
|
) -> Optional[ChunkDTO]:
|
|
62
63
|
sets: list[Any] = []
|
|
63
64
|
params: list[Any] = []
|
|
64
65
|
|
|
66
|
+
if full_type is not None:
|
|
67
|
+
sets.append("full_type = ?")
|
|
68
|
+
params.append(full_type)
|
|
69
|
+
if datasource_id is not None:
|
|
70
|
+
sets.append("datasource_id = ?")
|
|
71
|
+
params.append(datasource_id)
|
|
65
72
|
if embeddable_text is not None:
|
|
66
73
|
sets.append("embeddable_text = ?")
|
|
67
74
|
params.append(embeddable_text)
|
|
68
75
|
if display_text is not None:
|
|
69
76
|
sets.append("display_text = ?")
|
|
70
77
|
params.append(display_text)
|
|
71
|
-
if generated_description is not None:
|
|
72
|
-
sets.append("generated_description = ?")
|
|
73
|
-
params.append(generated_description)
|
|
74
78
|
|
|
75
79
|
if not sets:
|
|
76
80
|
return self.get(chunk_id)
|
|
@@ -119,12 +123,12 @@ class ChunkRepository:
|
|
|
119
123
|
|
|
120
124
|
@staticmethod
|
|
121
125
|
def _row_to_dto(row: Tuple) -> ChunkDTO:
|
|
122
|
-
chunk_id,
|
|
126
|
+
chunk_id, full_type, datasource_id, embeddable_text, display_text, created_at = row
|
|
123
127
|
return ChunkDTO(
|
|
124
128
|
chunk_id=int(chunk_id),
|
|
125
|
-
|
|
126
|
-
|
|
129
|
+
full_type=full_type,
|
|
130
|
+
datasource_id=datasource_id,
|
|
131
|
+
embeddable_text=embeddable_text,
|
|
127
132
|
display_text=display_text,
|
|
128
|
-
generated_description=str(generated_description),
|
|
129
133
|
created_at=created_at,
|
|
130
134
|
)
|
|
@@ -1,24 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from duckdb import DuckDBPyConnection
|
|
2
2
|
|
|
3
|
-
from databao_context_engine.services.run_name_policy import RunNamePolicy
|
|
4
3
|
from databao_context_engine.storage.repositories.chunk_repository import ChunkRepository
|
|
5
|
-
from databao_context_engine.storage.repositories.datasource_run_repository import DatasourceRunRepository
|
|
6
4
|
from databao_context_engine.storage.repositories.embedding_model_registry_repository import (
|
|
7
5
|
EmbeddingModelRegistryRepository,
|
|
8
6
|
)
|
|
9
7
|
from databao_context_engine.storage.repositories.embedding_repository import EmbeddingRepository
|
|
10
|
-
from databao_context_engine.storage.repositories.run_repository import RunRepository
|
|
11
8
|
from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchRepository
|
|
12
9
|
|
|
13
10
|
|
|
14
|
-
def create_run_repository(conn: DuckDBPyConnection) -> RunRepository:
|
|
15
|
-
return RunRepository(conn, run_name_policy=RunNamePolicy())
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def create_datasource_run_repository(conn: DuckDBPyConnection) -> DatasourceRunRepository:
|
|
19
|
-
return DatasourceRunRepository(conn)
|
|
20
|
-
|
|
21
|
-
|
|
22
11
|
def create_chunk_repository(conn: DuckDBPyConnection) -> ChunkRepository:
|
|
23
12
|
return ChunkRepository(conn)
|
|
24
13
|
|
|
@@ -3,8 +3,8 @@ from dataclasses import dataclass
|
|
|
3
3
|
|
|
4
4
|
import duckdb
|
|
5
5
|
|
|
6
|
+
from databao_context_engine.datasources.types import DatasourceId
|
|
6
7
|
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
7
|
-
from databao_context_engine.project.types import DatasourceId
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
@dataclass(kw_only=True, frozen=True)
|
|
@@ -23,32 +23,27 @@ class VectorSearchRepository:
|
|
|
23
23
|
self._conn = conn
|
|
24
24
|
|
|
25
25
|
def get_display_texts_by_similarity(
|
|
26
|
-
self, *, table_name: str,
|
|
26
|
+
self, *, table_name: str, retrieve_vec: Sequence[float], dimension: int, limit: int
|
|
27
27
|
) -> list[VectorSearchResult]:
|
|
28
|
-
"""
|
|
29
|
-
Read only similarity search on a specific embedding shard table.
|
|
30
|
-
Returns the display text for the closest matches in a given run
|
|
31
|
-
"""
|
|
28
|
+
"""Read only similarity search on a specific embedding shard table."""
|
|
32
29
|
rows = self._conn.execute(
|
|
33
30
|
f"""
|
|
34
31
|
SELECT
|
|
35
32
|
COALESCE(c.display_text, c.embeddable_text) AS display_text,
|
|
36
|
-
c.embeddable_text
|
|
33
|
+
c.embeddable_text,
|
|
37
34
|
array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) AS cosine_distance,
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
c.full_type,
|
|
36
|
+
c.datasource_id,
|
|
40
37
|
FROM
|
|
41
38
|
{table_name} e
|
|
42
39
|
JOIN chunk c ON e.chunk_id = c.chunk_id
|
|
43
|
-
JOIN datasource_run dr ON c.datasource_run_id = dr.datasource_run_id
|
|
44
40
|
WHERE
|
|
45
|
-
|
|
46
|
-
AND cosine_distance < ?
|
|
41
|
+
cosine_distance < ?
|
|
47
42
|
ORDER BY
|
|
48
43
|
array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) ASC
|
|
49
44
|
LIMIT ?
|
|
50
45
|
""",
|
|
51
|
-
[list(retrieve_vec),
|
|
46
|
+
[list(retrieve_vec), self._DEFAULT_DISTANCE_THRESHOLD, list(retrieve_vec), limit],
|
|
52
47
|
).fetchall()
|
|
53
48
|
|
|
54
49
|
return [
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from databao_context_engine.project.layout import get_output_dir
|
|
5
|
+
|
|
4
6
|
# It's private, so it doesn't get imported directly. This value is mocked in tests.
|
|
5
7
|
_dce_path = Path(os.getenv("DATABAO_CONTEXT_ENGINE_PATH") or "~/.dce").expanduser().resolve()
|
|
6
8
|
|
|
@@ -9,5 +11,5 @@ def get_dce_path() -> Path:
|
|
|
9
11
|
return _dce_path
|
|
10
12
|
|
|
11
13
|
|
|
12
|
-
def get_db_path() -> Path:
|
|
13
|
-
return
|
|
14
|
+
def get_db_path(project_dir: Path) -> Path:
|
|
15
|
+
return get_output_dir(project_dir) / "dce.duckdb"
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: databao-context-engine
|
|
3
|
+
Version: 0.1.4.dev1
|
|
4
|
+
Summary: Semantic context for your LLMs — generated automatically
|
|
5
|
+
Requires-Dist: click>=8.3.0
|
|
6
|
+
Requires-Dist: duckdb>=1.4.3
|
|
7
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
8
|
+
Requires-Dist: requests>=2.32.5
|
|
9
|
+
Requires-Dist: mcp>=1.23.3
|
|
10
|
+
Requires-Dist: pydantic>=2.12.4
|
|
11
|
+
Requires-Dist: jinja2>=3.1.6
|
|
12
|
+
Requires-Dist: pyathena>=3.25.0 ; extra == 'athena'
|
|
13
|
+
Requires-Dist: clickhouse-connect>=0.10.0 ; extra == 'clickhouse'
|
|
14
|
+
Requires-Dist: mssql-python>=1.0.0 ; extra == 'mssql'
|
|
15
|
+
Requires-Dist: pymysql>=1.1.2 ; extra == 'mysql'
|
|
16
|
+
Requires-Dist: asyncpg>=0.31.0 ; extra == 'postgresql'
|
|
17
|
+
Requires-Dist: snowflake-connector-python>=4.2.0 ; extra == 'snowflake'
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Provides-Extra: athena
|
|
20
|
+
Provides-Extra: clickhouse
|
|
21
|
+
Provides-Extra: mssql
|
|
22
|
+
Provides-Extra: mysql
|
|
23
|
+
Provides-Extra: postgresql
|
|
24
|
+
Provides-Extra: snowflake
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
[](https://confluence.jetbrains.com/display/ALL/JetBrains+on+GitHub)
|
|
28
|
+
[](https://github.com/JetBrains/databao-context-engine/blob/main/LICENSE)
|
|
29
|
+
|
|
30
|
+
[//]: # ([](https://pypi.org/project/databao-context-engine))
|
|
31
|
+
|
|
32
|
+
[//]: # ([](https://pypi.org/project/databao-context-engine/))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
<h1 align="center">Databao Context Engine</h1>
|
|
36
|
+
<p align="center">
|
|
37
|
+
<b>Semantic context for your LLMs — generated automatically.</b><br/>
|
|
38
|
+
No more copying schemas. No manual documentation. Just accurate answers.
|
|
39
|
+
</p>
|
|
40
|
+
<p align="center">
|
|
41
|
+
<a href="https://databao.app">Website</a>
|
|
42
|
+
|
|
43
|
+
[//]: # (•)
|
|
44
|
+
|
|
45
|
+
[//]: # ( <a href="#quickstart">Quickstart</a> •)
|
|
46
|
+
|
|
47
|
+
[//]: # ( <a href="#supported-data-sources">Data Sources</a> •)
|
|
48
|
+
|
|
49
|
+
[//]: # ( <a href="#contributing">Contributing</a>)
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## What is Databao Context Engine?
|
|
55
|
+
|
|
56
|
+
Databao Context Engine **automatically generates governed semantic context** from your databases, BI tools, documents, and spreadsheets.
|
|
57
|
+
|
|
58
|
+
Integrate it with any LLM to deliver **accurate, context-aware answers** — without copying schemas or writing documentation by hand.
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
Your data sources → Context Engine → Unified semantic graph → Any LLM
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Why choose Databao Context Engine?
|
|
65
|
+
|
|
66
|
+
| Feature | What it means for you |
|
|
67
|
+
|----------------------------|----------------------------------------------------------------|
|
|
68
|
+
| **Auto-generated context** | Extracts schemas, relationships, and semantics automatically |
|
|
69
|
+
| **Runs locally** | Your data never leaves your environment |
|
|
70
|
+
| **MCP integration** | Works with Claude Desktop, Cursor, and any MCP-compatible tool |
|
|
71
|
+
| **Multiple sources** | Databases, dbt projects, spreadsheets, documents |
|
|
72
|
+
| **Built-in benchmarks** | Measure and improve context quality over time |
|
|
73
|
+
| **LLM agnostic** | OpenAI, Anthropic, Ollama, Gemini — use any model |
|
|
74
|
+
| **Governed & versioned** | Track, version, and share context across your team |
|
|
75
|
+
| **Dynamic or static** | Serve context via MCP server or export as artifact |
|