databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +32 -7
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +82 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
- databao_context_engine/cli/add_datasource_config.py +49 -44
- databao_context_engine/cli/commands.py +40 -55
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +127 -0
- databao_context_engine/databao_context_project_manager.py +147 -30
- databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
- databao_context_engine/datasources/datasource_context.py +90 -0
- databao_context_engine/datasources/datasource_discovery.py +143 -0
- databao_context_engine/datasources/types.py +194 -0
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +76 -57
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +26 -30
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +9 -11
- databao_context_engine/plugin_loader.py +110 -0
- databao_context_engine/pluginlib/build_plugin.py +12 -29
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
- databao_context_engine/plugins/databases/base_introspector.py +11 -12
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +58 -52
- databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +14 -15
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
- databao_context_engine/serialization/__init__.py +0 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +3 -5
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.5.dist-info/METADATA +228 -0
- databao_context_engine-0.1.5.dist-info/RECORD +135 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/plugins/databases/athena_introspector.py +0 -101
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/project/datasource_discovery.py +0 -141
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/project/types.py +0 -134
- databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.1.dist-info/METADATA +0 -186
- databao_context_engine-0.1.1.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
- /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
- /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0

databao_context_engine/project/layout.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass
 from pathlib import Path
 
 from databao_context_engine.project.project_config import ProjectConfig
-from databao_context_engine.project.types import DatasourceId
 
 SOURCE_FOLDER_NAME = "src"
 OUTPUT_FOLDER_NAME = "output"
@@ -25,6 +24,14 @@ class ProjectLayout:
     def read_config_file(self) -> ProjectConfig:
         return ProjectConfig.from_file(self.config_file)
 
+    @property
+    def src_dir(self) -> Path:
+        return get_source_dir(self.project_dir)
+
+    @property
+    def output_dir(self) -> Path:
+        return get_output_dir(self.project_dir)
+
 
 def ensure_project_dir(project_dir: Path) -> ProjectLayout:
     return _ProjectValidator(project_dir).ensure_project_dir_valid()
@@ -62,22 +69,14 @@ def get_logs_dir(project_dir: Path) -> Path:
     return project_dir.joinpath(LOGS_FOLDER_NAME)
 
 
-def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
-
-    if config_file.is_file():
-        raise ValueError(f"A config file already exists for {str(datasource_id)}")
-
-    return config_file
-
-
 def create_datasource_config_file(
-
+    project_layout: ProjectLayout, datasource_relative_name: str, config_content: str, overwrite_existing: bool
 ) -> Path:
+    config_file = project_layout.src_dir / datasource_relative_name
     if not overwrite_existing:
-
+        if config_file.is_file():
+            raise ValueError(f"A config file already exists {config_file}")
 
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
     config_file.parent.mkdir(parents=True, exist_ok=True)
 
     config_file.write_text(config_content)
@@ -96,12 +95,12 @@ class _ProjectValidator:
 
         if self.config_file is None:
             raise ValueError(
-                f"The current project directory has not been
+                f"The current project directory has not been initialized. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
             )
 
         if not self.is_src_valid():
             raise ValueError(
-                f"The current project directory has not been
+                f"The current project directory has not been initialized. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
             )
 
         return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)

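The new src_dir and output_dir properties make ProjectLayout the single owner of path resolution, and config-file creation now takes the layout plus a plain relative name instead of a DatasourceId. A minimal usage sketch, assuming an already-initialized project directory (paths and config content are hypothetical):

    from pathlib import Path

    from databao_context_engine.project.layout import (
        create_datasource_config_file,
        ensure_project_dir,
    )

    layout = ensure_project_dir(Path("./my_project"))  # raises ValueError if not initialized
    config_file = create_datasource_config_file(
        layout,
        "databases/my_postgres.yaml",      # hypothetical name, relative to src/
        "type: databases/postgresql\n",    # hypothetical config content
        False,                             # overwrite_existing
    )
    print(config_file, layout.output_dir)
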
databao_context_engine/retrieve_embeddings/retrieve_runner.py

@@ -0,0 +1,17 @@
+import logging
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+
+logger = logging.getLogger(__name__)
+
+
+def retrieve(
+    *,
+    retrieve_service: RetrieveService,
+    text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None = None,
+) -> list[VectorSearchResult]:
+    return retrieve_service.retrieve(text=text, limit=limit, datasource_ids=datasource_ids)

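The runner is a thin, keyword-only entry point over RetrieveService. A hedged call sketch (service is assumed to be a wired RetrieveService, see retrieve_wiring.py below; the result fields mirror the debug logging in retrieve_service.py):

    results = retrieve(retrieve_service=service, text="monthly revenue by region", limit=5)
    for result in results:
        print(result.cosine_distance, result.embeddable_text)
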
databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py}
RENAMED

@@ -1,10 +1,9 @@
 import logging
 from collections.abc import Sequence
 
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.project.runs import resolve_run_name_from_repo
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
-from databao_context_engine.storage.repositories.run_repository import RunRepository
 from databao_context_engine.storage.repositories.vector_search_repository import (
     VectorSearchRepository,
     VectorSearchResult,
@@ -17,52 +16,46 @@ class RetrieveService:
     def __init__(
         self,
         *,
-        run_repo: RunRepository,
         vector_search_repo: VectorSearchRepository,
         shard_resolver: EmbeddingShardResolver,
         provider: EmbeddingProvider,
     ):
-        self._run_repo = run_repo
         self._shard_resolver = shard_resolver
         self._provider = provider
         self._vector_search_repo = vector_search_repo
 
     def retrieve(
-        self, *,
+        self, *, text: str, limit: int | None = None, datasource_ids: list[DatasourceId] | None = None
     ) -> list[VectorSearchResult]:
         if limit is None:
             limit = 10
 
-        run = self._run_repo.get_by_run_name(project_id=project_id, run_name=run_name)
-        if run is None:
-            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
-
         table_name, dimension = self._shard_resolver.resolve(
             embedder=self._provider.embedder, model_id=self._provider.model_id
         )
 
         retrieve_vec: Sequence[float] = self._provider.embed(text)
 
-        logger.debug(f"Retrieving display texts
+        logger.debug(f"Retrieving display texts in table {table_name}")
 
         search_results = self._vector_search_repo.get_display_texts_by_similarity(
             table_name=table_name,
-            run_id=run.run_id,
             retrieve_vec=retrieve_vec,
             dimension=dimension,
             limit=limit,
+            datasource_ids=datasource_ids,
        )
 
-        logger.debug(f"Retrieved {len(search_results)} display texts
+        logger.debug(f"Retrieved {len(search_results)} display texts in table {table_name}")
 
         if logger.isEnabledFor(logging.DEBUG):
-
-
+            if search_results:
+                closest_result = min(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Best result: ({closest_result.cosine_distance}, {closest_result.embeddable_text})")
 
-
-
+                farthest_result = max(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Worst result: ({farthest_result.cosine_distance}, {farthest_result.embeddable_text})")
+            else:
+                logger.debug("No results found")
 
         return search_results
-
-    def resolve_run_name(self, *, project_id: str, run_name: str | None) -> str:
-        return resolve_run_name_from_repo(run_repository=self._run_repo, project_id=project_id, run_name=run_name)

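Retrieval is no longer scoped to a run; the caller can optionally narrow the search to specific datasources instead. A sketch of the new call shape (the service instance is assumed):

    results = retrieve_service.retrieve(
        text="orders per customer",
        limit=10,              # defaults to 10 when None
        datasource_ids=None,   # or a list[DatasourceId] to restrict the search
    )
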
databao_context_engine/retrieve_embeddings/retrieve_wiring.py

@@ -0,0 +1,46 @@
+from duckdb import DuckDBPyConnection
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
+from databao_context_engine.llm.factory import create_ollama_embedding_provider, create_ollama_service
+from databao_context_engine.project.layout import ProjectLayout
+from databao_context_engine.retrieve_embeddings.retrieve_runner import retrieve
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.services.factories import create_shard_resolver
+from databao_context_engine.storage.connection import open_duckdb_connection
+from databao_context_engine.storage.repositories.factories import create_vector_search_repository
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+from databao_context_engine.system.properties import get_db_path
+
+
+def retrieve_embeddings(
+    project_layout: ProjectLayout,
+    retrieve_text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None,
+) -> list[VectorSearchResult]:
+    with open_duckdb_connection(get_db_path(project_layout.project_dir)) as conn:
+        ollama_service = create_ollama_service()
+        embedding_provider = create_ollama_embedding_provider(ollama_service)
+        retrieve_service = _create_retrieve_service(conn, embedding_provider=embedding_provider)
+        return retrieve(
+            retrieve_service=retrieve_service,
+            text=retrieve_text,
+            limit=limit,
+            datasource_ids=datasource_ids,
+        )
+
+
+def _create_retrieve_service(
+    conn: DuckDBPyConnection,
+    *,
+    embedding_provider: EmbeddingProvider,
+) -> RetrieveService:
+    vector_search_repo = create_vector_search_repository(conn)
+    shard_resolver = create_shard_resolver(conn)
+
+    return RetrieveService(
+        vector_search_repo=vector_search_repo,
+        shard_resolver=shard_resolver,
+        provider=embedding_provider,
+    )

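An end-to-end sketch of the new wiring, assuming an initialized project and a local Ollama with the embedding model available (project path and query text are hypothetical):

    from pathlib import Path

    from databao_context_engine.project.layout import ensure_project_dir
    from databao_context_engine.retrieve_embeddings.retrieve_wiring import retrieve_embeddings

    layout = ensure_project_dir(Path("./my_project"))
    results = retrieve_embeddings(layout, "customer churn tables", limit=5, datasource_ids=None)
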
databao_context_engine/serialization/__init__.py
File without changes

databao_context_engine/{serialisation → serialization}/yaml.py
RENAMED

@@ -7,17 +7,17 @@ from yaml import Node, SafeDumper
 def default_representer(dumper: SafeDumper, data: object) -> Node:
     if isinstance(data, Mapping):
         return dumper.represent_dict(data)
-
-    # Doesn't
+    if hasattr(data, "__dict__"):
+        # Doesn't serialize "private" attributes (that starts with an _)
         data_public_attributes = {key: value for key, value in data.__dict__.items() if not key.startswith("_")}
         if data_public_attributes:
             return dumper.represent_dict(data_public_attributes)
-
-
-    return dumper.represent_str(str(data))
-    else:
+
+        # If there is no public attributes, we default to the string representation
         return dumper.represent_str(str(data))
 
+    return dumper.represent_str(str(data))
+
 
 # Registers our default representer only once, when that file is imported
 yaml.add_multi_representer(object, default_representer, Dumper=SafeDumper)

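Because the representer is registered on SafeDumper at import time, yaml.safe_dump can serialize arbitrary objects once this module has been imported: mappings stay mappings, objects dump their public attributes, and anything else falls back to str(). A small sketch (the Column class is hypothetical):

    import yaml

    import databao_context_engine.serialization.yaml  # noqa: F401  (registers the representer)


    class Column:
        def __init__(self, name: str, dtype: str) -> None:
            self.name = name
            self.dtype = dtype
            self._internal = "skipped"  # leading underscore: excluded from the dump


    print(yaml.safe_dump(Column("id", "BIGINT")))  # -> "dtype: BIGINT\nname: id\n"
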
databao_context_engine/services/chunk_embedding_service.py

@@ -5,7 +5,7 @@ from typing import cast
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
 from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
-from databao_context_engine.
+from databao_context_engine.serialization.yaml import to_yaml_string
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.models import ChunkEmbedding
 from databao_context_engine.services.persistence_service import PersistenceService
@@ -14,9 +14,22 @@ logger = logging.getLogger(__name__)
 
 
 class ChunkEmbeddingMode(Enum):
+    """Mode controlling how chunks are embedded."""
+
     EMBEDDABLE_TEXT_ONLY = "EMBEDDABLE_TEXT_ONLY"
+    """
+    The embedding is generated only from the string defined by the plugin as embeddable for a chunk.
+    """
+
     GENERATED_DESCRIPTION_ONLY = "GENERATED_DESCRIPTION_ONLY"
+    """
+    The embedding is generated only from a description of the chunk generated by a LLM.
+    """
+
     EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION = "EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION"
+    """
+    The embedding is generated from both the embeddable string of the chunk and the description of the chunk generated by a LLM.
+    """
 
     def should_generate_description(self) -> bool:
         return self in (
@@ -44,26 +57,24 @@ class ChunkEmbeddingService:
         if self._chunk_embedding_mode.should_generate_description() and description_provider is None:
             raise ValueError("A DescriptionProvider must be provided when generating descriptions")
 
-    def embed_chunks(self, *,
-        """
-        Turn plugin chunks into persisted chunks and embeddings
+    def embed_chunks(self, *, chunks: list[EmbeddableChunk], result: str, full_type: str, datasource_id: str) -> None:
+        """Turn plugin chunks into persisted chunks and embeddings.
 
         Flow:
-        1) Embed each chunk into an embedded vector
-        2) Get or create embedding table for the appropriate model and embedding dimensions
-        3) Persist chunks and embeddings vectors in a single transaction
+        1) Embed each chunk into an embedded vector.
+        2) Get or create embedding table for the appropriate model and embedding dimensions.
+        3) Persist chunks and embeddings vectors in a single transaction.
         """
-
         if not chunks:
             return
 
         logger.debug(
-            f"Embedding {len(chunks)} chunks for datasource
+            f"Embedding {len(chunks)} chunks for datasource {datasource_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
         )
 
         enriched_embeddings: list[ChunkEmbedding] = []
         for chunk in chunks:
-            chunk_display_text = to_yaml_string(chunk.content)
+            chunk_display_text = chunk.content if isinstance(chunk.content, str) else to_yaml_string(chunk.content)
 
             generated_description = ""
             match self._chunk_embedding_mode:
@@ -98,7 +109,8 @@ class ChunkEmbeddingService:
             )
 
         self._persistence_service.write_chunks_and_embeddings(
-            datasource_run_id=datasource_run_id,
             chunk_embeddings=enriched_embeddings,
             table_name=table_name,
+            full_type=full_type,
+            datasource_id=datasource_id,
         )

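The three modes differ only in which text is fed to the embedder, and should_generate_description() gates whether ChunkEmbeddingService demands a DescriptionProvider. A quick sketch (the mode membership follows from the constructor check shown above):

    from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode

    mode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION
    assert mode.should_generate_description()  # a DescriptionProvider is required

    mode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY
    assert not mode.should_generate_description()  # description_provider may be None
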
databao_context_engine/services/factories.py

@@ -1,20 +1,15 @@
-from
+from duckdb import DuckDBPyConnection
 
-from databao_context_engine.build_sources.internal.build_service import BuildService
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.retrieve_embeddings.internal.retrieve_service import RetrieveService
 from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode, ChunkEmbeddingService
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.persistence_service import PersistenceService
 from databao_context_engine.services.table_name_policy import TableNamePolicy
 from databao_context_engine.storage.repositories.factories import (
     create_chunk_repository,
-    create_datasource_run_repository,
     create_embedding_repository,
     create_registry_repository,
-    create_run_repository,
-    create_vector_search_repository,
 )
 
 
@@ -46,43 +41,3 @@ def create_chunk_embedding_service(
         description_provider=description_provider,
         chunk_embedding_mode=chunk_embedding_mode,
     )
-
-
-def create_build_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-    description_provider: DescriptionProvider | None,
-    chunk_embedding_mode: ChunkEmbeddingMode,
-) -> BuildService:
-    run_repo = create_run_repository(conn)
-    datasource_run_repo = create_datasource_run_repository(conn)
-    chunk_embedding_service = create_chunk_embedding_service(
-        conn,
-        embedding_provider=embedding_provider,
-        description_provider=description_provider,
-        chunk_embedding_mode=chunk_embedding_mode,
-    )
-
-    return BuildService(
-        run_repo=run_repo,
-        datasource_run_repo=datasource_run_repo,
-        chunk_embedding_service=chunk_embedding_service,
-    )
-
-
-def create_retrieve_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-) -> RetrieveService:
-    run_repo = create_run_repository(conn)
-    vector_search_repo = create_vector_search_repository(conn)
-    shard_resolver = create_shard_resolver(conn)
-
-    return RetrieveService(
-        run_repo=run_repo,
-        vector_search_repo=vector_search_repo,
-        shard_resolver=shard_resolver,
-        provider=embedding_provider,
-    )

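Feature wiring moved out of this module (into build_wiring.py and retrieve_wiring.py); only create_chunk_embedding_service remains. A hedged call sketch (conn and embedding_provider are assumed to exist):

    service = create_chunk_embedding_service(
        conn,  # an open DuckDBPyConnection
        embedding_provider=embedding_provider,
        description_provider=None,  # allowed: the mode below generates no descriptions
        chunk_embedding_mode=ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY,
    )
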
databao_context_engine/services/persistence_service.py

@@ -24,11 +24,13 @@ class PersistenceService:
         self._dim = dim
 
     def write_chunks_and_embeddings(
-        self, *,
+        self, *, chunk_embeddings: list[ChunkEmbedding], table_name: str, full_type: str, datasource_id: str
     ):
-        """
-
-
+        """Atomically persist chunks and their vectors.
+
+        Raises:
+            ValueError: If chunk_embeddings is an empty list.
+
         """
         if not chunk_embeddings:
             raise ValueError("chunk_embeddings must be a non-empty list")
@@ -36,21 +38,19 @@ class PersistenceService:
         with transaction(self._conn):
             for chunk_embedding in chunk_embeddings:
                 chunk_dto = self.create_chunk(
-
+                    full_type=full_type,
+                    datasource_id=datasource_id,
                     embeddable_text=chunk_embedding.chunk.embeddable_text,
                     display_text=chunk_embedding.display_text,
-                    generated_description=chunk_embedding.generated_description,
                 )
                 self.create_embedding(table_name=table_name, chunk_id=chunk_dto.chunk_id, vec=chunk_embedding.vec)
 
-    def create_chunk(
-        self, *, datasource_run_id: int, embeddable_text: str, display_text: str, generated_description: str
-    ) -> ChunkDTO:
+    def create_chunk(self, *, full_type: str, datasource_id: str, embeddable_text: str, display_text: str) -> ChunkDTO:
         return self._chunk_repo.create(
-
+            full_type=full_type,
+            datasource_id=datasource_id,
             embeddable_text=embeddable_text,
             display_text=display_text,
-            generated_description=generated_description,
         )
 
     def create_embedding(self, *, table_name: str, chunk_id: int, vec: Sequence[float]):

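Chunks are now tagged with full_type and datasource_id at write time, replacing the old datasource_run_id. A sketch of the new call (the full_type and datasource_id values are hypothetical; the table name is the one seeded by the V01 migration below):

    persistence_service.write_chunks_and_embeddings(
        chunk_embeddings=enriched_embeddings,
        table_name="embedding_ollama__nomic_embed_text_v1_5__768",
        full_type="databases/postgresql",
        datasource_id="my_postgres",
    )
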
databao_context_engine/storage/connection.py

@@ -1,24 +1,28 @@
 import logging
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Iterator
 
 import duckdb
-
-from databao_context_engine.system.properties import get_db_path
+from duckdb import DuckDBPyConnection
 
 logger = logging.getLogger(__name__)
 
 
 @contextmanager
-def open_duckdb_connection(db_path: str | Path
-    """
-
-
+def open_duckdb_connection(db_path: str | Path) -> Iterator[DuckDBPyConnection]:
+    """Open a DuckDB connection with vector search enabled and close on exist.
+
+    It also loads the vss extension and enables HNSW experimental persistence on the DuckDB.
 
     Usage:
         with open_duckdb_connection() as conn:
+
+    Yields:
+        The opened DuckDB connection.
+
     """
-    path = str(db_path
+    path = str(db_path)
     conn = duckdb.connect(path)
     logger.debug(f"Connected to DuckDB database at {path}")
 

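db_path is now required (the implicit get_db_path default is gone) and the context manager is typed as an Iterator over the connection. A usage sketch (the database path is hypothetical):

    from pathlib import Path

    from databao_context_engine.storage.connection import open_duckdb_connection

    with open_duckdb_connection(Path("my_project/context.duckdb")) as conn:
        print(conn.execute("SELECT 1").fetchone())
    # the connection is closed when the with-block exits
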
databao_context_engine/storage/exceptions/exceptions.py

@@ -1,6 +1,6 @@
 class RepositoryError(Exception):
-    """Base exception for repository errors"""
+    """Base exception for repository errors."""
 
 
 class IntegrityError(RepositoryError):
-    """Raised when a DB constraint is violated"""
+    """Raised when a DB constraint is violated."""

databao_context_engine/storage/migrate.py

@@ -8,12 +8,10 @@ from typing import LiteralString
 
 import duckdb
 
-from databao_context_engine.system.properties import get_db_path
-
 logger = logging.getLogger(__name__)
 
 
-def migrate(db_path: str | Path
+def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
     if migration_files is None:
         migration_files = [
             migration
@@ -21,7 +19,7 @@ def migrate(db_path: str | Path | None = None, migration_files: list[Path] | Non
             if isinstance(migration, Path) and ".sql" == migration.suffix
         ]
 
-    db = Path(db_path
+    db = Path(db_path).expanduser().resolve()
     db.parent.mkdir(parents=True, exist_ok=True)
     logger.debug("Running migrations on database: %s", db)
 
@@ -71,7 +69,7 @@ class _Migration:
 def _create_migration(file: Path) -> _Migration:
     query_bytes = file.read_bytes()
     query = query_bytes.decode("utf-8")
-    checksum = hashlib.md5(query_bytes).hexdigest()
+    checksum = hashlib.md5(query_bytes, usedforsecurity=False).hexdigest()
     version = _extract_version_from_name(file.name)
     return _Migration(name=file.name, version=version, checksum=checksum, query=query)
 

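migrate() now requires an explicit db_path, and the MD5 call is flagged usedforsecurity=False (a Python 3.9+ keyword) so checksumming migration files keeps working on FIPS-restricted builds. A minimal call sketch (the path is hypothetical):

    from databao_context_engine.storage.migrate import migrate

    migrate("~/.databao/context.duckdb")  # expanded and resolved inside migrate()
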
databao_context_engine/storage/migrations/V01__init.sql

@@ -2,36 +2,15 @@ INSTALL vss;
 LOAD vss;
 SET hnsw_enable_experimental_persistence = true;
 
-CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
-CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
 CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;
 
-CREATE TABLE IF NOT EXISTS run (
-    run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
-    project_id TEXT NOT NULL,
-    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    ended_at TIMESTAMP,
-    nemory_version TEXT,
-    run_name TEXT NOT NULL,
-);
-
-CREATE TABLE IF NOT EXISTS datasource_run (
-    datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
-    run_id BIGINT NOT NULL REFERENCES run(run_id),
-    plugin TEXT NOT NULL,
-    source_id TEXT NOT NULL,
-    storage_directory TEXT NOT NULL,
-    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    full_type TEXT NOT NULL,
-);
-
 CREATE TABLE IF NOT EXISTS chunk (
-    chunk_id
-
-
-
-
-
+    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
+    full_type TEXT NOT NULL,
+    datasource_id TEXT NOT NULL,
+    embeddable_text TEXT NOT NULL,
+    display_text TEXT,
+    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
 
 CREATE TABLE IF NOT EXISTS embedding_model_registry (
@@ -57,7 +36,3 @@ OR IGNORE INTO
     embedding_model_registry(embedder, model_id, dim, table_name)
 VALUES
     ('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');
-
-CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
-CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
-CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);

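The run and datasource_run tables (and their indexes) are gone; chunks carry full_type and datasource_id directly. A hedged sketch of querying the new shape (database path and datasource id are hypothetical):

    import duckdb

    conn = duckdb.connect("context.duckdb")
    rows = conn.execute(
        "SELECT chunk_id, full_type, display_text FROM chunk WHERE datasource_id = ?",
        ["my_postgres"],
    ).fetchall()
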
databao_context_engine/storage/models.py

@@ -4,34 +4,13 @@ from datetime import datetime
 from typing import Optional
 
 
-@dataclass(frozen=True)
-class RunDTO:
-    run_id: int
-    run_name: str
-    project_id: str
-    started_at: datetime
-    ended_at: Optional[datetime]
-    nemory_version: str
-
-
-@dataclass(frozen=True)
-class DatasourceRunDTO:
-    datasource_run_id: int
-    run_id: int
-    plugin: str
-    full_type: str
-    source_id: str
-    storage_directory: str
-    created_at: datetime
-
-
 @dataclass(frozen=True)
 class ChunkDTO:
     chunk_id: int
-
+    full_type: str
+    datasource_id: str
     embeddable_text: str
     display_text: Optional[str]
-    generated_description: str
     created_at: datetime
 
 

databao_context_engine/storage/repositories/chunk_repository.py

@@ -14,22 +14,22 @@ class ChunkRepository:
     def create(
         self,
         *,
-
+        full_type: str,
+        datasource_id: str,
         embeddable_text: str,
         display_text: Optional[str],
-        generated_description: str,
     ) -> ChunkDTO:
         try:
             row = self._conn.execute(
                 """
                 INSERT INTO
-                    chunk(
+                    chunk(full_type, datasource_id, embeddable_text, display_text)
                 VALUES
                     (?, ?, ?, ?)
                 RETURNING
                     *
                 """,
-                [
+                [full_type, datasource_id, embeddable_text, display_text],
             ).fetchone()
             if row is None:
                 raise RuntimeError("chunk creation returned no object")
@@ -55,22 +55,26 @@ class ChunkRepository:
         self,
         chunk_id: int,
         *,
+        full_type: Optional[str] = None,
+        datasource_id: Optional[str] = None,
         embeddable_text: Optional[str] = None,
         display_text: Optional[str] = None,
-        generated_description: Optional[str] = None,
     ) -> Optional[ChunkDTO]:
         sets: list[Any] = []
         params: list[Any] = []
 
+        if full_type is not None:
+            sets.append("full_type = ?")
+            params.append(full_type)
+        if datasource_id is not None:
+            sets.append("datasource_id = ?")
+            params.append(datasource_id)
         if embeddable_text is not None:
             sets.append("embeddable_text = ?")
             params.append(embeddable_text)
         if display_text is not None:
             sets.append("display_text = ?")
             params.append(display_text)
-        if generated_description is not None:
-            sets.append("generated_description = ?")
-            params.append(generated_description)
 
         if not sets:
             return self.get(chunk_id)
@@ -119,12 +123,12 @@ class ChunkRepository:
 
     @staticmethod
     def _row_to_dto(row: Tuple) -> ChunkDTO:
-        chunk_id,
+        chunk_id, full_type, datasource_id, embeddable_text, display_text, created_at = row
         return ChunkDTO(
             chunk_id=int(chunk_id),
-
-
+            full_type=full_type,
+            datasource_id=datasource_id,
+            embeddable_text=embeddable_text,
             display_text=display_text,
-            generated_description=str(generated_description),
             created_at=created_at,
         )

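update() assembles its SET clause from only the keyword arguments that are not None, so a partial update touches just those columns, and an all-None call falls through to get(). A sketch (the repository instance and chunk id are hypothetical):

    dto = chunk_repo.update(42, datasource_id="renamed_datasource")  # other columns untouched
    if dto is None:
        print("no chunk with id 42")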