databao-context-engine 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +18 -6
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +27 -23
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +84 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +3 -7
- databao_context_engine/cli/add_datasource_config.py +41 -15
- databao_context_engine/cli/commands.py +12 -43
- databao_context_engine/cli/info.py +2 -2
- databao_context_engine/databao_context_engine.py +137 -0
- databao_context_engine/databao_context_project_manager.py +96 -6
- databao_context_engine/datasources/add_config.py +34 -0
- databao_context_engine/{datasource_config → datasources}/check_config.py +18 -7
- databao_context_engine/datasources/datasource_context.py +93 -0
- databao_context_engine/{project → datasources}/datasource_discovery.py +17 -16
- databao_context_engine/{project → datasources}/types.py +64 -15
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +59 -30
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/install.py +13 -20
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/all_results_tool.py +2 -2
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +1 -4
- databao_context_engine/mcp/retrieve_tool.py +3 -11
- databao_context_engine/plugin_loader.py +111 -0
- databao_context_engine/pluginlib/build_plugin.py +25 -9
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/databases/athena_introspector.py +85 -22
- databao_context_engine/plugins/databases/base_introspector.py +5 -3
- databao_context_engine/plugins/databases/clickhouse_introspector.py +22 -11
- databao_context_engine/plugins/databases/duckdb_introspector.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/databases/mssql_introspector.py +26 -17
- databao_context_engine/plugins/databases/mysql_introspector.py +23 -12
- databao_context_engine/plugins/databases/postgresql_introspector.py +2 -2
- databao_context_engine/plugins/databases/snowflake_introspector.py +43 -10
- databao_context_engine/plugins/plugin_loader.py +43 -42
- databao_context_engine/plugins/resources/parquet_introspector.py +2 -3
- databao_context_engine/project/info.py +31 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +3 -3
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/{internal/export_results.py → export_results.py} +2 -2
- databao_context_engine/retrieve_embeddings/{internal/retrieve_runner.py → retrieve_runner.py} +5 -9
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +3 -17
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +49 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +1 -1
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +2 -4
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +8 -13
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.3.dist-info/METADATA +75 -0
- {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.3.dist-info}/RECORD +68 -77
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.2.dist-info/METADATA +0 -187
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → serialization}/__init__.py +0 -0
- {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.3.dist-info}/WHEEL +0 -0
- {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
|
|
6
|
-
from databao_context_engine.build_sources.internal.plugin_execution import BuiltDatasourceContext, execute
|
|
7
|
-
from databao_context_engine.pluginlib.build_plugin import (
|
|
8
|
-
BuildPlugin,
|
|
9
|
-
)
|
|
10
|
-
from databao_context_engine.project.types import PreparedDatasource
|
|
11
|
-
from databao_context_engine.serialisation.yaml import to_yaml_string
|
|
12
|
-
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingService
|
|
13
|
-
from databao_context_engine.storage.models import RunDTO
|
|
14
|
-
from databao_context_engine.storage.repositories.datasource_run_repository import DatasourceRunRepository
|
|
15
|
-
from databao_context_engine.storage.repositories.run_repository import RunRepository
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger(__name__)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class BuildService:
|
|
21
|
-
def __init__(
|
|
22
|
-
self,
|
|
23
|
-
*,
|
|
24
|
-
run_repo: RunRepository,
|
|
25
|
-
datasource_run_repo: DatasourceRunRepository,
|
|
26
|
-
chunk_embedding_service: ChunkEmbeddingService,
|
|
27
|
-
) -> None:
|
|
28
|
-
self._run_repo = run_repo
|
|
29
|
-
self._datasource_run_repo = datasource_run_repo
|
|
30
|
-
self._chunk_embedding_service = chunk_embedding_service
|
|
31
|
-
|
|
32
|
-
def start_run(self, *, project_id: str, dce_version: str) -> RunDTO:
|
|
33
|
-
"""
|
|
34
|
-
Create a new run row and return (run_id, started_at).
|
|
35
|
-
"""
|
|
36
|
-
return self._run_repo.create(project_id=project_id, dce_version=dce_version)
|
|
37
|
-
|
|
38
|
-
def finalize_run(self, *, run_id: int):
|
|
39
|
-
"""
|
|
40
|
-
Mark the run as complete (sets ended_at).
|
|
41
|
-
"""
|
|
42
|
-
self._run_repo.update(run_id=run_id, ended_at=datetime.now())
|
|
43
|
-
|
|
44
|
-
def process_prepared_source(
|
|
45
|
-
self,
|
|
46
|
-
*,
|
|
47
|
-
run_id: int,
|
|
48
|
-
prepared_source: PreparedDatasource,
|
|
49
|
-
plugin: BuildPlugin,
|
|
50
|
-
) -> BuiltDatasourceContext:
|
|
51
|
-
"""
|
|
52
|
-
Process a single source.
|
|
53
|
-
|
|
54
|
-
1) Execute the plugin
|
|
55
|
-
2) Divide the results into chunks
|
|
56
|
-
3) Embed and persist the chunks
|
|
57
|
-
"""
|
|
58
|
-
result = execute(prepared_source, plugin)
|
|
59
|
-
|
|
60
|
-
chunks = plugin.divide_context_into_chunks(result.context)
|
|
61
|
-
if not chunks:
|
|
62
|
-
logger.info("No chunks for %s — skipping.", prepared_source.path.name)
|
|
63
|
-
return result
|
|
64
|
-
|
|
65
|
-
datasource_run = self._datasource_run_repo.create(
|
|
66
|
-
run_id=run_id,
|
|
67
|
-
plugin=plugin.name,
|
|
68
|
-
full_type=prepared_source.datasource_type.full_type,
|
|
69
|
-
source_id=result.datasource_id,
|
|
70
|
-
storage_directory=str(prepared_source.path.parent),
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
self._chunk_embedding_service.embed_chunks(
|
|
74
|
-
datasource_run_id=datasource_run.datasource_run_id, chunks=chunks, result=to_yaml_string(result.context)
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
return result
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from databao_context_engine.build_sources.internal.build_runner import BuildContextResult, build
|
|
5
|
-
from databao_context_engine.llm.factory import (
|
|
6
|
-
create_ollama_description_provider,
|
|
7
|
-
create_ollama_embedding_provider,
|
|
8
|
-
create_ollama_service,
|
|
9
|
-
)
|
|
10
|
-
from databao_context_engine.project.info import get_dce_version
|
|
11
|
-
from databao_context_engine.project.layout import ensure_project_dir
|
|
12
|
-
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
|
|
13
|
-
from databao_context_engine.services.factories import (
|
|
14
|
-
create_build_service,
|
|
15
|
-
)
|
|
16
|
-
from databao_context_engine.storage.connection import open_duckdb_connection
|
|
17
|
-
from databao_context_engine.system.properties import get_db_path
|
|
18
|
-
|
|
19
|
-
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddingMode) -> list[BuildContextResult]:
|
|
23
|
-
"""
|
|
24
|
-
Public build entrypoint
|
|
25
|
-
- Instantiates the build service
|
|
26
|
-
- Delegates the actual build logic to the build runner
|
|
27
|
-
"""
|
|
28
|
-
project_layout = ensure_project_dir(project_dir)
|
|
29
|
-
|
|
30
|
-
logger.debug(f"Starting to build datasources in project {project_dir.resolve()}")
|
|
31
|
-
|
|
32
|
-
with open_duckdb_connection(get_db_path()) as conn:
|
|
33
|
-
ollama_service = create_ollama_service()
|
|
34
|
-
embedding_provider = create_ollama_embedding_provider(ollama_service)
|
|
35
|
-
description_provider = (
|
|
36
|
-
create_ollama_description_provider(ollama_service)
|
|
37
|
-
if chunk_embedding_mode.should_generate_description()
|
|
38
|
-
else None
|
|
39
|
-
)
|
|
40
|
-
build_service = create_build_service(
|
|
41
|
-
conn,
|
|
42
|
-
embedding_provider=embedding_provider,
|
|
43
|
-
description_provider=description_provider,
|
|
44
|
-
chunk_embedding_mode=chunk_embedding_mode,
|
|
45
|
-
)
|
|
46
|
-
dce_config = project_layout.read_config_file()
|
|
47
|
-
return build(
|
|
48
|
-
project_dir=project_dir,
|
|
49
|
-
build_service=build_service,
|
|
50
|
-
project_id=str(dce_config.project_id),
|
|
51
|
-
dce_version=get_dce_version(),
|
|
52
|
-
)
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from databao_context_engine.build_sources.internal.plugin_execution import BuiltDatasourceContext
|
|
5
|
-
from databao_context_engine.datasource_config.datasource_context import get_context_header_for_datasource
|
|
6
|
-
from databao_context_engine.project.layout import ALL_RESULTS_FILE_NAME, get_output_dir
|
|
7
|
-
from databao_context_engine.project.types import DatasourceId
|
|
8
|
-
from databao_context_engine.serialisation.yaml import write_yaml_to_stream
|
|
9
|
-
|
|
10
|
-
logger = logging.getLogger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def create_run_dir(project_dir: Path, run_name: str) -> Path:
|
|
14
|
-
output_dir = get_output_dir(project_dir)
|
|
15
|
-
|
|
16
|
-
run_dir = output_dir.joinpath(run_name)
|
|
17
|
-
run_dir.mkdir(parents=True, exist_ok=False)
|
|
18
|
-
|
|
19
|
-
return run_dir
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def export_build_result(run_dir: Path, result: BuiltDatasourceContext) -> Path:
|
|
23
|
-
datasource_id = DatasourceId.from_string_repr(result.datasource_id)
|
|
24
|
-
export_file_path = run_dir.joinpath(datasource_id.relative_path_to_context_file())
|
|
25
|
-
|
|
26
|
-
# Make sure the parent folder exists
|
|
27
|
-
export_file_path.parent.mkdir(exist_ok=True)
|
|
28
|
-
|
|
29
|
-
with export_file_path.open("w") as export_file:
|
|
30
|
-
write_yaml_to_stream(data=result, file_stream=export_file)
|
|
31
|
-
|
|
32
|
-
logger.info(f"Exported result to {export_file_path.resolve()}")
|
|
33
|
-
|
|
34
|
-
return export_file_path
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def append_result_to_all_results(run_dir: Path, result: BuiltDatasourceContext):
|
|
38
|
-
path = run_dir.joinpath(ALL_RESULTS_FILE_NAME)
|
|
39
|
-
path.parent.mkdir(parents=True, exist_ok=True)
|
|
40
|
-
with path.open("a", encoding="utf-8") as export_file:
|
|
41
|
-
export_file.write(get_context_header_for_datasource(DatasourceId.from_string_repr(result.datasource_id)))
|
|
42
|
-
write_yaml_to_stream(data=result, file_stream=export_file)
|
|
43
|
-
export_file.write("\n")
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Any
|
|
5
|
-
|
|
6
|
-
from databao_context_engine.datasource_config.datasource_context import (
|
|
7
|
-
DatasourceContext,
|
|
8
|
-
get_all_contexts,
|
|
9
|
-
get_context_header_for_datasource,
|
|
10
|
-
get_datasource_context,
|
|
11
|
-
)
|
|
12
|
-
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
13
|
-
from databao_context_engine.project.datasource_discovery import get_datasource_list
|
|
14
|
-
from databao_context_engine.project.layout import ensure_project_dir
|
|
15
|
-
from databao_context_engine.project.types import Datasource, DatasourceId
|
|
16
|
-
from databao_context_engine.retrieve_embeddings.public.api import retrieve_embeddings
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
|
|
20
|
-
class ContextSearchResult:
|
|
21
|
-
datasource_id: DatasourceId
|
|
22
|
-
datasource_type: DatasourceType
|
|
23
|
-
distance: float
|
|
24
|
-
context_result: str
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class DatabaoContextEngine:
|
|
28
|
-
project_dir: Path
|
|
29
|
-
|
|
30
|
-
def __init__(self, project_dir: Path) -> None:
|
|
31
|
-
self.project_layout = ensure_project_dir(project_dir=project_dir)
|
|
32
|
-
self.project_dir = project_dir
|
|
33
|
-
|
|
34
|
-
def get_datasource_list(self) -> list[Datasource]:
|
|
35
|
-
# TODO: Should this return the list of built datasources rather than the list of datasources within the src folder?
|
|
36
|
-
return get_datasource_list(self.project_dir)
|
|
37
|
-
|
|
38
|
-
def get_datasource_context(self, datasource_id: DatasourceId, run_name: str | None = None) -> DatasourceContext:
|
|
39
|
-
return get_datasource_context(
|
|
40
|
-
project_layout=self.project_layout, datasource_id=datasource_id, run_name=run_name
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
def get_all_contexts(self, run_name: str | None = None) -> list[DatasourceContext]:
|
|
44
|
-
return get_all_contexts(project_layout=self.project_layout, run_name=run_name)
|
|
45
|
-
|
|
46
|
-
def get_all_contexts_formatted(self, run_name: str | None = None) -> str:
|
|
47
|
-
all_contexts = self.get_all_contexts(run_name=run_name)
|
|
48
|
-
|
|
49
|
-
all_results = os.linesep.join(
|
|
50
|
-
[f"{get_context_header_for_datasource(context.datasource_id)}{context.context}" for context in all_contexts]
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
return all_results
|
|
54
|
-
|
|
55
|
-
def search_context(
|
|
56
|
-
self,
|
|
57
|
-
retrieve_text: str,
|
|
58
|
-
run_name: str | None,
|
|
59
|
-
limit: int | None,
|
|
60
|
-
export_to_file: bool,
|
|
61
|
-
datasource_ids: list[DatasourceId] | None = None,
|
|
62
|
-
) -> list[ContextSearchResult]:
|
|
63
|
-
# TODO: Filter with datasource_ids
|
|
64
|
-
# TODO: Remove the need for a run_name
|
|
65
|
-
|
|
66
|
-
results = retrieve_embeddings(
|
|
67
|
-
project_layout=self.project_layout,
|
|
68
|
-
retrieve_text=retrieve_text,
|
|
69
|
-
run_name=run_name,
|
|
70
|
-
limit=limit,
|
|
71
|
-
export_to_file=export_to_file,
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
return [
|
|
75
|
-
ContextSearchResult(
|
|
76
|
-
datasource_id=result.datasource_id,
|
|
77
|
-
datasource_type=result.datasource_type,
|
|
78
|
-
distance=result.cosine_distance,
|
|
79
|
-
context_result=result.display_text,
|
|
80
|
-
)
|
|
81
|
-
for result in results
|
|
82
|
-
]
|
|
83
|
-
|
|
84
|
-
def run_sql(self, datasource_id: DatasourceId, sql: str, params: list[str]) -> dict[str, Any]:
|
|
85
|
-
raise NotImplementedError("Running SQL is not supported yet")
|
|
File without changes
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
from databao_context_engine.introspection.property_extract import get_property_list_from_type
|
|
5
|
-
from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, DatasourceType
|
|
6
|
-
from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
|
|
7
|
-
from databao_context_engine.plugins.plugin_loader import get_plugin_for_type
|
|
8
|
-
from databao_context_engine.project.layout import (
|
|
9
|
-
create_datasource_config_file as create_datasource_config_file_internal,
|
|
10
|
-
)
|
|
11
|
-
from databao_context_engine.project.types import DatasourceId
|
|
12
|
-
from databao_context_engine.serialisation.yaml import to_yaml_string
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def get_config_file_structure_for_datasource_type(datasource_type: DatasourceType) -> list[ConfigPropertyDefinition]:
|
|
16
|
-
plugin = get_plugin_for_type(datasource_type)
|
|
17
|
-
|
|
18
|
-
if isinstance(plugin, CustomiseConfigProperties):
|
|
19
|
-
return plugin.get_config_file_properties()
|
|
20
|
-
elif isinstance(plugin, BuildDatasourcePlugin):
|
|
21
|
-
return get_property_list_from_type(plugin.config_file_type)
|
|
22
|
-
else:
|
|
23
|
-
raise ValueError(
|
|
24
|
-
f"Impossible to create a config for type {datasource_type.full_type}. The plugin for this type is not a BuildDatasourcePlugin or CustomiseConfigProperties"
|
|
25
|
-
)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def create_datasource_config_file(
|
|
29
|
-
project_dir: Path,
|
|
30
|
-
datasource_type: DatasourceType,
|
|
31
|
-
datasource_name: str,
|
|
32
|
-
config_content: dict[str, Any],
|
|
33
|
-
overwrite_existing: bool,
|
|
34
|
-
) -> Path:
|
|
35
|
-
basic_config = {"type": datasource_type.subtype, "name": datasource_name}
|
|
36
|
-
|
|
37
|
-
return create_datasource_config_file_internal(
|
|
38
|
-
project_dir,
|
|
39
|
-
get_datasource_id_for_config_file(datasource_type, datasource_name),
|
|
40
|
-
to_yaml_string(basic_config | config_content),
|
|
41
|
-
overwrite_existing=overwrite_existing,
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def get_datasource_id_for_config_file(datasource_type: DatasourceType, datasource_name: str) -> DatasourceId:
|
|
46
|
-
return DatasourceId(
|
|
47
|
-
datasource_config_folder=datasource_type.config_folder,
|
|
48
|
-
datasource_name=datasource_name,
|
|
49
|
-
config_file_suffix=".yaml",
|
|
50
|
-
)
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from databao_context_engine.project.layout import ProjectLayout
|
|
6
|
-
from databao_context_engine.project.runs import get_run_dir, resolve_run_name
|
|
7
|
-
from databao_context_engine.project.types import DatasourceId
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass(eq=True, frozen=True)
|
|
11
|
-
class DatasourceContext:
|
|
12
|
-
datasource_id: DatasourceId
|
|
13
|
-
# TODO: Read the context as a BuildExecutionResult instead of a Yaml string?
|
|
14
|
-
context: str
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def get_datasource_context(
|
|
18
|
-
project_layout: ProjectLayout, datasource_id: DatasourceId, run_name: str | None = None
|
|
19
|
-
) -> DatasourceContext:
|
|
20
|
-
run_dir = _resolve_run_dir(project_layout, run_name)
|
|
21
|
-
|
|
22
|
-
context_path = run_dir.joinpath(datasource_id.relative_path_to_context_file())
|
|
23
|
-
if not context_path.is_file():
|
|
24
|
-
raise ValueError(f"Context file not found for datasource {str(datasource_id)} in run {run_dir.name}")
|
|
25
|
-
|
|
26
|
-
context = context_path.read_text()
|
|
27
|
-
return DatasourceContext(datasource_id=datasource_id, context=context)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def get_all_contexts(project_layout: ProjectLayout, run_name: str | None = None) -> list[DatasourceContext]:
|
|
31
|
-
run_dir = _resolve_run_dir(project_layout, run_name)
|
|
32
|
-
|
|
33
|
-
result = []
|
|
34
|
-
for main_type_dir in sorted((p for p in run_dir.iterdir() if p.is_dir()), key=lambda p: p.name.lower()):
|
|
35
|
-
for context_path in sorted(
|
|
36
|
-
(p for p in main_type_dir.iterdir() if p.suffix in [".yaml", ".yml"]), key=lambda p: p.name.lower()
|
|
37
|
-
):
|
|
38
|
-
result.append(
|
|
39
|
-
DatasourceContext(
|
|
40
|
-
# FIXME: The extension will always be yaml here even if the datasource is a file with a different extension
|
|
41
|
-
datasource_id=DatasourceId.from_datasource_config_file_path(context_path),
|
|
42
|
-
context=context_path.read_text(),
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
return result
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def get_context_header_for_datasource(datasource_id: DatasourceId) -> str:
|
|
50
|
-
return f"# ===== {str(datasource_id)} ====={os.linesep}"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def _resolve_run_dir(project_layout: ProjectLayout, run_name: str | None) -> Path:
|
|
54
|
-
resolved_run_name = resolve_run_name(project_layout=project_layout, run_name=run_name)
|
|
55
|
-
|
|
56
|
-
run_dir = get_run_dir(project_dir=project_layout.project_dir, run_name=resolved_run_name)
|
|
57
|
-
if not run_dir.is_dir():
|
|
58
|
-
raise ValueError(f"Run {resolved_run_name} does not exist at {run_dir.resolve()}")
|
|
59
|
-
|
|
60
|
-
return run_dir
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
from databao_context_engine.project.layout import ProjectLayout, get_output_dir
|
|
4
|
-
from databao_context_engine.storage.connection import open_duckdb_connection
|
|
5
|
-
from databao_context_engine.storage.repositories.factories import create_run_repository
|
|
6
|
-
from databao_context_engine.storage.repositories.run_repository import RunRepository
|
|
7
|
-
from databao_context_engine.system.properties import get_db_path
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def resolve_run_name(*, project_layout: ProjectLayout, run_name: str | None) -> str:
|
|
11
|
-
project_id = str(project_layout.read_config_file().project_id)
|
|
12
|
-
|
|
13
|
-
with open_duckdb_connection(get_db_path()) as conn:
|
|
14
|
-
run_repository = create_run_repository(conn)
|
|
15
|
-
|
|
16
|
-
return resolve_run_name_from_repo(run_repository=run_repository, project_id=project_id, run_name=run_name)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def resolve_run_name_from_repo(*, run_repository: RunRepository, project_id: str, run_name: str | None) -> str:
|
|
20
|
-
if run_name is None:
|
|
21
|
-
latest = run_repository.get_latest_run_for_project(project_id=project_id)
|
|
22
|
-
if latest is None:
|
|
23
|
-
raise LookupError(f"No runs found for project '{project_id}'. Run a build first.")
|
|
24
|
-
return latest.run_name
|
|
25
|
-
else:
|
|
26
|
-
run = run_repository.get_by_run_name(project_id=project_id, run_name=run_name)
|
|
27
|
-
if run is None:
|
|
28
|
-
raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
|
|
29
|
-
return run.run_name
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def get_run_dir(project_dir: Path, run_name: str) -> Path:
|
|
33
|
-
run_dir = get_output_dir(project_dir).joinpath(run_name)
|
|
34
|
-
if not run_dir.is_dir():
|
|
35
|
-
raise ValueError(
|
|
36
|
-
f"The run with name {run_name} doesn't exist in the project. [project_dir: {project_dir.resolve()}]"
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
return run_dir
|
|
File without changes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from databao_context_engine.llm.factory import create_ollama_embedding_provider, create_ollama_service
|
|
2
|
-
from databao_context_engine.project.layout import ProjectLayout
|
|
3
|
-
from databao_context_engine.retrieve_embeddings.internal.retrieve_runner import retrieve
|
|
4
|
-
from databao_context_engine.services.factories import create_retrieve_service
|
|
5
|
-
from databao_context_engine.storage.connection import open_duckdb_connection
|
|
6
|
-
from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
|
|
7
|
-
from databao_context_engine.system.properties import get_db_path
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def retrieve_embeddings(
|
|
11
|
-
project_layout: ProjectLayout,
|
|
12
|
-
retrieve_text: str,
|
|
13
|
-
run_name: str | None,
|
|
14
|
-
limit: int | None,
|
|
15
|
-
export_to_file: bool,
|
|
16
|
-
) -> list[VectorSearchResult]:
|
|
17
|
-
with open_duckdb_connection(get_db_path()) as conn:
|
|
18
|
-
ollama_service = create_ollama_service()
|
|
19
|
-
embedding_provider = create_ollama_embedding_provider(ollama_service)
|
|
20
|
-
retrieve_service = create_retrieve_service(conn, embedding_provider=embedding_provider)
|
|
21
|
-
return retrieve(
|
|
22
|
-
project_dir=project_layout.project_dir,
|
|
23
|
-
retrieve_service=retrieve_service,
|
|
24
|
-
project_id=str(project_layout.read_config_file().project_id),
|
|
25
|
-
text=retrieve_text,
|
|
26
|
-
run_name=run_name,
|
|
27
|
-
limit=limit,
|
|
28
|
-
export_to_file=export_to_file,
|
|
29
|
-
)
|
|
File without changes
|
|
File without changes
|
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
from typing import Any, Optional, Tuple
|
|
2
|
-
|
|
3
|
-
import duckdb
|
|
4
|
-
from _duckdb import ConstraintException
|
|
5
|
-
|
|
6
|
-
from databao_context_engine.storage.exceptions.exceptions import IntegrityError
|
|
7
|
-
from databao_context_engine.storage.models import DatasourceRunDTO
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class DatasourceRunRepository:
|
|
11
|
-
def __init__(self, conn: duckdb.DuckDBPyConnection):
|
|
12
|
-
self._conn = conn
|
|
13
|
-
|
|
14
|
-
def create(
|
|
15
|
-
self,
|
|
16
|
-
*,
|
|
17
|
-
run_id: int,
|
|
18
|
-
plugin: str,
|
|
19
|
-
full_type: str,
|
|
20
|
-
source_id: str,
|
|
21
|
-
storage_directory: str,
|
|
22
|
-
) -> DatasourceRunDTO:
|
|
23
|
-
try:
|
|
24
|
-
row = self._conn.execute(
|
|
25
|
-
"""
|
|
26
|
-
INSERT INTO
|
|
27
|
-
datasource_run(run_id, plugin, full_type, source_id, storage_directory)
|
|
28
|
-
VALUES
|
|
29
|
-
(?, ?, ?, ?, ?)
|
|
30
|
-
RETURNING
|
|
31
|
-
*
|
|
32
|
-
""",
|
|
33
|
-
[run_id, plugin, full_type, source_id, storage_directory],
|
|
34
|
-
).fetchone()
|
|
35
|
-
if row is None:
|
|
36
|
-
raise RuntimeError("datasource_run creation returned no object")
|
|
37
|
-
return self._row_to_dto(row)
|
|
38
|
-
except ConstraintException as e:
|
|
39
|
-
raise IntegrityError from e
|
|
40
|
-
|
|
41
|
-
def get(self, datasource_run_id: int) -> Optional[DatasourceRunDTO]:
|
|
42
|
-
row = self._conn.execute(
|
|
43
|
-
"""
|
|
44
|
-
SELECT
|
|
45
|
-
*
|
|
46
|
-
FROM
|
|
47
|
-
datasource_run
|
|
48
|
-
WHERE
|
|
49
|
-
datasource_run_id = ?
|
|
50
|
-
""",
|
|
51
|
-
[datasource_run_id],
|
|
52
|
-
).fetchone()
|
|
53
|
-
return self._row_to_dto(row) if row else None
|
|
54
|
-
|
|
55
|
-
def update(
|
|
56
|
-
self,
|
|
57
|
-
datasource_run_id: int,
|
|
58
|
-
*,
|
|
59
|
-
plugin: Optional[str] = None,
|
|
60
|
-
full_type: Optional[str] = None,
|
|
61
|
-
source_id: Optional[str] = None,
|
|
62
|
-
storage_directory: Optional[str] = None,
|
|
63
|
-
) -> Optional[DatasourceRunDTO]:
|
|
64
|
-
sets: list[Any] = []
|
|
65
|
-
params: list[Any] = []
|
|
66
|
-
|
|
67
|
-
if plugin is not None:
|
|
68
|
-
sets.append("plugin = ?")
|
|
69
|
-
params.append(plugin)
|
|
70
|
-
if full_type is not None:
|
|
71
|
-
sets.append("full_type = ?")
|
|
72
|
-
params.append(full_type)
|
|
73
|
-
if source_id is not None:
|
|
74
|
-
sets.append("source_id = ?")
|
|
75
|
-
params.append(source_id)
|
|
76
|
-
if storage_directory is not None:
|
|
77
|
-
sets.append("storage_directory = ?")
|
|
78
|
-
params.append(storage_directory)
|
|
79
|
-
|
|
80
|
-
if not sets:
|
|
81
|
-
return self.get(datasource_run_id)
|
|
82
|
-
|
|
83
|
-
params.append(datasource_run_id)
|
|
84
|
-
self._conn.execute(
|
|
85
|
-
f"""
|
|
86
|
-
UPDATE
|
|
87
|
-
datasource_run
|
|
88
|
-
SET
|
|
89
|
-
{", ".join(sets)}
|
|
90
|
-
WHERE
|
|
91
|
-
datasource_run_id = ?
|
|
92
|
-
""",
|
|
93
|
-
params,
|
|
94
|
-
)
|
|
95
|
-
|
|
96
|
-
return self.get(datasource_run_id)
|
|
97
|
-
|
|
98
|
-
def delete(self, datasource_run_id: int) -> int:
|
|
99
|
-
row = self._conn.execute(
|
|
100
|
-
"""
|
|
101
|
-
DELETE FROM
|
|
102
|
-
datasource_run
|
|
103
|
-
WHERE
|
|
104
|
-
datasource_run_id = ?
|
|
105
|
-
RETURNING
|
|
106
|
-
datasource_run_id
|
|
107
|
-
""",
|
|
108
|
-
[datasource_run_id],
|
|
109
|
-
).fetchone()
|
|
110
|
-
return 1 if row else 0
|
|
111
|
-
|
|
112
|
-
def list(self) -> list[DatasourceRunDTO]:
|
|
113
|
-
rows = self._conn.execute(
|
|
114
|
-
"""
|
|
115
|
-
SELECT
|
|
116
|
-
*
|
|
117
|
-
FROM
|
|
118
|
-
datasource_run
|
|
119
|
-
ORDER BY
|
|
120
|
-
datasource_run_id DESC
|
|
121
|
-
"""
|
|
122
|
-
).fetchall()
|
|
123
|
-
return [self._row_to_dto(r) for r in rows]
|
|
124
|
-
|
|
125
|
-
@staticmethod
|
|
126
|
-
def _row_to_dto(row: Tuple) -> DatasourceRunDTO:
|
|
127
|
-
datasource_run_id, run_id, plugin, source_id, storage_directory, created_at, full_type = row
|
|
128
|
-
return DatasourceRunDTO(
|
|
129
|
-
datasource_run_id=int(datasource_run_id),
|
|
130
|
-
run_id=int(run_id),
|
|
131
|
-
plugin=str(plugin),
|
|
132
|
-
full_type=str(full_type),
|
|
133
|
-
source_id=str(source_id),
|
|
134
|
-
storage_directory=str(storage_directory),
|
|
135
|
-
created_at=created_at,
|
|
136
|
-
)
|