databao-context-engine 0.1.4.dev1__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +14 -1
- databao_context_engine/build_sources/build_runner.py +7 -7
- databao_context_engine/build_sources/build_wiring.py +8 -10
- databao_context_engine/build_sources/plugin_execution.py +9 -12
- databao_context_engine/cli/add_datasource_config.py +9 -30
- databao_context_engine/cli/commands.py +29 -13
- databao_context_engine/databao_context_engine.py +3 -13
- databao_context_engine/databao_context_project_manager.py +56 -29
- databao_context_engine/datasources/check_config.py +13 -16
- databao_context_engine/datasources/datasource_context.py +21 -24
- databao_context_engine/datasources/datasource_discovery.py +45 -44
- databao_context_engine/datasources/types.py +53 -42
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/introspection/property_extract.py +52 -47
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +13 -10
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/mcp/mcp_server.py +1 -3
- databao_context_engine/plugin_loader.py +6 -7
- databao_context_engine/pluginlib/build_plugin.py +0 -33
- databao_context_engine/plugins/databases/athena/__init__.py +0 -0
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{athena_introspector.py → athena/athena_introspector.py} +2 -5
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +1 -3
- databao_context_engine/plugins/databases/base_introspector.py +11 -14
- databao_context_engine/plugins/databases/clickhouse/__init__.py +0 -0
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +2 -5
- databao_context_engine/plugins/databases/duckdb/__init__.py +0 -0
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +4 -7
- databao_context_engine/plugins/databases/mssql/__init__.py +0 -0
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +9 -10
- databao_context_engine/plugins/databases/mysql/__init__.py +0 -0
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +8 -8
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +9 -16
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +8 -9
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/dbt/__init__.py +0 -0
- databao_context_engine/plugins/dbt/dbt_chunker.py +47 -0
- databao_context_engine/plugins/dbt/dbt_context_extractor.py +106 -0
- databao_context_engine/plugins/dbt/dbt_plugin.py +25 -0
- databao_context_engine/plugins/dbt/types.py +44 -0
- databao_context_engine/plugins/dbt/types_artifacts.py +58 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +13 -15
- databao_context_engine/plugins/resources/parquet_introspector.py +1 -1
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/layout.py +12 -13
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +3 -16
- databao_context_engine/retrieve_embeddings/retrieve_service.py +13 -6
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +4 -7
- databao_context_engine/serialization/yaml.py +5 -5
- databao_context_engine/storage/migrate.py +1 -1
- databao_context_engine/storage/repositories/vector_search_repository.py +18 -6
- databao_context_engine-0.1.6.dist-info/METADATA +228 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/RECORD +71 -55
- databao_context_engine/datasources/add_config.py +0 -34
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/retrieve_embeddings/export_results.py +0 -12
- databao_context_engine-0.1.4.dev1.dist-info/METADATA +0 -75
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/WHEEL +0 -0
- {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/entry_points.txt +0 -0
|
@@ -8,6 +8,13 @@ from databao_context_engine.datasources.check_config import (
|
|
|
8
8
|
from databao_context_engine.datasources.datasource_context import DatasourceContext
|
|
9
9
|
from databao_context_engine.datasources.types import Datasource, DatasourceId
|
|
10
10
|
from databao_context_engine.init_project import init_dce_project, init_or_get_dce_project
|
|
11
|
+
from databao_context_engine.llm import (
|
|
12
|
+
OllamaError,
|
|
13
|
+
OllamaPermanentError,
|
|
14
|
+
OllamaTransientError,
|
|
15
|
+
download_ollama_models_if_needed,
|
|
16
|
+
install_ollama_if_needed,
|
|
17
|
+
)
|
|
11
18
|
from databao_context_engine.plugin_loader import DatabaoContextPluginLoader
|
|
12
19
|
from databao_context_engine.pluginlib.build_plugin import (
|
|
13
20
|
BuildDatasourcePlugin,
|
|
@@ -16,7 +23,7 @@ from databao_context_engine.pluginlib.build_plugin import (
|
|
|
16
23
|
DatasourceType,
|
|
17
24
|
)
|
|
18
25
|
from databao_context_engine.pluginlib.config import ConfigPropertyDefinition
|
|
19
|
-
from databao_context_engine.project.info import DceInfo, get_databao_context_engine_info
|
|
26
|
+
from databao_context_engine.project.info import DceInfo, DceProjectInfo, get_databao_context_engine_info
|
|
20
27
|
from databao_context_engine.project.init_project import InitErrorReason, InitProjectError
|
|
21
28
|
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
|
|
22
29
|
|
|
@@ -35,6 +42,7 @@ __all__ = [
|
|
|
35
42
|
"DatasourceType",
|
|
36
43
|
"get_databao_context_engine_info",
|
|
37
44
|
"DceInfo",
|
|
45
|
+
"DceProjectInfo",
|
|
38
46
|
"init_dce_project",
|
|
39
47
|
"init_or_get_dce_project",
|
|
40
48
|
"InitErrorReason",
|
|
@@ -44,4 +52,9 @@ __all__ = [
|
|
|
44
52
|
"BuildPlugin",
|
|
45
53
|
"BuildDatasourcePlugin",
|
|
46
54
|
"BuildFilePlugin",
|
|
55
|
+
"install_ollama_if_needed",
|
|
56
|
+
"download_ollama_models_if_needed",
|
|
57
|
+
"OllamaError",
|
|
58
|
+
"OllamaTransientError",
|
|
59
|
+
"OllamaPermanentError",
|
|
47
60
|
]
|
|
@@ -13,7 +13,7 @@ from databao_context_engine.datasources.datasource_discovery import discover_dat
|
|
|
13
13
|
from databao_context_engine.datasources.types import DatasourceId
|
|
14
14
|
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
15
15
|
from databao_context_engine.plugins.plugin_loader import load_plugins
|
|
16
|
-
from databao_context_engine.project.layout import
|
|
16
|
+
from databao_context_engine.project.layout import ProjectLayout
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
@@ -36,7 +36,7 @@ class BuildContextResult:
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
def build(
|
|
39
|
-
|
|
39
|
+
project_layout: ProjectLayout,
|
|
40
40
|
*,
|
|
41
41
|
build_service: BuildService,
|
|
42
42
|
) -> list[BuildContextResult]:
|
|
@@ -55,15 +55,15 @@ def build(
|
|
|
55
55
|
"""
|
|
56
56
|
plugins = load_plugins()
|
|
57
57
|
|
|
58
|
-
datasources = discover_datasources(
|
|
58
|
+
datasources = discover_datasources(project_layout)
|
|
59
59
|
|
|
60
60
|
if not datasources:
|
|
61
|
-
logger.info("No sources discovered under %s",
|
|
61
|
+
logger.info("No sources discovered under %s", project_layout.src_dir)
|
|
62
62
|
return []
|
|
63
63
|
|
|
64
64
|
number_of_failed_builds = 0
|
|
65
65
|
build_result = []
|
|
66
|
-
reset_all_results(
|
|
66
|
+
reset_all_results(project_layout.output_dir)
|
|
67
67
|
for discovered_datasource in datasources:
|
|
68
68
|
try:
|
|
69
69
|
prepared_source = prepare_source(discovered_datasource)
|
|
@@ -87,14 +87,14 @@ def build(
|
|
|
87
87
|
plugin=plugin,
|
|
88
88
|
)
|
|
89
89
|
|
|
90
|
-
output_dir =
|
|
90
|
+
output_dir = project_layout.output_dir
|
|
91
91
|
|
|
92
92
|
context_file_path = export_build_result(output_dir, result)
|
|
93
93
|
append_result_to_all_results(output_dir, result)
|
|
94
94
|
|
|
95
95
|
build_result.append(
|
|
96
96
|
BuildContextResult(
|
|
97
|
-
datasource_id=
|
|
97
|
+
datasource_id=discovered_datasource.datasource_id,
|
|
98
98
|
datasource_type=DatasourceType(full_type=result.datasource_type),
|
|
99
99
|
context_built_at=result.context_built_at,
|
|
100
100
|
context_file_path=context_file_path,
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from pathlib import Path
|
|
3
2
|
|
|
4
3
|
from duckdb import DuckDBPyConnection
|
|
5
4
|
|
|
@@ -12,7 +11,7 @@ from databao_context_engine.llm.factory import (
|
|
|
12
11
|
create_ollama_embedding_provider,
|
|
13
12
|
create_ollama_service,
|
|
14
13
|
)
|
|
15
|
-
from databao_context_engine.project.layout import
|
|
14
|
+
from databao_context_engine.project.layout import ProjectLayout
|
|
16
15
|
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
|
|
17
16
|
from databao_context_engine.services.factories import create_chunk_embedding_service
|
|
18
17
|
from databao_context_engine.storage.connection import open_duckdb_connection
|
|
@@ -22,7 +21,9 @@ from databao_context_engine.system.properties import get_db_path
|
|
|
22
21
|
logger = logging.getLogger(__name__)
|
|
23
22
|
|
|
24
23
|
|
|
25
|
-
def build_all_datasources(
|
|
24
|
+
def build_all_datasources(
|
|
25
|
+
project_layout: ProjectLayout, chunk_embedding_mode: ChunkEmbeddingMode
|
|
26
|
+
) -> list[BuildContextResult]:
|
|
26
27
|
"""Build the context for all datasources in the project.
|
|
27
28
|
|
|
28
29
|
- Instantiates the build service
|
|
@@ -31,21 +32,18 @@ def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddin
|
|
|
31
32
|
Returns:
|
|
32
33
|
A list of all the contexts built.
|
|
33
34
|
"""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
logger.debug(f"Starting to build datasources in project {project_dir.resolve()}")
|
|
35
|
+
logger.debug(f"Starting to build datasources in project {project_layout.project_dir.resolve()}")
|
|
37
36
|
|
|
38
37
|
# Think about alternative solutions. This solution will mirror the current behaviour
|
|
39
38
|
# The current behaviour only builds what is currently in the /src folder
|
|
40
39
|
# This will need to change in the future when we can pick which datasources to build
|
|
41
|
-
db_path = get_db_path(project_dir)
|
|
40
|
+
db_path = get_db_path(project_layout.project_dir)
|
|
42
41
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
43
42
|
if db_path.exists():
|
|
44
43
|
db_path.unlink()
|
|
45
44
|
|
|
45
|
+
migrate(db_path)
|
|
46
46
|
with open_duckdb_connection(db_path) as conn:
|
|
47
|
-
migrate(db_path)
|
|
48
|
-
|
|
49
47
|
ollama_service = create_ollama_service()
|
|
50
48
|
embedding_provider = create_ollama_embedding_provider(ollama_service)
|
|
51
49
|
description_provider = (
|
|
@@ -60,7 +58,7 @@ def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddin
|
|
|
60
58
|
chunk_embedding_mode=chunk_embedding_mode,
|
|
61
59
|
)
|
|
62
60
|
return build(
|
|
63
|
-
|
|
61
|
+
project_layout=project_layout,
|
|
64
62
|
build_service=build_service,
|
|
65
63
|
)
|
|
66
64
|
|
|
@@ -2,7 +2,7 @@ from dataclasses import dataclass
|
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from typing import Any, cast
|
|
4
4
|
|
|
5
|
-
from databao_context_engine.datasources.types import
|
|
5
|
+
from databao_context_engine.datasources.types import PreparedConfig, PreparedDatasource
|
|
6
6
|
from databao_context_engine.pluginlib.build_plugin import (
|
|
7
7
|
BuildDatasourcePlugin,
|
|
8
8
|
BuildFilePlugin,
|
|
@@ -39,11 +39,8 @@ class BuiltDatasourceContext:
|
|
|
39
39
|
|
|
40
40
|
def execute(prepared_datasource: PreparedDatasource, plugin: BuildPlugin) -> BuiltDatasourceContext:
|
|
41
41
|
built_context = _execute(prepared_datasource, plugin)
|
|
42
|
-
|
|
43
|
-
datasource_id = DatasourceId.from_datasource_config_file_path(prepared_datasource.path)
|
|
44
|
-
|
|
45
42
|
return BuiltDatasourceContext(
|
|
46
|
-
datasource_id=str(datasource_id),
|
|
43
|
+
datasource_id=str(prepared_datasource.datasource_id),
|
|
47
44
|
datasource_type=prepared_datasource.datasource_type.full_type,
|
|
48
45
|
context_built_at=datetime.now(),
|
|
49
46
|
context=built_context,
|
|
@@ -61,10 +58,10 @@ def _execute(prepared_datasource: PreparedDatasource, plugin: BuildPlugin) -> An
|
|
|
61
58
|
config=prepared_datasource.config,
|
|
62
59
|
datasource_name=prepared_datasource.datasource_name,
|
|
63
60
|
)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
61
|
+
|
|
62
|
+
file_plugin = cast(BuildFilePlugin, plugin)
|
|
63
|
+
return execute_file_plugin(
|
|
64
|
+
plugin=file_plugin,
|
|
65
|
+
datasource_type=prepared_datasource.datasource_type,
|
|
66
|
+
file_path=prepared_datasource.path,
|
|
67
|
+
)
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from collections import defaultdict
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
from typing import Any
|
|
5
4
|
|
|
@@ -28,12 +27,10 @@ def add_datasource_config_interactive(project_dir: Path) -> DatasourceId:
|
|
|
28
27
|
)
|
|
29
28
|
datasource_name = click.prompt("Datasource name?", type=str)
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
)
|
|
34
|
-
if is_datasource_existing:
|
|
30
|
+
datasource_id = project_manager.datasource_config_exists(datasource_name=datasource_name)
|
|
31
|
+
if datasource_id is not None:
|
|
35
32
|
click.confirm(
|
|
36
|
-
f"A config file already exists for this datasource
|
|
33
|
+
f"A config file already exists for this datasource {datasource_id.relative_path_to_config_file()}. Do you want to overwrite it?",
|
|
37
34
|
abort=True,
|
|
38
35
|
default=False,
|
|
39
36
|
)
|
|
@@ -52,25 +49,15 @@ def add_datasource_config_interactive(project_dir: Path) -> DatasourceId:
|
|
|
52
49
|
|
|
53
50
|
|
|
54
51
|
def _ask_for_datasource_type(supported_datasource_types: set[DatasourceType]) -> DatasourceType:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
all_config_folders = sorted(supported_types_by_folder.keys())
|
|
58
|
-
config_folder = click.prompt(
|
|
59
|
-
"What type of datasource do you want to add?",
|
|
60
|
-
type=click.Choice(all_config_folders),
|
|
61
|
-
default=all_config_folders[0] if len(all_config_folders) == 1 else None,
|
|
62
|
-
)
|
|
63
|
-
click.echo(f"Selected type: {config_folder}")
|
|
64
|
-
|
|
65
|
-
all_subtypes_for_folder = sorted(supported_types_by_folder[config_folder])
|
|
52
|
+
all_datasource_types = sorted([ds_type.full_type for ds_type in supported_datasource_types])
|
|
66
53
|
config_type = click.prompt(
|
|
67
|
-
"What
|
|
68
|
-
type=click.Choice(
|
|
69
|
-
default=
|
|
54
|
+
"What type of datasource do you want to add?",
|
|
55
|
+
type=click.Choice(all_datasource_types),
|
|
56
|
+
default=all_datasource_types[0] if len(all_datasource_types) == 1 else None,
|
|
70
57
|
)
|
|
71
|
-
click.echo(f"Selected
|
|
58
|
+
click.echo(f"Selected type: {config_type}")
|
|
72
59
|
|
|
73
|
-
return DatasourceType
|
|
60
|
+
return DatasourceType(full_type=config_type)
|
|
74
61
|
|
|
75
62
|
|
|
76
63
|
def _ask_for_config_details(config_file_structure: list[ConfigPropertyDefinition]) -> dict[str, Any]:
|
|
@@ -146,11 +133,3 @@ def _build_config_content_from_properties(
|
|
|
146
133
|
config_content[config_file_property.property_key] = property_value
|
|
147
134
|
|
|
148
135
|
return config_content
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def _group_supported_types_by_folder(all_plugin_types: set[DatasourceType]) -> dict[str, list[str]]:
|
|
152
|
-
main_to_subtypes = defaultdict(list)
|
|
153
|
-
for datasource_type in all_plugin_types:
|
|
154
|
-
main_to_subtypes[datasource_type.main_type].append(datasource_type.subtype)
|
|
155
|
-
|
|
156
|
-
return main_to_subtypes
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import sys
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from typing import Literal
|
|
@@ -13,11 +14,11 @@ from databao_context_engine import (
|
|
|
13
14
|
InitErrorReason,
|
|
14
15
|
InitProjectError,
|
|
15
16
|
init_dce_project,
|
|
17
|
+
install_ollama_if_needed,
|
|
16
18
|
)
|
|
17
19
|
from databao_context_engine.cli.datasources import add_datasource_config_cli, check_datasource_connection_cli
|
|
18
20
|
from databao_context_engine.cli.info import echo_info
|
|
19
21
|
from databao_context_engine.config.logging import configure_logging
|
|
20
|
-
from databao_context_engine.llm.install import resolve_ollama_bin
|
|
21
22
|
from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
|
|
22
23
|
|
|
23
24
|
|
|
@@ -33,7 +34,7 @@ from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
|
|
|
33
34
|
@click.pass_context
|
|
34
35
|
def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> None:
|
|
35
36
|
if verbose and quiet:
|
|
36
|
-
|
|
37
|
+
click.echo("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
|
|
37
38
|
exit(1)
|
|
38
39
|
|
|
39
40
|
if project_dir is None:
|
|
@@ -79,7 +80,7 @@ def init(ctx: Context) -> None:
|
|
|
79
80
|
click.echo(f"Project initialized successfully at {project_dir.resolve()}")
|
|
80
81
|
|
|
81
82
|
try:
|
|
82
|
-
|
|
83
|
+
install_ollama_if_needed()
|
|
83
84
|
except RuntimeError as e:
|
|
84
85
|
click.echo(str(e), err=True)
|
|
85
86
|
|
|
@@ -171,29 +172,44 @@ def build(
|
|
|
171
172
|
)
|
|
172
173
|
@click.option(
|
|
173
174
|
"-o",
|
|
174
|
-
"--output-
|
|
175
|
-
type=click.
|
|
176
|
-
|
|
177
|
-
|
|
175
|
+
"--output-file",
|
|
176
|
+
type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
|
|
177
|
+
help="If provided, the results are written as YAML into the file given as a path, rather than printed to the console.",
|
|
178
|
+
)
|
|
179
|
+
@click.option(
|
|
180
|
+
"-d",
|
|
181
|
+
"--datasource-id",
|
|
182
|
+
"datasource_ids_as_str",
|
|
183
|
+
type=click.STRING,
|
|
184
|
+
multiple=True,
|
|
185
|
+
help="A datasourceId to restrict the search to. If not provided, search across all datasources. This option can be provided multiple times",
|
|
178
186
|
)
|
|
179
187
|
@click.pass_context
|
|
180
188
|
def retrieve(
|
|
181
189
|
ctx: Context,
|
|
182
190
|
retrieve_text: tuple[str, ...],
|
|
183
191
|
limit: int | None,
|
|
184
|
-
|
|
192
|
+
output_file: str | None,
|
|
193
|
+
datasource_ids_as_str: tuple[str, ...] | None,
|
|
185
194
|
) -> None:
|
|
186
195
|
"""Search the project's built context for the most relevant chunks."""
|
|
187
196
|
text = " ".join(retrieve_text)
|
|
188
197
|
|
|
198
|
+
datasource_ids = (
|
|
199
|
+
[DatasourceId.from_string_repr(datasource_id) for datasource_id in datasource_ids_as_str]
|
|
200
|
+
if datasource_ids_as_str is not None
|
|
201
|
+
else None
|
|
202
|
+
)
|
|
203
|
+
|
|
189
204
|
databao_engine = DatabaoContextEngine(project_dir=ctx.obj["project_dir"])
|
|
190
205
|
|
|
191
|
-
retrieve_results = databao_engine.search_context(
|
|
192
|
-
retrieve_text=text, limit=limit, export_to_file=output_format == "file"
|
|
193
|
-
)
|
|
206
|
+
retrieve_results = databao_engine.search_context(retrieve_text=text, limit=limit, datasource_ids=datasource_ids)
|
|
194
207
|
|
|
195
|
-
|
|
196
|
-
|
|
208
|
+
display_texts = [context_search_result.context_result for context_search_result in retrieve_results]
|
|
209
|
+
if output_file is not None:
|
|
210
|
+
Path(output_file).expanduser().write_text(f"---{os.linesep}".join(display_texts))
|
|
211
|
+
click.echo(f"Found {len(retrieve_results)} results, written to {output_file}")
|
|
212
|
+
else:
|
|
197
213
|
click.echo("\n".join(display_texts))
|
|
198
214
|
|
|
199
215
|
|
|
@@ -88,17 +88,14 @@ class DatabaoContextEngine:
|
|
|
88
88
|
def get_all_contexts_formatted(self) -> str:
|
|
89
89
|
all_contexts = self.get_all_contexts()
|
|
90
90
|
|
|
91
|
-
|
|
91
|
+
return os.linesep.join(
|
|
92
92
|
[f"{get_context_header_for_datasource(context.datasource_id)}{context.context}" for context in all_contexts]
|
|
93
93
|
)
|
|
94
94
|
|
|
95
|
-
return all_results
|
|
96
|
-
|
|
97
95
|
def search_context(
|
|
98
96
|
self,
|
|
99
97
|
retrieve_text: str,
|
|
100
98
|
limit: int | None = None,
|
|
101
|
-
export_to_file: bool = False,
|
|
102
99
|
datasource_ids: list[DatasourceId] | None = None,
|
|
103
100
|
) -> list[ContextSearchResult]:
|
|
104
101
|
"""Search in the available context for the closest matches to the given text.
|
|
@@ -106,20 +103,13 @@ class DatabaoContextEngine:
|
|
|
106
103
|
Args:
|
|
107
104
|
retrieve_text: The text to search for in the contexts.
|
|
108
105
|
limit: The maximum number of results to return. If None is provided, a default limit of 10 will be used.
|
|
109
|
-
|
|
110
|
-
datasource_ids: Not Implemented yet: providing this argument changes nothing to the search
|
|
106
|
+
datasource_ids: If provided, the search results will only come from the datasources with these IDs.
|
|
111
107
|
|
|
112
108
|
Returns:
|
|
113
109
|
A list of the results found for the search, sorted by distance.
|
|
114
110
|
"""
|
|
115
|
-
# TODO: Filter with datasource_ids
|
|
116
|
-
# TODO: When no run_name is required, we can extract the "export_to_file" side-effect and let the caller (the CLI) do it themselves
|
|
117
|
-
|
|
118
111
|
results = retrieve_embeddings(
|
|
119
|
-
project_layout=self._project_layout,
|
|
120
|
-
retrieve_text=retrieve_text,
|
|
121
|
-
limit=limit,
|
|
122
|
-
export_to_file=export_to_file,
|
|
112
|
+
project_layout=self._project_layout, retrieve_text=retrieve_text, limit=limit, datasource_ids=datasource_ids
|
|
123
113
|
)
|
|
124
114
|
|
|
125
115
|
return [
|
|
@@ -4,10 +4,6 @@ from typing import Any, overload
|
|
|
4
4
|
|
|
5
5
|
from databao_context_engine.build_sources import BuildContextResult, build_all_datasources
|
|
6
6
|
from databao_context_engine.databao_context_engine import DatabaoContextEngine
|
|
7
|
-
from databao_context_engine.datasources.add_config import (
|
|
8
|
-
create_datasource_config_file,
|
|
9
|
-
get_datasource_id_for_config_file,
|
|
10
|
-
)
|
|
11
7
|
from databao_context_engine.datasources.check_config import (
|
|
12
8
|
CheckDatasourceConnectionResult,
|
|
13
9
|
)
|
|
@@ -17,7 +13,14 @@ from databao_context_engine.datasources.check_config import (
|
|
|
17
13
|
from databao_context_engine.datasources.datasource_discovery import get_datasource_list
|
|
18
14
|
from databao_context_engine.datasources.types import Datasource, DatasourceId
|
|
19
15
|
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
20
|
-
from databao_context_engine.project.layout import
|
|
16
|
+
from databao_context_engine.project.layout import (
|
|
17
|
+
ProjectLayout,
|
|
18
|
+
ensure_project_dir,
|
|
19
|
+
)
|
|
20
|
+
from databao_context_engine.project.layout import (
|
|
21
|
+
create_datasource_config_file as create_datasource_config_file_internal,
|
|
22
|
+
)
|
|
23
|
+
from databao_context_engine.serialization.yaml import to_yaml_string
|
|
21
24
|
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
|
|
22
25
|
|
|
23
26
|
|
|
@@ -45,6 +48,7 @@ class DatabaoContextProjectManager:
|
|
|
45
48
|
"""
|
|
46
49
|
|
|
47
50
|
project_dir: Path
|
|
51
|
+
_project_layout: ProjectLayout
|
|
48
52
|
|
|
49
53
|
def __init__(self, project_dir: Path) -> None:
|
|
50
54
|
"""Initialize the DatabaoContextProjectManager.
|
|
@@ -52,7 +56,7 @@ class DatabaoContextProjectManager:
|
|
|
52
56
|
Args:
|
|
53
57
|
project_dir: The root directory of the Databao Context Project.
|
|
54
58
|
"""
|
|
55
|
-
ensure_project_dir(project_dir=project_dir)
|
|
59
|
+
self._project_layout = ensure_project_dir(project_dir=project_dir)
|
|
56
60
|
self.project_dir = project_dir
|
|
57
61
|
|
|
58
62
|
def get_configured_datasource_list(self) -> list[Datasource]:
|
|
@@ -64,7 +68,7 @@ class DatabaoContextProjectManager:
|
|
|
64
68
|
Returns:
|
|
65
69
|
The list of datasources configured in the project.
|
|
66
70
|
"""
|
|
67
|
-
return get_datasource_list(self.
|
|
71
|
+
return get_datasource_list(self._project_layout)
|
|
68
72
|
|
|
69
73
|
def build_context(
|
|
70
74
|
self,
|
|
@@ -83,7 +87,7 @@ class DatabaoContextProjectManager:
|
|
|
83
87
|
The list of all built results.
|
|
84
88
|
"""
|
|
85
89
|
# TODO: Filter which datasources to build by datasource_ids
|
|
86
|
-
return build_all_datasources(
|
|
90
|
+
return build_all_datasources(project_layout=self._project_layout, chunk_embedding_mode=chunk_embedding_mode)
|
|
87
91
|
|
|
88
92
|
def check_datasource_connection(
|
|
89
93
|
self, datasource_ids: list[DatasourceId] | None = None
|
|
@@ -97,7 +101,9 @@ class DatabaoContextProjectManager:
|
|
|
97
101
|
The list of all connection check results, sorted by datasource id.
|
|
98
102
|
"""
|
|
99
103
|
return sorted(
|
|
100
|
-
check_datasource_connection_internal(
|
|
104
|
+
check_datasource_connection_internal(
|
|
105
|
+
project_layout=self._project_layout, datasource_ids=datasource_ids
|
|
106
|
+
).values(),
|
|
101
107
|
key=lambda result: str(result.datasource_id),
|
|
102
108
|
)
|
|
103
109
|
|
|
@@ -121,8 +127,8 @@ class DatabaoContextProjectManager:
|
|
|
121
127
|
The path to the created datasource configuration file.
|
|
122
128
|
"""
|
|
123
129
|
# TODO: Before creating the datasource, validate the config content based on which plugin will be used
|
|
124
|
-
|
|
125
|
-
|
|
130
|
+
config_file = _create_datasource_config_file(
|
|
131
|
+
project_layout=self._project_layout,
|
|
126
132
|
datasource_type=datasource_type,
|
|
127
133
|
datasource_name=datasource_name,
|
|
128
134
|
config_content=config_content,
|
|
@@ -130,48 +136,51 @@ class DatabaoContextProjectManager:
|
|
|
130
136
|
)
|
|
131
137
|
|
|
132
138
|
return DatasourceConfigFile(
|
|
133
|
-
datasource_id=DatasourceId.from_datasource_config_file_path(
|
|
134
|
-
config_file_path=
|
|
139
|
+
datasource_id=DatasourceId.from_datasource_config_file_path(self._project_layout, config_file),
|
|
140
|
+
config_file_path=config_file,
|
|
135
141
|
)
|
|
136
142
|
|
|
137
143
|
@overload
|
|
138
|
-
def datasource_config_exists(self, *,
|
|
144
|
+
def datasource_config_exists(self, *, datasource_name: str) -> DatasourceId | None: ...
|
|
139
145
|
@overload
|
|
140
|
-
def datasource_config_exists(self, *, datasource_id: DatasourceId) ->
|
|
146
|
+
def datasource_config_exists(self, *, datasource_id: DatasourceId) -> DatasourceId | None: ...
|
|
141
147
|
|
|
142
148
|
def datasource_config_exists(
|
|
143
149
|
self,
|
|
144
150
|
*,
|
|
145
|
-
datasource_type: DatasourceType | None = None,
|
|
146
151
|
datasource_name: str | None = None,
|
|
147
152
|
datasource_id: DatasourceId | None = None,
|
|
148
|
-
) ->
|
|
153
|
+
) -> DatasourceId | None:
|
|
149
154
|
"""Check if a datasource configuration file already exists in the project.
|
|
150
155
|
|
|
151
156
|
Args:
|
|
152
|
-
|
|
153
|
-
datasource_name: The name of the datasource. This must be used in conjunction with datasource_type.
|
|
157
|
+
datasource_name: The name of the datasource.
|
|
154
158
|
datasource_id: The id of the datasource. If provided, datasource_type and datasource_name will be ignored.
|
|
155
159
|
|
|
156
160
|
Returns:
|
|
157
|
-
|
|
161
|
+
datasource_id if there is already a datasource configuration file for this datasource, None otherwise.
|
|
158
162
|
|
|
159
163
|
Raises:
|
|
160
164
|
ValueError: If the wrong set of arguments is provided.
|
|
161
165
|
"""
|
|
162
|
-
if
|
|
163
|
-
datasource_id =
|
|
164
|
-
|
|
166
|
+
if datasource_name is not None:
|
|
167
|
+
datasource_id = DatasourceId.from_string_repr(f"{datasource_name}.yaml")
|
|
168
|
+
config_file = self._project_layout.src_dir / datasource_id.relative_path_to_config_file()
|
|
169
|
+
if config_file.is_file():
|
|
170
|
+
return datasource_id
|
|
171
|
+
return None
|
|
172
|
+
|
|
173
|
+
if datasource_id is None:
|
|
165
174
|
raise ValueError("Either datasource_id or both datasource_type and datasource_name must be provided")
|
|
166
175
|
|
|
167
176
|
try:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
datasource_id
|
|
171
|
-
|
|
172
|
-
return
|
|
177
|
+
config_file = self._project_layout.src_dir.joinpath(datasource_id.relative_path_to_config_file())
|
|
178
|
+
if config_file.is_file():
|
|
179
|
+
raise ValueError(f"A config file already exists for {str(datasource_id)}")
|
|
180
|
+
|
|
181
|
+
return datasource_id
|
|
173
182
|
except ValueError:
|
|
174
|
-
return
|
|
183
|
+
return None
|
|
175
184
|
|
|
176
185
|
def get_engine_for_project(self) -> DatabaoContextEngine:
|
|
177
186
|
"""Instantiate a DatabaoContextEngine for the project.
|
|
@@ -180,3 +189,21 @@ class DatabaoContextProjectManager:
|
|
|
180
189
|
A DatabaoContextEngine instance for the project.
|
|
181
190
|
"""
|
|
182
191
|
return DatabaoContextEngine(project_dir=self.project_dir)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _create_datasource_config_file(
|
|
195
|
+
project_layout: ProjectLayout,
|
|
196
|
+
datasource_type: DatasourceType,
|
|
197
|
+
datasource_name: str,
|
|
198
|
+
config_content: dict[str, Any],
|
|
199
|
+
overwrite_existing: bool,
|
|
200
|
+
) -> Path:
|
|
201
|
+
last_datasource_name = datasource_name.split("/")[-1]
|
|
202
|
+
basic_config = {"type": datasource_type.full_type, "name": last_datasource_name}
|
|
203
|
+
|
|
204
|
+
return create_datasource_config_file_internal(
|
|
205
|
+
project_layout,
|
|
206
|
+
f"{datasource_name}.yaml",
|
|
207
|
+
to_yaml_string(basic_config | config_content),
|
|
208
|
+
overwrite_existing=overwrite_existing,
|
|
209
|
+
)
|
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
import os
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from enum import Enum
|
|
5
|
-
from pathlib import Path
|
|
6
5
|
|
|
7
6
|
from pydantic import ValidationError
|
|
8
7
|
|
|
@@ -15,7 +14,7 @@ from databao_context_engine.datasources.types import DatasourceId, PreparedConfi
|
|
|
15
14
|
from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, NotSupportedError
|
|
16
15
|
from databao_context_engine.pluginlib.plugin_utils import check_connection_for_datasource
|
|
17
16
|
from databao_context_engine.plugins.plugin_loader import load_plugins
|
|
18
|
-
from databao_context_engine.project.layout import
|
|
17
|
+
from databao_context_engine.project.layout import ProjectLayout
|
|
19
18
|
|
|
20
19
|
logger = logging.getLogger(__name__)
|
|
21
20
|
|
|
@@ -55,21 +54,19 @@ class CheckDatasourceConnectionResult:
|
|
|
55
54
|
|
|
56
55
|
|
|
57
56
|
def check_datasource_connection(
|
|
58
|
-
|
|
57
|
+
project_layout: ProjectLayout, *, datasource_ids: list[DatasourceId] | None = None
|
|
59
58
|
) -> dict[DatasourceId, CheckDatasourceConnectionResult]:
|
|
60
|
-
ensure_project_dir(project_dir)
|
|
61
|
-
|
|
62
59
|
if datasource_ids:
|
|
63
60
|
logger.info(f"Validating datasource(s): {datasource_ids}")
|
|
64
|
-
datasources_to_traverse = get_datasource_descriptors(
|
|
61
|
+
datasources_to_traverse = get_datasource_descriptors(project_layout, datasource_ids)
|
|
65
62
|
else:
|
|
66
|
-
datasources_to_traverse = discover_datasources(
|
|
63
|
+
datasources_to_traverse = discover_datasources(project_layout)
|
|
67
64
|
|
|
68
65
|
plugins = load_plugins(exclude_file_plugins=True)
|
|
69
66
|
|
|
70
67
|
result = {}
|
|
71
68
|
for discovered_datasource in datasources_to_traverse:
|
|
72
|
-
result_key = DatasourceId.from_datasource_config_file_path(discovered_datasource.path)
|
|
69
|
+
result_key = DatasourceId.from_datasource_config_file_path(project_layout, discovered_datasource.path)
|
|
73
70
|
|
|
74
71
|
try:
|
|
75
72
|
prepared_source = prepare_source(discovered_datasource)
|
|
@@ -127,16 +124,16 @@ def _get_validation_result_from_error(datasource_id: DatasourceId, e: Exception)
|
|
|
127
124
|
summary="Config file is invalid",
|
|
128
125
|
full_message=str(e),
|
|
129
126
|
)
|
|
130
|
-
|
|
127
|
+
if isinstance(e, NotImplementedError | NotSupportedError):
|
|
131
128
|
return CheckDatasourceConnectionResult(
|
|
132
129
|
datasource_id=datasource_id,
|
|
133
130
|
connection_status=DatasourceConnectionStatus.UNKNOWN,
|
|
134
131
|
summary="Plugin doesn't support validating its config",
|
|
135
132
|
)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
133
|
+
|
|
134
|
+
return CheckDatasourceConnectionResult(
|
|
135
|
+
datasource_id=datasource_id,
|
|
136
|
+
connection_status=DatasourceConnectionStatus.INVALID,
|
|
137
|
+
summary="Connection with the datasource can not be established",
|
|
138
|
+
full_message=str(e),
|
|
139
|
+
)
|