databao-context-engine 0.1.4.dev1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. databao_context_engine/__init__.py +14 -1
  2. databao_context_engine/build_sources/build_runner.py +7 -7
  3. databao_context_engine/build_sources/build_wiring.py +8 -10
  4. databao_context_engine/build_sources/plugin_execution.py +9 -12
  5. databao_context_engine/cli/add_datasource_config.py +9 -30
  6. databao_context_engine/cli/commands.py +29 -13
  7. databao_context_engine/databao_context_engine.py +3 -13
  8. databao_context_engine/databao_context_project_manager.py +56 -29
  9. databao_context_engine/datasources/check_config.py +13 -16
  10. databao_context_engine/datasources/datasource_context.py +21 -24
  11. databao_context_engine/datasources/datasource_discovery.py +45 -44
  12. databao_context_engine/datasources/types.py +53 -42
  13. databao_context_engine/generate_configs_schemas.py +4 -5
  14. databao_context_engine/introspection/property_extract.py +52 -47
  15. databao_context_engine/llm/__init__.py +10 -0
  16. databao_context_engine/llm/api.py +57 -0
  17. databao_context_engine/llm/descriptions/ollama.py +1 -3
  18. databao_context_engine/llm/factory.py +5 -2
  19. databao_context_engine/llm/install.py +13 -10
  20. databao_context_engine/llm/runtime.py +3 -5
  21. databao_context_engine/mcp/mcp_server.py +1 -3
  22. databao_context_engine/plugin_loader.py +6 -7
  23. databao_context_engine/pluginlib/build_plugin.py +0 -33
  24. databao_context_engine/plugins/databases/athena/__init__.py +0 -0
  25. databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
  26. databao_context_engine/plugins/databases/{athena_introspector.py → athena/athena_introspector.py} +2 -5
  27. databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +1 -3
  28. databao_context_engine/plugins/databases/base_introspector.py +11 -14
  29. databao_context_engine/plugins/databases/clickhouse/__init__.py +0 -0
  30. databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
  31. databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +2 -5
  32. databao_context_engine/plugins/databases/duckdb/__init__.py +0 -0
  33. databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
  34. databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +4 -7
  35. databao_context_engine/plugins/databases/mssql/__init__.py +0 -0
  36. databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
  37. databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +9 -10
  38. databao_context_engine/plugins/databases/mysql/__init__.py +0 -0
  39. databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
  40. databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +8 -8
  41. databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
  42. databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
  43. databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +9 -16
  44. databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
  45. databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
  46. databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +8 -9
  47. databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
  48. databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
  49. databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
  50. databao_context_engine/plugins/dbt/__init__.py +0 -0
  51. databao_context_engine/plugins/dbt/dbt_chunker.py +47 -0
  52. databao_context_engine/plugins/dbt/dbt_context_extractor.py +106 -0
  53. databao_context_engine/plugins/dbt/dbt_plugin.py +25 -0
  54. databao_context_engine/plugins/dbt/types.py +44 -0
  55. databao_context_engine/plugins/dbt/types_artifacts.py +58 -0
  56. databao_context_engine/plugins/files/__init__.py +0 -0
  57. databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
  58. databao_context_engine/plugins/plugin_loader.py +13 -15
  59. databao_context_engine/plugins/resources/parquet_introspector.py +1 -1
  60. databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
  61. databao_context_engine/project/layout.py +12 -13
  62. databao_context_engine/retrieve_embeddings/retrieve_runner.py +3 -16
  63. databao_context_engine/retrieve_embeddings/retrieve_service.py +13 -6
  64. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +4 -7
  65. databao_context_engine/serialization/yaml.py +5 -5
  66. databao_context_engine/storage/migrate.py +1 -1
  67. databao_context_engine/storage/repositories/vector_search_repository.py +18 -6
  68. databao_context_engine-0.1.6.dist-info/METADATA +228 -0
  69. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/RECORD +71 -55
  70. databao_context_engine/datasources/add_config.py +0 -34
  71. databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
  72. databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
  73. databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
  74. databao_context_engine/retrieve_embeddings/export_results.py +0 -12
  75. databao_context_engine-0.1.4.dev1.dist-info/METADATA +0 -75
  76. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/WHEEL +0 -0
  77. {databao_context_engine-0.1.4.dev1.dist-info → databao_context_engine-0.1.6.dist-info}/entry_points.txt +0 -0
@@ -8,6 +8,13 @@ from databao_context_engine.datasources.check_config import (
8
8
  from databao_context_engine.datasources.datasource_context import DatasourceContext
9
9
  from databao_context_engine.datasources.types import Datasource, DatasourceId
10
10
  from databao_context_engine.init_project import init_dce_project, init_or_get_dce_project
11
+ from databao_context_engine.llm import (
12
+ OllamaError,
13
+ OllamaPermanentError,
14
+ OllamaTransientError,
15
+ download_ollama_models_if_needed,
16
+ install_ollama_if_needed,
17
+ )
11
18
  from databao_context_engine.plugin_loader import DatabaoContextPluginLoader
12
19
  from databao_context_engine.pluginlib.build_plugin import (
13
20
  BuildDatasourcePlugin,
@@ -16,7 +23,7 @@ from databao_context_engine.pluginlib.build_plugin import (
16
23
  DatasourceType,
17
24
  )
18
25
  from databao_context_engine.pluginlib.config import ConfigPropertyDefinition
19
- from databao_context_engine.project.info import DceInfo, get_databao_context_engine_info
26
+ from databao_context_engine.project.info import DceInfo, DceProjectInfo, get_databao_context_engine_info
20
27
  from databao_context_engine.project.init_project import InitErrorReason, InitProjectError
21
28
  from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
22
29
 
@@ -35,6 +42,7 @@ __all__ = [
35
42
  "DatasourceType",
36
43
  "get_databao_context_engine_info",
37
44
  "DceInfo",
45
+ "DceProjectInfo",
38
46
  "init_dce_project",
39
47
  "init_or_get_dce_project",
40
48
  "InitErrorReason",
@@ -44,4 +52,9 @@ __all__ = [
44
52
  "BuildPlugin",
45
53
  "BuildDatasourcePlugin",
46
54
  "BuildFilePlugin",
55
+ "install_ollama_if_needed",
56
+ "download_ollama_models_if_needed",
57
+ "OllamaError",
58
+ "OllamaTransientError",
59
+ "OllamaPermanentError",
47
60
  ]
@@ -13,7 +13,7 @@ from databao_context_engine.datasources.datasource_discovery import discover_dat
13
13
  from databao_context_engine.datasources.types import DatasourceId
14
14
  from databao_context_engine.pluginlib.build_plugin import DatasourceType
15
15
  from databao_context_engine.plugins.plugin_loader import load_plugins
16
- from databao_context_engine.project.layout import get_output_dir
16
+ from databao_context_engine.project.layout import ProjectLayout
17
17
 
18
18
  logger = logging.getLogger(__name__)
19
19
 
@@ -36,7 +36,7 @@ class BuildContextResult:
36
36
 
37
37
 
38
38
  def build(
39
- project_dir: Path,
39
+ project_layout: ProjectLayout,
40
40
  *,
41
41
  build_service: BuildService,
42
42
  ) -> list[BuildContextResult]:
@@ -55,15 +55,15 @@ def build(
55
55
  """
56
56
  plugins = load_plugins()
57
57
 
58
- datasources = discover_datasources(project_dir)
58
+ datasources = discover_datasources(project_layout)
59
59
 
60
60
  if not datasources:
61
- logger.info("No sources discovered under %s", project_dir)
61
+ logger.info("No sources discovered under %s", project_layout.src_dir)
62
62
  return []
63
63
 
64
64
  number_of_failed_builds = 0
65
65
  build_result = []
66
- reset_all_results(get_output_dir(project_dir))
66
+ reset_all_results(project_layout.output_dir)
67
67
  for discovered_datasource in datasources:
68
68
  try:
69
69
  prepared_source = prepare_source(discovered_datasource)
@@ -87,14 +87,14 @@ def build(
87
87
  plugin=plugin,
88
88
  )
89
89
 
90
- output_dir = get_output_dir(project_dir)
90
+ output_dir = project_layout.output_dir
91
91
 
92
92
  context_file_path = export_build_result(output_dir, result)
93
93
  append_result_to_all_results(output_dir, result)
94
94
 
95
95
  build_result.append(
96
96
  BuildContextResult(
97
- datasource_id=DatasourceId.from_string_repr(result.datasource_id),
97
+ datasource_id=discovered_datasource.datasource_id,
98
98
  datasource_type=DatasourceType(full_type=result.datasource_type),
99
99
  context_built_at=result.context_built_at,
100
100
  context_file_path=context_file_path,
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from pathlib import Path
3
2
 
4
3
  from duckdb import DuckDBPyConnection
5
4
 
@@ -12,7 +11,7 @@ from databao_context_engine.llm.factory import (
12
11
  create_ollama_embedding_provider,
13
12
  create_ollama_service,
14
13
  )
15
- from databao_context_engine.project.layout import ensure_project_dir
14
+ from databao_context_engine.project.layout import ProjectLayout
16
15
  from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
17
16
  from databao_context_engine.services.factories import create_chunk_embedding_service
18
17
  from databao_context_engine.storage.connection import open_duckdb_connection
@@ -22,7 +21,9 @@ from databao_context_engine.system.properties import get_db_path
22
21
  logger = logging.getLogger(__name__)
23
22
 
24
23
 
25
- def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddingMode) -> list[BuildContextResult]:
24
+ def build_all_datasources(
25
+ project_layout: ProjectLayout, chunk_embedding_mode: ChunkEmbeddingMode
26
+ ) -> list[BuildContextResult]:
26
27
  """Build the context for all datasources in the project.
27
28
 
28
29
  - Instantiates the build service
@@ -31,21 +32,18 @@ def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddin
31
32
  Returns:
32
33
  A list of all the contexts built.
33
34
  """
34
- ensure_project_dir(project_dir)
35
-
36
- logger.debug(f"Starting to build datasources in project {project_dir.resolve()}")
35
+ logger.debug(f"Starting to build datasources in project {project_layout.project_dir.resolve()}")
37
36
 
38
37
  # Think about alternative solutions. This solution will mirror the current behaviour
39
38
  # The current behaviour only builds what is currently in the /src folder
40
39
  # This will need to change in the future when we can pick which datasources to build
41
- db_path = get_db_path(project_dir)
40
+ db_path = get_db_path(project_layout.project_dir)
42
41
  db_path.parent.mkdir(parents=True, exist_ok=True)
43
42
  if db_path.exists():
44
43
  db_path.unlink()
45
44
 
45
+ migrate(db_path)
46
46
  with open_duckdb_connection(db_path) as conn:
47
- migrate(db_path)
48
-
49
47
  ollama_service = create_ollama_service()
50
48
  embedding_provider = create_ollama_embedding_provider(ollama_service)
51
49
  description_provider = (
@@ -60,7 +58,7 @@ def build_all_datasources(project_dir: Path, chunk_embedding_mode: ChunkEmbeddin
60
58
  chunk_embedding_mode=chunk_embedding_mode,
61
59
  )
62
60
  return build(
63
- project_dir=project_dir,
61
+ project_layout=project_layout,
64
62
  build_service=build_service,
65
63
  )
66
64
 
@@ -2,7 +2,7 @@ from dataclasses import dataclass
2
2
  from datetime import datetime
3
3
  from typing import Any, cast
4
4
 
5
- from databao_context_engine.datasources.types import DatasourceId, PreparedConfig, PreparedDatasource
5
+ from databao_context_engine.datasources.types import PreparedConfig, PreparedDatasource
6
6
  from databao_context_engine.pluginlib.build_plugin import (
7
7
  BuildDatasourcePlugin,
8
8
  BuildFilePlugin,
@@ -39,11 +39,8 @@ class BuiltDatasourceContext:
39
39
 
40
40
  def execute(prepared_datasource: PreparedDatasource, plugin: BuildPlugin) -> BuiltDatasourceContext:
41
41
  built_context = _execute(prepared_datasource, plugin)
42
-
43
- datasource_id = DatasourceId.from_datasource_config_file_path(prepared_datasource.path)
44
-
45
42
  return BuiltDatasourceContext(
46
- datasource_id=str(datasource_id),
43
+ datasource_id=str(prepared_datasource.datasource_id),
47
44
  datasource_type=prepared_datasource.datasource_type.full_type,
48
45
  context_built_at=datetime.now(),
49
46
  context=built_context,
@@ -61,10 +58,10 @@ def _execute(prepared_datasource: PreparedDatasource, plugin: BuildPlugin) -> An
61
58
  config=prepared_datasource.config,
62
59
  datasource_name=prepared_datasource.datasource_name,
63
60
  )
64
- else:
65
- file_plugin = cast(BuildFilePlugin, plugin)
66
- return execute_file_plugin(
67
- plugin=file_plugin,
68
- datasource_type=prepared_datasource.datasource_type,
69
- file_path=prepared_datasource.path,
70
- )
61
+
62
+ file_plugin = cast(BuildFilePlugin, plugin)
63
+ return execute_file_plugin(
64
+ plugin=file_plugin,
65
+ datasource_type=prepared_datasource.datasource_type,
66
+ file_path=prepared_datasource.path,
67
+ )
@@ -1,5 +1,4 @@
1
1
  import os
2
- from collections import defaultdict
3
2
  from pathlib import Path
4
3
  from typing import Any
5
4
 
@@ -28,12 +27,10 @@ def add_datasource_config_interactive(project_dir: Path) -> DatasourceId:
28
27
  )
29
28
  datasource_name = click.prompt("Datasource name?", type=str)
30
29
 
31
- is_datasource_existing = project_manager.datasource_config_exists(
32
- datasource_type=datasource_type, datasource_name=datasource_name
33
- )
34
- if is_datasource_existing:
30
+ datasource_id = project_manager.datasource_config_exists(datasource_name=datasource_name)
31
+ if datasource_id is not None:
35
32
  click.confirm(
36
- f"A config file already exists for this datasource ({datasource_type.config_folder}/{datasource_name}). Do you want to overwrite it?",
33
+ f"A config file already exists for this datasource {datasource_id.relative_path_to_config_file()}. Do you want to overwrite it?",
37
34
  abort=True,
38
35
  default=False,
39
36
  )
@@ -52,25 +49,15 @@ def add_datasource_config_interactive(project_dir: Path) -> DatasourceId:
52
49
 
53
50
 
54
51
  def _ask_for_datasource_type(supported_datasource_types: set[DatasourceType]) -> DatasourceType:
55
- supported_types_by_folder = _group_supported_types_by_folder(supported_datasource_types)
56
-
57
- all_config_folders = sorted(supported_types_by_folder.keys())
58
- config_folder = click.prompt(
59
- "What type of datasource do you want to add?",
60
- type=click.Choice(all_config_folders),
61
- default=all_config_folders[0] if len(all_config_folders) == 1 else None,
62
- )
63
- click.echo(f"Selected type: {config_folder}")
64
-
65
- all_subtypes_for_folder = sorted(supported_types_by_folder[config_folder])
52
+ all_datasource_types = sorted([ds_type.full_type for ds_type in supported_datasource_types])
66
53
  config_type = click.prompt(
67
- "What is the subtype of this datasource?",
68
- type=click.Choice(all_subtypes_for_folder),
69
- default=all_subtypes_for_folder[0] if len(all_subtypes_for_folder) == 1 else None,
54
+ "What type of datasource do you want to add?",
55
+ type=click.Choice(all_datasource_types),
56
+ default=all_datasource_types[0] if len(all_datasource_types) == 1 else None,
70
57
  )
71
- click.echo(f"Selected subtype: {config_type}")
58
+ click.echo(f"Selected type: {config_type}")
72
59
 
73
- return DatasourceType.from_main_and_subtypes(config_folder, config_type)
60
+ return DatasourceType(full_type=config_type)
74
61
 
75
62
 
76
63
  def _ask_for_config_details(config_file_structure: list[ConfigPropertyDefinition]) -> dict[str, Any]:
@@ -146,11 +133,3 @@ def _build_config_content_from_properties(
146
133
  config_content[config_file_property.property_key] = property_value
147
134
 
148
135
  return config_content
149
-
150
-
151
- def _group_supported_types_by_folder(all_plugin_types: set[DatasourceType]) -> dict[str, list[str]]:
152
- main_to_subtypes = defaultdict(list)
153
- for datasource_type in all_plugin_types:
154
- main_to_subtypes[datasource_type.main_type].append(datasource_type.subtype)
155
-
156
- return main_to_subtypes
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import sys
2
3
  from pathlib import Path
3
4
  from typing import Literal
@@ -13,11 +14,11 @@ from databao_context_engine import (
13
14
  InitErrorReason,
14
15
  InitProjectError,
15
16
  init_dce_project,
17
+ install_ollama_if_needed,
16
18
  )
17
19
  from databao_context_engine.cli.datasources import add_datasource_config_cli, check_datasource_connection_cli
18
20
  from databao_context_engine.cli.info import echo_info
19
21
  from databao_context_engine.config.logging import configure_logging
20
- from databao_context_engine.llm.install import resolve_ollama_bin
21
22
  from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
22
23
 
23
24
 
@@ -33,7 +34,7 @@ from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
33
34
  @click.pass_context
34
35
  def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> None:
35
36
  if verbose and quiet:
36
- print("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
37
+ click.echo("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
37
38
  exit(1)
38
39
 
39
40
  if project_dir is None:
@@ -79,7 +80,7 @@ def init(ctx: Context) -> None:
79
80
  click.echo(f"Project initialized successfully at {project_dir.resolve()}")
80
81
 
81
82
  try:
82
- resolve_ollama_bin()
83
+ install_ollama_if_needed()
83
84
  except RuntimeError as e:
84
85
  click.echo(str(e), err=True)
85
86
 
@@ -171,29 +172,44 @@ def build(
171
172
  )
172
173
  @click.option(
173
174
  "-o",
174
- "--output-format",
175
- type=click.Choice(["file", "streamed"]),
176
- default="file",
177
- help="The output format [file (default), streamed]",
175
+ "--output-file",
176
+ type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
177
+ help="If provided, the results are written as YAML into the file given as a path, rather than printed to the console.",
178
+ )
179
+ @click.option(
180
+ "-d",
181
+ "--datasource-id",
182
+ "datasource_ids_as_str",
183
+ type=click.STRING,
184
+ multiple=True,
185
+ help="A datasourceId to restrict the search to. If not provided, search across all datasources. This option can be provided multiple times",
178
186
  )
179
187
  @click.pass_context
180
188
  def retrieve(
181
189
  ctx: Context,
182
190
  retrieve_text: tuple[str, ...],
183
191
  limit: int | None,
184
- output_format: Literal["file", "streamed"],
192
+ output_file: str | None,
193
+ datasource_ids_as_str: tuple[str, ...] | None,
185
194
  ) -> None:
186
195
  """Search the project's built context for the most relevant chunks."""
187
196
  text = " ".join(retrieve_text)
188
197
 
198
+ datasource_ids = (
199
+ [DatasourceId.from_string_repr(datasource_id) for datasource_id in datasource_ids_as_str]
200
+ if datasource_ids_as_str is not None
201
+ else None
202
+ )
203
+
189
204
  databao_engine = DatabaoContextEngine(project_dir=ctx.obj["project_dir"])
190
205
 
191
- retrieve_results = databao_engine.search_context(
192
- retrieve_text=text, limit=limit, export_to_file=output_format == "file"
193
- )
206
+ retrieve_results = databao_engine.search_context(retrieve_text=text, limit=limit, datasource_ids=datasource_ids)
194
207
 
195
- if output_format == "streamed":
196
- display_texts = [context_search_result.context_result for context_search_result in retrieve_results]
208
+ display_texts = [context_search_result.context_result for context_search_result in retrieve_results]
209
+ if output_file is not None:
210
+ Path(output_file).expanduser().write_text(f"---{os.linesep}".join(display_texts))
211
+ click.echo(f"Found {len(retrieve_results)} results, written to {output_file}")
212
+ else:
197
213
  click.echo("\n".join(display_texts))
198
214
 
199
215
 
@@ -88,17 +88,14 @@ class DatabaoContextEngine:
88
88
  def get_all_contexts_formatted(self) -> str:
89
89
  all_contexts = self.get_all_contexts()
90
90
 
91
- all_results = os.linesep.join(
91
+ return os.linesep.join(
92
92
  [f"{get_context_header_for_datasource(context.datasource_id)}{context.context}" for context in all_contexts]
93
93
  )
94
94
 
95
- return all_results
96
-
97
95
  def search_context(
98
96
  self,
99
97
  retrieve_text: str,
100
98
  limit: int | None = None,
101
- export_to_file: bool = False,
102
99
  datasource_ids: list[DatasourceId] | None = None,
103
100
  ) -> list[ContextSearchResult]:
104
101
  """Search in the avaialable context for the closest matches to the given text.
@@ -106,20 +103,13 @@ class DatabaoContextEngine:
106
103
  Args:
107
104
  retrieve_text: The text to search for in the contexts.
108
105
  limit: The maximum number of results to return. If None is provided, a default limit of 10 will be used.
109
- export_to_file: Whether the results should be exported to a file as a side-effect. If True, the results will be exported in a file in the run directory.
110
- datasource_ids: Not Implemented yet: providing this argument changes nothing to the search
106
+ datasource_ids: If provided, the search results will only come from the datasources with these IDs.
111
107
 
112
108
  Returns:
113
109
  A list of the results found for the search, sorted by distance.
114
110
  """
115
- # TODO: Filter with datasource_ids
116
- # TODO: When no run_name is required, we can extract the "export_to_file" side-effect and let the caller (the CLI) do it themselves
117
-
118
111
  results = retrieve_embeddings(
119
- project_layout=self._project_layout,
120
- retrieve_text=retrieve_text,
121
- limit=limit,
122
- export_to_file=export_to_file,
112
+ project_layout=self._project_layout, retrieve_text=retrieve_text, limit=limit, datasource_ids=datasource_ids
123
113
  )
124
114
 
125
115
  return [
@@ -4,10 +4,6 @@ from typing import Any, overload
4
4
 
5
5
  from databao_context_engine.build_sources import BuildContextResult, build_all_datasources
6
6
  from databao_context_engine.databao_context_engine import DatabaoContextEngine
7
- from databao_context_engine.datasources.add_config import (
8
- create_datasource_config_file,
9
- get_datasource_id_for_config_file,
10
- )
11
7
  from databao_context_engine.datasources.check_config import (
12
8
  CheckDatasourceConnectionResult,
13
9
  )
@@ -17,7 +13,14 @@ from databao_context_engine.datasources.check_config import (
17
13
  from databao_context_engine.datasources.datasource_discovery import get_datasource_list
18
14
  from databao_context_engine.datasources.types import Datasource, DatasourceId
19
15
  from databao_context_engine.pluginlib.build_plugin import DatasourceType
20
- from databao_context_engine.project.layout import ensure_datasource_config_file_doesnt_exist, ensure_project_dir
16
+ from databao_context_engine.project.layout import (
17
+ ProjectLayout,
18
+ ensure_project_dir,
19
+ )
20
+ from databao_context_engine.project.layout import (
21
+ create_datasource_config_file as create_datasource_config_file_internal,
22
+ )
23
+ from databao_context_engine.serialization.yaml import to_yaml_string
21
24
  from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
22
25
 
23
26
 
@@ -45,6 +48,7 @@ class DatabaoContextProjectManager:
45
48
  """
46
49
 
47
50
  project_dir: Path
51
+ _project_layout: ProjectLayout
48
52
 
49
53
  def __init__(self, project_dir: Path) -> None:
50
54
  """Initialize the DatabaoContextProjectManager.
@@ -52,7 +56,7 @@ class DatabaoContextProjectManager:
52
56
  Args:
53
57
  project_dir: The root directory of the Databao Context Project.
54
58
  """
55
- ensure_project_dir(project_dir=project_dir)
59
+ self._project_layout = ensure_project_dir(project_dir=project_dir)
56
60
  self.project_dir = project_dir
57
61
 
58
62
  def get_configured_datasource_list(self) -> list[Datasource]:
@@ -64,7 +68,7 @@ class DatabaoContextProjectManager:
64
68
  Returns:
65
69
  The list of datasources configured in the project.
66
70
  """
67
- return get_datasource_list(self.project_dir)
71
+ return get_datasource_list(self._project_layout)
68
72
 
69
73
  def build_context(
70
74
  self,
@@ -83,7 +87,7 @@ class DatabaoContextProjectManager:
83
87
  The list of all built results.
84
88
  """
85
89
  # TODO: Filter which datasources to build by datasource_ids
86
- return build_all_datasources(project_dir=self.project_dir, chunk_embedding_mode=chunk_embedding_mode)
90
+ return build_all_datasources(project_layout=self._project_layout, chunk_embedding_mode=chunk_embedding_mode)
87
91
 
88
92
  def check_datasource_connection(
89
93
  self, datasource_ids: list[DatasourceId] | None = None
@@ -97,7 +101,9 @@ class DatabaoContextProjectManager:
97
101
  The list of all connection check results, sorted by datasource id.
98
102
  """
99
103
  return sorted(
100
- check_datasource_connection_internal(project_dir=self.project_dir, datasource_ids=datasource_ids).values(),
104
+ check_datasource_connection_internal(
105
+ project_layout=self._project_layout, datasource_ids=datasource_ids
106
+ ).values(),
101
107
  key=lambda result: str(result.datasource_id),
102
108
  )
103
109
 
@@ -121,8 +127,8 @@ class DatabaoContextProjectManager:
121
127
  The path to the created datasource configuration file.
122
128
  """
123
129
  # TODO: Before creating the datasource, validate the config content based on which plugin will be used
124
- config_file_path = create_datasource_config_file(
125
- project_dir=self.project_dir,
130
+ config_file = _create_datasource_config_file(
131
+ project_layout=self._project_layout,
126
132
  datasource_type=datasource_type,
127
133
  datasource_name=datasource_name,
128
134
  config_content=config_content,
@@ -130,48 +136,51 @@ class DatabaoContextProjectManager:
130
136
  )
131
137
 
132
138
  return DatasourceConfigFile(
133
- datasource_id=DatasourceId.from_datasource_config_file_path(config_file_path),
134
- config_file_path=config_file_path,
139
+ datasource_id=DatasourceId.from_datasource_config_file_path(self._project_layout, config_file),
140
+ config_file_path=config_file,
135
141
  )
136
142
 
137
143
  @overload
138
- def datasource_config_exists(self, *, datasource_type: DatasourceType, datasource_name: str) -> bool: ...
144
+ def datasource_config_exists(self, *, datasource_name: str) -> DatasourceId | None: ...
139
145
  @overload
140
- def datasource_config_exists(self, *, datasource_id: DatasourceId) -> bool: ...
146
+ def datasource_config_exists(self, *, datasource_id: DatasourceId) -> DatasourceId | None: ...
141
147
 
142
148
  def datasource_config_exists(
143
149
  self,
144
150
  *,
145
- datasource_type: DatasourceType | None = None,
146
151
  datasource_name: str | None = None,
147
152
  datasource_id: DatasourceId | None = None,
148
- ) -> bool:
153
+ ) -> DatasourceId | None:
149
154
  """Check if a datasource configuration file already exists in the project.
150
155
 
151
156
  Args:
152
- datasource_type: The type of the datasource. This must be used in conjunction with datasource_name.
153
- datasource_name: The name of the datasource. This must be used in conjunction with datasource_type.
157
+ datasource_name: The name of the datasource.
154
158
  datasource_id: The id of the datasource. If provided, datasource_type and datasource_name will be ignored.
155
159
 
156
160
  Returns:
157
- True if there is already a datasource configuration file for this datasource, False otherwise.
161
+ datasource_id if there is already a datasource configuration file for this datasource, None otherwise.
158
162
 
159
163
  Raises:
160
164
  ValueError: If the wrong set of arguments is provided.
161
165
  """
162
- if datasource_type is not None and datasource_name is not None:
163
- datasource_id = get_datasource_id_for_config_file(datasource_type, datasource_name)
164
- elif datasource_id is None:
166
+ if datasource_name is not None:
167
+ datasource_id = DatasourceId.from_string_repr(f"{datasource_name}.yaml")
168
+ config_file = self._project_layout.src_dir / datasource_id.relative_path_to_config_file()
169
+ if config_file.is_file():
170
+ return datasource_id
171
+ return None
172
+
173
+ if datasource_id is None:
165
174
  raise ValueError("Either datasource_id or both datasource_type and datasource_name must be provided")
166
175
 
167
176
  try:
168
- ensure_datasource_config_file_doesnt_exist(
169
- project_dir=self.project_dir,
170
- datasource_id=datasource_id,
171
- )
172
- return False
177
+ config_file = self._project_layout.src_dir.joinpath(datasource_id.relative_path_to_config_file())
178
+ if config_file.is_file():
179
+ raise ValueError(f"A config file already exists for {str(datasource_id)}")
180
+
181
+ return datasource_id
173
182
  except ValueError:
174
- return True
183
+ return None
175
184
 
176
185
  def get_engine_for_project(self) -> DatabaoContextEngine:
177
186
  """Instantiate a DatabaoContextEngine for the project.
@@ -180,3 +189,21 @@ class DatabaoContextProjectManager:
180
189
  A DatabaoContextEngine instance for the project.
181
190
  """
182
191
  return DatabaoContextEngine(project_dir=self.project_dir)
192
+
193
+
194
+ def _create_datasource_config_file(
195
+ project_layout: ProjectLayout,
196
+ datasource_type: DatasourceType,
197
+ datasource_name: str,
198
+ config_content: dict[str, Any],
199
+ overwrite_existing: bool,
200
+ ) -> Path:
201
+ last_datasource_name = datasource_name.split("/")[-1]
202
+ basic_config = {"type": datasource_type.full_type, "name": last_datasource_name}
203
+
204
+ return create_datasource_config_file_internal(
205
+ project_layout,
206
+ f"{datasource_name}.yaml",
207
+ to_yaml_string(basic_config | config_content),
208
+ overwrite_existing=overwrite_existing,
209
+ )
@@ -2,7 +2,6 @@ import logging
2
2
  import os
3
3
  from dataclasses import dataclass
4
4
  from enum import Enum
5
- from pathlib import Path
6
5
 
7
6
  from pydantic import ValidationError
8
7
 
@@ -15,7 +14,7 @@ from databao_context_engine.datasources.types import DatasourceId, PreparedConfi
15
14
  from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, NotSupportedError
16
15
  from databao_context_engine.pluginlib.plugin_utils import check_connection_for_datasource
17
16
  from databao_context_engine.plugins.plugin_loader import load_plugins
18
- from databao_context_engine.project.layout import ensure_project_dir
17
+ from databao_context_engine.project.layout import ProjectLayout
19
18
 
20
19
  logger = logging.getLogger(__name__)
21
20
 
@@ -55,21 +54,19 @@ class CheckDatasourceConnectionResult:
55
54
 
56
55
 
57
56
  def check_datasource_connection(
58
- project_dir: Path, *, datasource_ids: list[DatasourceId] | None = None
57
+ project_layout: ProjectLayout, *, datasource_ids: list[DatasourceId] | None = None
59
58
  ) -> dict[DatasourceId, CheckDatasourceConnectionResult]:
60
- ensure_project_dir(project_dir)
61
-
62
59
  if datasource_ids:
63
60
  logger.info(f"Validating datasource(s): {datasource_ids}")
64
- datasources_to_traverse = get_datasource_descriptors(project_dir, datasource_ids)
61
+ datasources_to_traverse = get_datasource_descriptors(project_layout, datasource_ids)
65
62
  else:
66
- datasources_to_traverse = discover_datasources(project_dir)
63
+ datasources_to_traverse = discover_datasources(project_layout)
67
64
 
68
65
  plugins = load_plugins(exclude_file_plugins=True)
69
66
 
70
67
  result = {}
71
68
  for discovered_datasource in datasources_to_traverse:
72
- result_key = DatasourceId.from_datasource_config_file_path(discovered_datasource.path)
69
+ result_key = DatasourceId.from_datasource_config_file_path(project_layout, discovered_datasource.path)
73
70
 
74
71
  try:
75
72
  prepared_source = prepare_source(discovered_datasource)
@@ -127,16 +124,16 @@ def _get_validation_result_from_error(datasource_id: DatasourceId, e: Exception)
127
124
  summary="Config file is invalid",
128
125
  full_message=str(e),
129
126
  )
130
- elif isinstance(e, NotImplementedError | NotSupportedError):
127
+ if isinstance(e, NotImplementedError | NotSupportedError):
131
128
  return CheckDatasourceConnectionResult(
132
129
  datasource_id=datasource_id,
133
130
  connection_status=DatasourceConnectionStatus.UNKNOWN,
134
131
  summary="Plugin doesn't support validating its config",
135
132
  )
136
- else:
137
- return CheckDatasourceConnectionResult(
138
- datasource_id=datasource_id,
139
- connection_status=DatasourceConnectionStatus.INVALID,
140
- summary="Connection with the datasource can not be established",
141
- full_message=str(e),
142
- )
133
+
134
+ return CheckDatasourceConnectionResult(
135
+ datasource_id=datasource_id,
136
+ connection_status=DatasourceConnectionStatus.INVALID,
137
+ summary="Connection with the datasource can not be established",
138
+ full_message=str(e),
139
+ )