databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. databao_context_engine/__init__.py +32 -7
  2. databao_context_engine/build_sources/__init__.py +4 -0
  3. databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
  4. databao_context_engine/build_sources/build_service.py +53 -0
  5. databao_context_engine/build_sources/build_wiring.py +82 -0
  6. databao_context_engine/build_sources/export_results.py +41 -0
  7. databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
  8. databao_context_engine/cli/add_datasource_config.py +49 -44
  9. databao_context_engine/cli/commands.py +40 -55
  10. databao_context_engine/cli/info.py +3 -2
  11. databao_context_engine/databao_context_engine.py +127 -0
  12. databao_context_engine/databao_context_project_manager.py +147 -30
  13. databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
  14. databao_context_engine/datasources/datasource_context.py +90 -0
  15. databao_context_engine/datasources/datasource_discovery.py +143 -0
  16. databao_context_engine/datasources/types.py +194 -0
  17. databao_context_engine/generate_configs_schemas.py +4 -5
  18. databao_context_engine/init_project.py +25 -3
  19. databao_context_engine/introspection/property_extract.py +76 -57
  20. databao_context_engine/llm/__init__.py +10 -0
  21. databao_context_engine/llm/api.py +57 -0
  22. databao_context_engine/llm/descriptions/ollama.py +1 -3
  23. databao_context_engine/llm/errors.py +2 -8
  24. databao_context_engine/llm/factory.py +5 -2
  25. databao_context_engine/llm/install.py +26 -30
  26. databao_context_engine/llm/runtime.py +3 -5
  27. databao_context_engine/llm/service.py +1 -3
  28. databao_context_engine/mcp/mcp_runner.py +4 -2
  29. databao_context_engine/mcp/mcp_server.py +9 -11
  30. databao_context_engine/plugin_loader.py +110 -0
  31. databao_context_engine/pluginlib/build_plugin.py +12 -29
  32. databao_context_engine/pluginlib/config.py +16 -2
  33. databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
  34. databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
  35. databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
  36. databao_context_engine/plugins/databases/base_introspector.py +11 -12
  37. databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
  38. databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
  39. databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
  40. databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
  41. databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
  42. databao_context_engine/plugins/databases/introspection_scope.py +11 -9
  43. databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
  44. databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
  45. databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
  46. databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
  47. databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
  48. databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
  49. databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
  50. databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
  51. databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
  52. databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
  53. databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
  54. databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
  55. databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
  56. databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
  57. databao_context_engine/plugins/duckdb_tools.py +18 -0
  58. databao_context_engine/plugins/files/__init__.py +0 -0
  59. databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
  60. databao_context_engine/plugins/plugin_loader.py +58 -52
  61. databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
  62. databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
  63. databao_context_engine/project/info.py +34 -2
  64. databao_context_engine/project/init_project.py +16 -7
  65. databao_context_engine/project/layout.py +14 -15
  66. databao_context_engine/retrieve_embeddings/__init__.py +3 -0
  67. databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
  68. databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
  69. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
  70. databao_context_engine/serialization/__init__.py +0 -0
  71. databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
  72. databao_context_engine/services/chunk_embedding_service.py +23 -11
  73. databao_context_engine/services/factories.py +1 -46
  74. databao_context_engine/services/persistence_service.py +11 -11
  75. databao_context_engine/storage/connection.py +11 -7
  76. databao_context_engine/storage/exceptions/exceptions.py +2 -2
  77. databao_context_engine/storage/migrate.py +3 -5
  78. databao_context_engine/storage/migrations/V01__init.sql +6 -31
  79. databao_context_engine/storage/models.py +2 -23
  80. databao_context_engine/storage/repositories/chunk_repository.py +16 -12
  81. databao_context_engine/storage/repositories/factories.py +1 -12
  82. databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
  83. databao_context_engine/system/properties.py +4 -2
  84. databao_context_engine-0.1.5.dist-info/METADATA +228 -0
  85. databao_context_engine-0.1.5.dist-info/RECORD +135 -0
  86. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
  87. databao_context_engine/build_sources/internal/build_service.py +0 -77
  88. databao_context_engine/build_sources/internal/build_wiring.py +0 -52
  89. databao_context_engine/build_sources/internal/export_results.py +0 -43
  90. databao_context_engine/build_sources/public/api.py +0 -4
  91. databao_context_engine/databao_engine.py +0 -85
  92. databao_context_engine/datasource_config/add_config.py +0 -50
  93. databao_context_engine/datasource_config/datasource_context.py +0 -60
  94. databao_context_engine/mcp/all_results_tool.py +0 -5
  95. databao_context_engine/mcp/retrieve_tool.py +0 -22
  96. databao_context_engine/plugins/databases/athena_introspector.py +0 -101
  97. databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
  98. databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
  99. databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
  100. databao_context_engine/project/datasource_discovery.py +0 -141
  101. databao_context_engine/project/runs.py +0 -39
  102. databao_context_engine/project/types.py +0 -134
  103. databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
  104. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
  105. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
  106. databao_context_engine/retrieve_embeddings/public/api.py +0 -3
  107. databao_context_engine/services/run_name_policy.py +0 -8
  108. databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
  109. databao_context_engine/storage/repositories/run_repository.py +0 -157
  110. databao_context_engine-0.1.1.dist-info/METADATA +0 -186
  111. databao_context_engine-0.1.1.dist-info/RECORD +0 -135
  112. /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
  113. /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
  114. /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
  115. /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
  116. /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
  117. /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
  118. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import sys
2
3
  from pathlib import Path
3
4
  from typing import Literal
@@ -13,13 +14,12 @@ from databao_context_engine import (
13
14
  InitErrorReason,
14
15
  InitProjectError,
15
16
  init_dce_project,
17
+ install_ollama_if_needed,
16
18
  )
17
19
  from databao_context_engine.cli.datasources import add_datasource_config_cli, check_datasource_connection_cli
18
20
  from databao_context_engine.cli.info import echo_info
19
21
  from databao_context_engine.config.logging import configure_logging
20
- from databao_context_engine.llm.install import resolve_ollama_bin
21
22
  from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
22
- from databao_context_engine.storage.migrate import migrate
23
23
 
24
24
 
25
25
  @click.group()
@@ -34,7 +34,7 @@ from databao_context_engine.storage.migrate import migrate
34
34
  @click.pass_context
35
35
  def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> None:
36
36
  if verbose and quiet:
37
- print("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
37
+ click.echo("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
38
38
  exit(1)
39
39
 
40
40
  if project_dir is None:
@@ -49,25 +49,18 @@ def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> No
49
49
  ctx.obj["quiet"] = quiet
50
50
  ctx.obj["project_dir"] = project_path
51
51
 
52
- migrate()
53
-
54
52
 
55
53
  @dce.command()
56
54
  @click.pass_context
57
55
  def info(ctx: Context) -> None:
58
- """
59
- Display system-wide information
60
- """
61
-
56
+ """Display system-wide information."""
62
57
  echo_info(ctx.obj["project_dir"])
63
58
 
64
59
 
65
60
  @dce.command()
66
61
  @click.pass_context
67
62
  def init(ctx: Context) -> None:
68
- """
69
- Create an empty Databao Context Engine project
70
- """
63
+ """Create an empty Databao Context Engine project."""
71
64
  project_dir = ctx.obj["project_dir"]
72
65
  try:
73
66
  init_dce_project(project_dir=project_dir)
@@ -87,7 +80,7 @@ def init(ctx: Context) -> None:
87
80
  click.echo(f"Project initialized successfully at {project_dir.resolve()}")
88
81
 
89
82
  try:
90
- resolve_ollama_bin()
83
+ install_ollama_if_needed()
91
84
  except RuntimeError as e:
92
85
  click.echo(str(e), err=True)
93
86
 
@@ -97,17 +90,14 @@ def init(ctx: Context) -> None:
97
90
 
98
91
  @dce.group()
99
92
  def datasource() -> None:
100
- """
101
- Manage datasource configurations
102
- """
93
+ """Manage datasource configurations."""
103
94
  pass
104
95
 
105
96
 
106
97
  @datasource.command(name="add")
107
98
  @click.pass_context
108
99
  def add_datasource_config(ctx: Context) -> None:
109
- """
110
- Add a new datasource configuration.
100
+ """Add a new datasource configuration.
111
101
 
112
102
  The command will ask for all relevant information for that datasource and save it in your Databao Context Engine project.
113
103
  """
@@ -122,15 +112,13 @@ def add_datasource_config(ctx: Context) -> None:
122
112
  )
123
113
  @click.pass_context
124
114
  def check_datasource_config(ctx: Context, datasources_config_files: list[str] | None) -> None:
125
- """
126
- Check whether a datasource configuration is valid.
115
+ """Check whether a datasource configuration is valid.
127
116
 
128
117
  The configuration is considered as valid if a connection with the datasource can be established.
129
118
 
130
119
  By default, all datasources declared in the project will be checked.
131
120
  You can explicitly list which datasources to validate by using the [DATASOURCES_CONFIG_FILES] argument. Each argument must be the path to the file within the src folder (e.g. my-folder/my-config.yaml)
132
121
  """
133
-
134
122
  datasource_ids = (
135
123
  [DatasourceId.from_string_repr(datasource_config_file) for datasource_config_file in datasources_config_files]
136
124
  if datasources_config_files is not None
@@ -157,10 +145,9 @@ def build(
157
145
  "embeddable_text_only", "generated_description_only", "embeddable_text_and_generated_description"
158
146
  ],
159
147
  ) -> None:
160
- """
161
- Build context for all datasources
148
+ """Build context for all datasources.
162
149
 
163
- The output of the build command will be saved in a "run" folder in the output directory.
150
+ The output of the build command will be saved in the output directory.
164
151
 
165
152
  Internally, this indexes the context to be used by the MCP server and the "retrieve" command.
166
153
  """
@@ -177,12 +164,6 @@ def build(
177
164
  nargs=-1,
178
165
  required=True,
179
166
  )
180
- @click.option(
181
- "-r",
182
- "--run-name",
183
- type=click.STRING,
184
- help="Build run to use (the run folder name). Defaults to the latest run in the project.",
185
- )
186
167
  @click.option(
187
168
  "-l",
188
169
  "--limit",
@@ -191,42 +172,48 @@ def build(
191
172
  )
192
173
  @click.option(
193
174
  "-o",
194
- "--output-format",
195
- type=click.Choice(["file", "streamed"]),
196
- default="file",
197
- help="The output format [file (default), streamed]",
175
+ "--output-file",
176
+ type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
177
+ help="If provided, the results are written as YAML into the file given as a path, rather than printed to the console.",
178
+ )
179
+ @click.option(
180
+ "-d",
181
+ "--datasource-id",
182
+ "datasource_ids_as_str",
183
+ type=click.STRING,
184
+ multiple=True,
185
+ help="A datasourceId to restrict the search to. If not provided, search across all datasources. This option can be provided multiple times",
198
186
  )
199
187
  @click.pass_context
200
188
  def retrieve(
201
189
  ctx: Context,
202
190
  retrieve_text: tuple[str, ...],
203
- run_name: str | None,
204
191
  limit: int | None,
205
- output_format: Literal["file", "streamed"],
192
+ output_file: str | None,
193
+ datasource_ids_as_str: tuple[str, ...] | None,
206
194
  ) -> None:
207
- """
208
- Search the project's built context for the most relevant chunks.
209
- """
195
+ """Search the project's built context for the most relevant chunks."""
210
196
  text = " ".join(retrieve_text)
211
197
 
198
+ datasource_ids = (
199
+ [DatasourceId.from_string_repr(datasource_id) for datasource_id in datasource_ids_as_str]
200
+ if datasource_ids_as_str is not None
201
+ else None
202
+ )
203
+
212
204
  databao_engine = DatabaoContextEngine(project_dir=ctx.obj["project_dir"])
213
205
 
214
- retrieve_results = databao_engine.search_context(
215
- retrieve_text=text, run_name=run_name, limit=limit, export_to_file=output_format == "file"
216
- )
206
+ retrieve_results = databao_engine.search_context(retrieve_text=text, limit=limit, datasource_ids=datasource_ids)
217
207
 
218
- if output_format == "streamed":
219
- display_texts = [context_search_result.context_result for context_search_result in retrieve_results]
208
+ display_texts = [context_search_result.context_result for context_search_result in retrieve_results]
209
+ if output_file is not None:
210
+ Path(output_file).expanduser().write_text(f"---{os.linesep}".join(display_texts))
211
+ click.echo(f"Found {len(retrieve_results)} results, written to {output_file}")
212
+ else:
220
213
  click.echo("\n".join(display_texts))
221
214
 
222
215
 
223
216
  @dce.command()
224
- @click.option(
225
- "-r",
226
- "--run-name",
227
- type=click.STRING,
228
- help="Name of the build run you want to use (aka. the name of the run folder in your project's output). Defaults to the latest one in the project.",
229
- )
230
217
  @click.option(
231
218
  "-H",
232
219
  "--host",
@@ -247,10 +234,8 @@ def retrieve(
247
234
  help="Transport to use. Defaults to stdio",
248
235
  )
249
236
  @click.pass_context
250
- def mcp(ctx: Context, run_name: str | None, host: str | None, port: int | None, transport: McpTransport) -> None:
251
- """
252
- Run Databao Context Engine's MCP server
253
- """
237
+ def mcp(ctx: Context, host: str | None, port: int | None, transport: McpTransport) -> None:
238
+ """Run Databao Context Engine's MCP server."""
254
239
  if transport == "stdio":
255
240
  configure_logging(verbose=False, quiet=True, project_dir=ctx.obj["project_dir"])
256
- run_mcp_server(project_dir=ctx.obj["project_dir"], run_name=run_name, transport=transport, host=host, port=port)
241
+ run_mcp_server(project_dir=ctx.obj["project_dir"], transport=transport, host=host, port=port)
@@ -15,6 +15,7 @@ def _generate_info_string(command_info: DceInfo) -> str:
15
15
  info_lines = []
16
16
  info_lines.append(f"Databao context engine version: {command_info.version}")
17
17
  info_lines.append(f"Databao context engine storage dir: {command_info.dce_path}")
18
+ info_lines.append(f"Databao context engine plugins: {command_info.plugin_ids}")
18
19
 
19
20
  info_lines.append("")
20
21
 
@@ -23,10 +24,10 @@ def _generate_info_string(command_info: DceInfo) -> str:
23
24
 
24
25
  info_lines.append("")
25
26
 
26
- if command_info.project_info.is_initialised:
27
+ if command_info.project_info.is_initialized:
27
28
  info_lines.append(f"Project dir: {command_info.project_info.project_path.resolve()}")
28
29
  info_lines.append(f"Project ID: {str(command_info.project_info.project_id)}")
29
30
  else:
30
- info_lines.append(f"Project not initialised at {command_info.project_info.project_path.resolve()}")
31
+ info_lines.append(f"Project not initialized at {command_info.project_info.project_path.resolve()}")
31
32
 
32
33
  return os.linesep.join(info_lines)
@@ -0,0 +1,127 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from databao_context_engine.datasources.datasource_context import (
7
+ DatasourceContext,
8
+ get_all_contexts,
9
+ get_context_header_for_datasource,
10
+ get_datasource_context,
11
+ get_introspected_datasource_list,
12
+ )
13
+ from databao_context_engine.datasources.types import Datasource, DatasourceId
14
+ from databao_context_engine.pluginlib.build_plugin import DatasourceType
15
+ from databao_context_engine.project.layout import ProjectLayout, ensure_project_dir
16
+ from databao_context_engine.retrieve_embeddings import retrieve_embeddings
17
+
18
+
19
+ @dataclass
20
+ class ContextSearchResult:
21
+ """The result of a search in the project's contexts.
22
+
23
+ Attributes:
24
+ datasource_id: The ID of the datasource that generated the result.
25
+ datasource_type: The type of the datasource that generated the result.
26
+ distance: The distance between the search text and the result.
27
+ context_result: The actual content of the result that was found as a YAML string.
28
+ This content will be a subpart of the full context of the datasource.
29
+ In some cases, its content won't contain the exact same attributes as what can be
30
+ found directly in the full context.
31
+ """
32
+
33
+ datasource_id: DatasourceId
34
+ datasource_type: DatasourceType
35
+ distance: float
36
+ context_result: str
37
+
38
+
39
+ class DatabaoContextEngine:
40
+ """Engine for reading and using the contexts generated in a Databao Context Project.
41
+
42
+ The Databao Context Project should already have datasources configured and built (see DatabaoContextProjectManager), so that they can be used in the Engine.
43
+
44
+ Attributes:
45
+ project_dir: The root directory of the Databao Context Project.
46
+ """
47
+
48
+ project_dir: Path
49
+ _project_layout: ProjectLayout
50
+
51
+ def __init__(self, project_dir: Path) -> None:
52
+ """Initialize the DatabaoContextEngine.
53
+
54
+ Args:
55
+ project_dir: The root directory of the Databao Context Project.
56
+ There must be a valid DatabaoContextProject in this directory.
57
+ """
58
+ self._project_layout = ensure_project_dir(project_dir=project_dir)
59
+ self.project_dir = project_dir
60
+
61
+ def get_introspected_datasource_list(self) -> list[Datasource]:
62
+ """Return the list of datasources for which a context is available.
63
+
64
+ Returns:
65
+ A list of the datasources for which a context is available.
66
+ """
67
+ return get_introspected_datasource_list(self._project_layout)
68
+
69
+ def get_datasource_context(self, datasource_id: DatasourceId) -> DatasourceContext:
70
+ """Return the context available for a given datasource.
71
+
72
+ Args:
73
+ datasource_id: The ID of the datasource.
74
+
75
+ Returns:
76
+ The context for this datasource.
77
+ """
78
+ return get_datasource_context(project_layout=self._project_layout, datasource_id=datasource_id)
79
+
80
+ def get_all_contexts(self) -> list[DatasourceContext]:
81
+ """Return all contexts generated in the project.
82
+
83
+ Returns:
84
+ A list of all contexts generated in the project.
85
+ """
86
+ return get_all_contexts(project_layout=self._project_layout)
87
+
88
+ def get_all_contexts_formatted(self) -> str:
89
+ all_contexts = self.get_all_contexts()
90
+
91
+ return os.linesep.join(
92
+ [f"{get_context_header_for_datasource(context.datasource_id)}{context.context}" for context in all_contexts]
93
+ )
94
+
95
+ def search_context(
96
+ self,
97
+ retrieve_text: str,
98
+ limit: int | None = None,
99
+ datasource_ids: list[DatasourceId] | None = None,
100
+ ) -> list[ContextSearchResult]:
101
+ """Search in the available context for the closest matches to the given text.
102
+
103
+ Args:
104
+ retrieve_text: The text to search for in the contexts.
105
+ limit: The maximum number of results to return. If None is provided, a default limit of 10 will be used.
106
+ datasource_ids: If provided, the search results will only come from the datasources with these IDs.
107
+
108
+ Returns:
109
+ A list of the results found for the search, sorted by distance.
110
+ """
111
+ results = retrieve_embeddings(
112
+ project_layout=self._project_layout, retrieve_text=retrieve_text, limit=limit, datasource_ids=datasource_ids
113
+ )
114
+
115
+ return [
116
+ ContextSearchResult(
117
+ datasource_id=result.datasource_id,
118
+ datasource_type=result.datasource_type,
119
+ distance=result.cosine_distance,
120
+ context_result=result.display_text,
121
+ )
122
+ for result in results
123
+ ]
124
+
125
+ def run_sql(self, datasource_id: DatasourceId, sql: str, params: list[str]) -> dict[str, Any]:
126
+ """Not implemented yet. This will allow running a SQL query against a datasource (if the datasource supports it)."""
127
+ raise NotImplementedError("Running SQL is not supported yet")
@@ -2,45 +2,108 @@ from dataclasses import dataclass
2
2
  from pathlib import Path
3
3
  from typing import Any, overload
4
4
 
5
- from databao_context_engine.build_sources.public.api import BuildContextResult, build_all_datasources
6
- from databao_context_engine.datasource_config.add_config import (
7
- create_datasource_config_file,
8
- get_datasource_id_for_config_file,
9
- )
10
- from databao_context_engine.datasource_config.check_config import (
5
+ from databao_context_engine.build_sources import BuildContextResult, build_all_datasources
6
+ from databao_context_engine.databao_context_engine import DatabaoContextEngine
7
+ from databao_context_engine.datasources.check_config import (
11
8
  CheckDatasourceConnectionResult,
9
+ )
10
+ from databao_context_engine.datasources.check_config import (
12
11
  check_datasource_connection as check_datasource_connection_internal,
13
12
  )
13
+ from databao_context_engine.datasources.datasource_discovery import get_datasource_list
14
+ from databao_context_engine.datasources.types import Datasource, DatasourceId
14
15
  from databao_context_engine.pluginlib.build_plugin import DatasourceType
15
- from databao_context_engine.project.datasource_discovery import DatasourceId
16
- from databao_context_engine.project.layout import ensure_datasource_config_file_doesnt_exist, ensure_project_dir
16
+ from databao_context_engine.project.layout import (
17
+ ProjectLayout,
18
+ ensure_project_dir,
19
+ )
20
+ from databao_context_engine.project.layout import (
21
+ create_datasource_config_file as create_datasource_config_file_internal,
22
+ )
23
+ from databao_context_engine.serialization.yaml import to_yaml_string
17
24
  from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
18
25
 
19
26
 
20
27
  @dataclass
21
28
  class DatasourceConfigFile:
29
+ """A datasource config file that was created by the DatabaoContextProjectManager.
30
+
31
+ Attributes:
32
+ datasource_id: The unique identifier for the datasource.
33
+ config_file_path: The path to the datasource configuration file.
34
+ """
35
+
22
36
  datasource_id: DatasourceId
23
37
  config_file_path: Path
24
38
 
25
39
 
26
40
  class DatabaoContextProjectManager:
41
+ """Project Manager for Databao Context Projects.
42
+
43
+ This project manager is responsible for configuring and building a Databao Context Project.
44
+ The project_dir should already have been initialized before a Project manager can be used.
45
+
46
+ Attributes:
47
+ project_dir: The root directory of the Databao Context Project.
48
+ """
49
+
27
50
  project_dir: Path
51
+ _project_layout: ProjectLayout
28
52
 
29
53
  def __init__(self, project_dir: Path) -> None:
30
- ensure_project_dir(project_dir=project_dir)
54
+ """Initialize the DatabaoContextProjectManager.
55
+
56
+ Args:
57
+ project_dir: The root directory of the Databao Context Project.
58
+ """
59
+ self._project_layout = ensure_project_dir(project_dir=project_dir)
31
60
  self.project_dir = project_dir
32
61
 
62
+ def get_configured_datasource_list(self) -> list[Datasource]:
63
+ """Return the list of datasources configured in the project.
64
+
65
+ This method returns all datasources configured in the src folder of the project,
66
+ no matter whether the datasource configuration is valid or not.
67
+
68
+ Returns:
69
+ The list of datasources configured in the project.
70
+ """
71
+ return get_datasource_list(self._project_layout)
72
+
33
73
  def build_context(
34
- self, datasource_ids: list[DatasourceId] | None, chunk_embedding_mode: ChunkEmbeddingMode
74
+ self,
75
+ datasource_ids: list[DatasourceId] | None = None,
76
+ chunk_embedding_mode: ChunkEmbeddingMode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY,
35
77
  ) -> list[BuildContextResult]:
78
+ """Build the context for datasources in the project.
79
+
80
+ Any datasource with an invalid configuration will be skipped.
81
+
82
+ Args:
83
+ datasource_ids: The list of datasource ids to build. If None, all datasources will be built.
84
+ chunk_embedding_mode: The mode to use for chunk embedding.
85
+
86
+ Returns:
87
+ The list of all built results.
88
+ """
36
89
  # TODO: Filter which datasources to build by datasource_ids
37
- return build_all_datasources(project_dir=self.project_dir, chunk_embedding_mode=chunk_embedding_mode)
90
+ return build_all_datasources(project_layout=self._project_layout, chunk_embedding_mode=chunk_embedding_mode)
38
91
 
39
92
  def check_datasource_connection(
40
- self, datasource_ids: list[DatasourceId] | None
93
+ self, datasource_ids: list[DatasourceId] | None = None
41
94
  ) -> list[CheckDatasourceConnectionResult]:
95
+ """Check the connection for datasources in the project.
96
+
97
+ Args:
98
+ datasource_ids: The list of datasource ids to check. If None, all datasources will be checked.
99
+
100
+ Returns:
101
+ The list of all connection check results, sorted by datasource id.
102
+ """
42
103
  return sorted(
43
- check_datasource_connection_internal(project_dir=self.project_dir, datasource_ids=datasource_ids).values(),
104
+ check_datasource_connection_internal(
105
+ project_layout=self._project_layout, datasource_ids=datasource_ids
106
+ ).values(),
44
107
  key=lambda result: str(result.datasource_id),
45
108
  )
46
109
 
@@ -51,9 +114,21 @@ class DatabaoContextProjectManager:
51
114
  config_content: dict[str, Any],
52
115
  overwrite_existing: bool = False,
53
116
  ) -> DatasourceConfigFile:
117
+ """Create a new datasource configuration file in the project.
118
+
119
+ Args:
120
+ datasource_type: The type of the datasource to create.
121
+ datasource_name: The name of the datasource to create.
122
+ config_content: The content of the datasource configuration. This is a dictionary that will be written as-is in a yaml file.
123
+ The actual content of the configuration is not checked and might not be valid for the requested type.
124
+ overwrite_existing: Whether to overwrite an existing datasource configuration file if it already exists.
125
+
126
+ Returns:
127
+ A DatasourceConfigFile holding the datasource id and the path to the created datasource configuration file.
128
+ """
54
129
  # TODO: Before creating the datasource, validate the config content based on which plugin will be used
55
- config_file_path = create_datasource_config_file(
56
- project_dir=self.project_dir,
130
+ config_file = _create_datasource_config_file(
131
+ project_layout=self._project_layout,
57
132
  datasource_type=datasource_type,
58
133
  datasource_name=datasource_name,
59
134
  config_content=config_content,
@@ -61,32 +136,74 @@ class DatabaoContextProjectManager:
61
136
  )
62
137
 
63
138
  return DatasourceConfigFile(
64
- datasource_id=DatasourceId.from_datasource_config_file_path(config_file_path),
65
- config_file_path=config_file_path,
139
+ datasource_id=DatasourceId.from_datasource_config_file_path(self._project_layout, config_file),
140
+ config_file_path=config_file,
66
141
  )
67
142
 
68
143
  @overload
69
- def datasource_config_exists(self, *, datasource_type: DatasourceType, datasource_name: str) -> bool: ...
144
+ def datasource_config_exists(self, *, datasource_name: str) -> DatasourceId | None: ...
70
145
  @overload
71
- def datasource_config_exists(self, *, datasource_id: DatasourceId) -> bool: ...
146
+ def datasource_config_exists(self, *, datasource_id: DatasourceId) -> DatasourceId | None: ...
72
147
 
73
148
  def datasource_config_exists(
74
149
  self,
75
150
  *,
76
- datasource_type: DatasourceType | None = None,
77
151
  datasource_name: str | None = None,
78
152
  datasource_id: DatasourceId | None = None,
79
- ) -> bool:
80
- if datasource_type is not None and datasource_name is not None:
81
- datasource_id = get_datasource_id_for_config_file(datasource_type, datasource_name)
82
- elif datasource_id is None:
153
+ ) -> DatasourceId | None:
154
+ """Check if a datasource configuration file already exists in the project.
155
+
156
+ Args:
157
+ datasource_name: The name of the datasource.
158
+ datasource_id: The id of the datasource. Only used when datasource_name is not provided.
159
+
160
+ Returns:
161
+ datasource_id if there is already a datasource configuration file for this datasource, None otherwise.
162
+
163
+ Raises:
164
+ ValueError: If the wrong set of arguments is provided.
165
+ """
166
+ if datasource_name is not None:
167
+ datasource_id = DatasourceId.from_string_repr(f"{datasource_name}.yaml")
168
+ config_file = self._project_layout.src_dir / datasource_id.relative_path_to_config_file()
169
+ if config_file.is_file():
170
+ return datasource_id
171
+ return None
172
+
173
+ if datasource_id is None:
83
174
  raise ValueError("Either datasource_id or both datasource_type and datasource_name must be provided")
84
175
 
85
176
  try:
86
- ensure_datasource_config_file_doesnt_exist(
87
- project_dir=self.project_dir,
88
- datasource_id=datasource_id,
89
- )
90
- return False
177
+ config_file = self._project_layout.src_dir.joinpath(datasource_id.relative_path_to_config_file())
178
+ if config_file.is_file():
179
+ raise ValueError(f"A config file already exists for {str(datasource_id)}")
180
+
181
+ return datasource_id
91
182
  except ValueError:
92
- return True
183
+ return None
184
+
185
+ def get_engine_for_project(self) -> DatabaoContextEngine:
186
+ """Instantiate a DatabaoContextEngine for the project.
187
+
188
+ Returns:
189
+ A DatabaoContextEngine instance for the project.
190
+ """
191
+ return DatabaoContextEngine(project_dir=self.project_dir)
192
+
193
+
194
+ def _create_datasource_config_file(
195
+ project_layout: ProjectLayout,
196
+ datasource_type: DatasourceType,
197
+ datasource_name: str,
198
+ config_content: dict[str, Any],
199
+ overwrite_existing: bool,
200
+ ) -> Path:
201
+ last_datasource_name = datasource_name.split("/")[-1]
202
+ basic_config = {"type": datasource_type.full_type, "name": last_datasource_name}
203
+
204
+ return create_datasource_config_file_internal(
205
+ project_layout,
206
+ f"{datasource_name}.yaml",
207
+ to_yaml_string(basic_config | config_content),
208
+ overwrite_existing=overwrite_existing,
209
+ )