databao-context-engine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,256 @@
1
+ import sys
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ import click
6
+ from click import Context
7
+
8
+ from databao_context_engine import (
9
+ ChunkEmbeddingMode,
10
+ DatabaoContextEngine,
11
+ DatabaoContextProjectManager,
12
+ DatasourceId,
13
+ InitErrorReason,
14
+ InitProjectError,
15
+ init_dce_project,
16
+ )
17
+ from databao_context_engine.cli.datasources import add_datasource_config_cli, check_datasource_connection_cli
18
+ from databao_context_engine.cli.info import echo_info
19
+ from databao_context_engine.config.logging import configure_logging
20
+ from databao_context_engine.llm.install import resolve_ollama_bin
21
+ from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
22
+ from databao_context_engine.storage.migrate import migrate
23
+
24
+
25
@click.group()
@click.option("-v", "--verbose", is_flag=True, help="Enable debug logging")
@click.option("-q", "--quiet", is_flag=True, help="Disable all console logging")
@click.option(
    "-d",
    "--project-dir",
    type=click.STRING,
    help="Location of your Databao Context Engine project",
)
@click.pass_context
def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> None:
    # Root command group. Deliberately documented with comments rather than a
    # docstring: click would render a docstring as the group's help text.
    #
    # Resolves the project directory, configures logging, stores the shared
    # options on ctx.obj for the sub-commands and then calls migrate().
    if verbose and quiet:
        print("Arguments --quiet and --verbose can not be used together", file=sys.stderr)
        # sys.exit instead of the bare exit() helper: exit() is injected by the
        # `site` module for interactive use and is not guaranteed to exist
        # (e.g. when running with `python -S` or from a frozen executable).
        sys.exit(1)

    # Fall back to the current working directory when no --project-dir is given.
    if project_dir is None:
        project_path = Path.cwd()
    else:
        project_path = Path(project_dir).expanduser()

    configure_logging(verbose=verbose, quiet=quiet, project_dir=project_path)

    # Make the global options available to every sub-command via ctx.obj.
    ctx.ensure_object(dict)
    ctx.obj["verbose"] = verbose
    ctx.obj["quiet"] = quiet
    ctx.obj["project_dir"] = project_path

    # Runs on every invocation of the group callback, before any sub-command.
    migrate()
53
+
54
+
55
@dce.command()
@click.pass_context
def info(ctx: Context) -> None:
    """
    Display system-wide information
    """
    # The project directory was stored on the context by the `dce` group callback.
    project_dir = ctx.obj["project_dir"]
    echo_info(project_dir)
63
+
64
+
65
@dce.command()
@click.pass_context
def init(ctx: Context) -> None:
    """
    Create an empty Databao Context Engine project
    """
    project_dir = ctx.obj["project_dir"]
    try:
        init_dce_project(project_dir=project_dir)
    except InitProjectError as e:
        # Only a missing project directory can be recovered from interactively;
        # any other initialisation failure is propagated unchanged.
        if e.reason != InitErrorReason.PROJECT_DIR_DOESNT_EXIST:
            raise e

        create_missing_dir = click.confirm(
            f"The directory {project_dir.resolve()} does not exist. Do you want to create it?",
            default=True,
        )
        if not create_missing_dir:
            return

        # Create the directory, then retry the initialisation once.
        project_dir.mkdir(parents=True, exist_ok=False)
        init_dce_project(project_dir=project_dir)

    click.echo(f"Project initialized successfully at {project_dir.resolve()}")

    # A RuntimeError from the Ollama binary lookup is only reported on stderr;
    # it does not abort the command.
    try:
        resolve_ollama_bin()
    except RuntimeError as e:
        click.echo(str(e), err=True)

    wants_datasource = click.confirm("\nDo you want to configure a datasource now?")
    if wants_datasource:
        add_datasource_config_cli(project_dir)
96
+
97
+
98
@dce.group()
def datasource() -> None:
    """
    Manage datasource configurations
    """
    # Pure command group: the sub-commands registered on it do all the work.
104
+
105
+
106
@datasource.command(name="add")
@click.pass_context
def add_datasource_config(ctx: Context) -> None:
    """
    Add a new datasource configuration.

    The command will ask all relevant information for that datasource and save it in your Databao Context Engine project.
    """
    # The project directory was stored on the context by the `dce` group callback.
    project_dir = ctx.obj["project_dir"]
    add_datasource_config_cli(project_dir)
115
+
116
+
117
@datasource.command(name="check")
@click.argument(
    "datasources-config-files",
    type=click.STRING,
    nargs=-1,
)
@click.pass_context
def check_datasource_config(ctx: Context, datasources_config_files: tuple[str, ...] | None) -> None:
    """
    Check whether a datasource configuration is valid.

    The configuration is considered as valid if a connection with the datasource can be established.

    By default, all datasources declared in the project will be checked.
    You can explicitely list which datasources to validate by using the [DATASOURCES_CONFIG_FILES] argument. Each argument must be the path to the file within the src folder (e.g: my-folder/my-config.yaml)
    """
    # click passes an *empty tuple* (never None) when a variadic (nargs=-1)
    # argument is omitted, so an `is not None` test would always be true and
    # would request a check of an empty list of datasources. Testing for
    # emptiness makes the documented "check everything" default (None) apply.
    if datasources_config_files:
        datasource_ids = [
            DatasourceId.from_string_repr(datasource_config_file)
            for datasource_config_file in datasources_config_files
        ]
    else:
        datasource_ids = None

    check_datasource_connection_cli(ctx.obj["project_dir"], datasource_ids=datasource_ids)
141
+
142
+
143
@dce.command()
@click.option(
    "-m",
    "--chunk-embedding-mode",
    type=click.Choice(
        ["embeddable_text_only", "generated_description_only", "embeddable_text_and_generated_description"]
    ),
    default="embeddable_text_only",
    help="Choose how chunks will be embedded. If a mode with the generated_description is selected, a local LLM model will be downloaded and used.",
)
@click.pass_context
def build(
    ctx: Context,
    chunk_embedding_mode: Literal[
        "embeddable_text_only", "generated_description_only", "embeddable_text_and_generated_description"
    ],
) -> None:
    """
    Build context for all datasources

    The output of the build command will be saved in a "run" folder in the output directory.

    Internally, this indexes the context to be used by the MCP server and the "retrieve" command.
    """
    project_manager = DatabaoContextProjectManager(project_dir=ctx.obj["project_dir"])
    # The CLI choice is lower-case; the enum is looked up by its upper-cased value.
    embedding_mode = ChunkEmbeddingMode(chunk_embedding_mode.upper())

    # datasource_ids=None: the CLI always asks for every datasource to be built.
    result = project_manager.build_context(datasource_ids=None, chunk_embedding_mode=embedding_mode)

    click.echo(f"Build complete. Processed {len(result)} datasources.")
172
+
173
+
174
@dce.command()
@click.argument(
    "retrieve-text",
    nargs=-1,
    required=True,
)
@click.option(
    "-r",
    "--run-name",
    type=click.STRING,
    help="Build run to use (the run folder name). Defaults to the latest run in the project.",
)
@click.option(
    "-l",
    "--limit",
    type=click.INT,
    help="Maximum number of chunk matches to return.",
)
@click.option(
    "-o",
    "--output-format",
    type=click.Choice(["file", "streamed"]),
    default="file",
    help="The output format [file (default), streamed]",
)
@click.pass_context
def retrieve(
    ctx: Context,
    retrieve_text: tuple[str, ...],
    run_name: str | None,
    limit: int | None,
    output_format: Literal["file", "streamed"],
) -> None:
    """
    Search the project's built context for the most relevant chunks.
    """
    # The query may be given as several shell words; join them back into one string.
    query = " ".join(retrieve_text)
    export_to_file = output_format == "file"

    engine = DatabaoContextEngine(project_dir=ctx.obj["project_dir"])
    matches = engine.search_context(
        retrieve_text=query, run_name=run_name, limit=limit, export_to_file=export_to_file
    )

    # Nothing is echoed for the "file" format.
    if output_format != "streamed":
        return

    click.echo("\n".join(match.context_result for match in matches))
221
+
222
+
223
@dce.command()
@click.option(
    "-r",
    "--run-name",
    type=click.STRING,
    help="Name of the build run you want to use (aka. the name of the run folder in your project's output). Defaults to the latest one in the project.",
)
@click.option(
    "-H",
    "--host",
    type=click.STRING,
    help="Host to bind to. Defaults to 127.0.0.1",
)
@click.option(
    "-p",
    "--port",
    type=click.INT,
    help="Port to bind to. Defaults to 8000",
)
@click.option(
    "-t",
    "--transport",
    type=click.Choice(["stdio", "streamable-http"]),
    default="stdio",
    help="Transport to use. Defaults to stdio",
)
@click.pass_context
def mcp(ctx: Context, run_name: str | None, host: str | None, port: int | None, transport: McpTransport) -> None:
    """
    Run Databao Context Engine's MCP server
    """
    project_dir = ctx.obj["project_dir"]

    # For the stdio transport, logging is reconfigured in quiet mode (no console
    # handler), overriding any -v/-q given on the command line.
    # NOTE(review): presumably because stdout carries the protocol messages — confirm.
    if transport == "stdio":
        configure_logging(verbose=False, quiet=True, project_dir=project_dir)

    run_mcp_server(project_dir=project_dir, run_name=run_name, transport=transport, host=host, port=port)
@@ -0,0 +1,64 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import click
5
+
6
+ from databao_context_engine import (
7
+ CheckDatasourceConnectionResult,
8
+ DatabaoContextProjectManager,
9
+ DatasourceConnectionStatus,
10
+ DatasourceId,
11
+ )
12
+ from databao_context_engine.cli.add_datasource_config import add_datasource_config_interactive
13
+
14
+
15
def add_datasource_config_cli(project_dir: Path) -> None:
    """Interactively add a datasource config, then offer to check its connection."""
    new_datasource_id = add_datasource_config_interactive(project_dir)

    wants_check = click.confirm("\nDo you want to check the connection to this new datasource?")
    if not wants_check:
        return

    check_datasource_connection_cli(project_dir, datasource_ids=[new_datasource_id])
20
+
21
+
22
def check_datasource_connection_cli(project_dir: Path, *, datasource_ids: list[DatasourceId] | None) -> None:
    """Run the connection check for the given datasources and print the outcome.

    ``datasource_ids`` is forwarded unchanged to
    ``DatabaoContextProjectManager.check_datasource_connection``.
    """
    project_manager = DatabaoContextProjectManager(project_dir=project_dir)
    check_results = project_manager.check_datasource_connection(datasource_ids=datasource_ids)

    _print_check_datasource_connection_results(check_results)
28
+
29
+
30
def _print_check_datasource_connection_results(results: list[CheckDatasourceConnectionResult]) -> None:
    """Print the detailed errors followed by a summary of all connection checks.

    Prints "No datasource found" when ``results`` is empty. Otherwise prints the
    full message of every INVALID result, then a count of valid versus
    invalid-or-unknown datasources and one summary line per result.
    """
    if not results:
        click.echo("No datasource found")
        return

    invalid_datasources = [
        result for result in results if result.connection_status == DatasourceConnectionStatus.INVALID
    ]
    valid_count = sum(1 for result in results if result.connection_status == DatasourceConnectionStatus.VALID)
    unknown_count = sum(1 for result in results if result.connection_status == DatasourceConnectionStatus.UNKNOWN)

    # Print all errors
    for check_result in invalid_datasources:
        click.echo(
            f"Error for datasource {str(check_result.datasource_id)}:{os.linesep}{check_result.full_message}{os.linesep}"
        )

    # `results` is known to be non-empty here, so no "No datasource found"
    # fallback is needed for the summary.
    results_summary = os.linesep.join(
        f"{str(check_result.datasource_id)}: {check_result.format(show_summary_only=True)}"
        for check_result in results
    )

    click.echo(
        f"Validation completed with {valid_count} valid datasource(s) and {len(invalid_datasources) + unknown_count} invalid (or unknown status) datasource(s)"
        f"{os.linesep}{results_summary}"
    )
@@ -0,0 +1,32 @@
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import click
6
+
7
+ from databao_context_engine import DceInfo, get_databao_context_engine_info
8
+
9
+
10
def echo_info(project_dir: Path) -> None:
    """Print the engine, system and project information for ``project_dir``."""
    dce_info = get_databao_context_engine_info(project_dir=project_dir)
    click.echo(_generate_info_string(dce_info))
12
+
13
+
14
def _generate_info_string(command_info: DceInfo) -> str:
    """Format ``command_info`` as a multi-line, human-readable report.

    The report has three sections separated by blank lines: engine version and
    storage directory, OS name and architecture, and the project status.
    Lines are joined with ``os.linesep``.
    """
    # Local import so this block brings its own dependency into scope.
    import platform

    # platform.machine() works on every platform (os.uname() does not exist on
    # Windows) and returns an empty string when the value cannot be determined.
    architecture = platform.machine() or "unknown"

    project_info = command_info.project_info
    resolved_project_path = project_info.project_path.resolve()
    if project_info.is_initialised:
        project_lines = [
            f"Project dir: {resolved_project_path}",
            f"Project ID: {str(project_info.project_id)}",
        ]
    else:
        project_lines = [f"Project not initialised at {resolved_project_path}"]

    info_lines = [
        f"Databao context engine version: {command_info.version}",
        f"Databao context engine storage dir: {command_info.dce_path}",
        "",
        f"OS name: {sys.platform}",
        f"OS architecture: {architecture}",
        "",
        *project_lines,
    ]

    return os.linesep.join(info_lines)
File without changes
@@ -0,0 +1,16 @@
1
+ version: 1
2
+ formatters:
3
+ main:
4
+ format: '%(asctime)s %(levelname)s %(name)s: %(message)s'
5
+ console:
6
+ format: '%(asctime)s %(levelname)s: %(message)s'
7
+ handlers:
8
+ console:
9
+ class: logging.StreamHandler
10
+ formatter: console
11
+ stream: ext://sys.stdout
12
+ loggers:
13
+ databao_context_engine:
14
+ level: INFO
15
+ propagate: false
16
+ handlers: [ console ]
@@ -0,0 +1,43 @@
1
+ from datetime import datetime
2
+ from logging.config import dictConfig
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import yaml
7
+
8
+ from databao_context_engine.project.layout import get_logs_dir, is_project_dir_valid
9
+
10
+
11
def configure_logging(verbose: bool, quiet: bool, project_dir: Path) -> None:
    """Configure logging from the bundled ``log_config.yaml``.

    When ``project_dir`` is a valid project, a file handler writing into the
    project's logs directory is added. ``quiet`` removes the console handler
    and ``verbose`` lowers the ``databao_context_engine`` logger to DEBUG.
    """
    config_path = Path(__file__).parent.joinpath("log_config.yaml")
    with config_path.open(mode="r") as log_config_file:
        log_config = yaml.safe_load(log_config_file)

    if is_project_dir_valid(project_dir):
        logs_dir_path = get_logs_dir(project_dir)
        logs_dir_path.mkdir(exist_ok=True)

        # Register the file handler and attach it to the package logger.
        file_handler_name = "logFile"
        log_config["handlers"][file_handler_name] = _get_logging_file_handler(logs_dir_path)
        log_config["loggers"]["databao_context_engine"]["handlers"].append(file_handler_name)

    dce_logger_config = log_config["loggers"]["databao_context_engine"]
    if quiet:
        dce_logger_config["handlers"].remove("console")
    if verbose:
        dce_logger_config["level"] = "DEBUG"

    dictConfig(log_config)
29
+
30
+
31
def _get_logging_file_handler(logs_dir_path: Path) -> dict[str, Any]:
    """Return the ``dictConfig`` handler entry for the log file in ``logs_dir_path``."""
    log_file_path = logs_dir_path.joinpath(_get_current_log_filename())
    max_log_file_bytes = 100000000  # 100MB

    return {
        "class": "logging.handlers.RotatingFileHandler",
        "filename": str(log_file_path),
        "formatter": "main",
        "maxBytes": max_log_file_bytes,
        "backupCount": 12,
    }
39
+
40
+
41
def _get_current_log_filename() -> str:
    """Return the log file name for the current month (``log-YYYY-MM.txt``)."""
    # The name only contains year and month, so it changes once a month.
    now = datetime.now()
    return f"log-{now:%Y-%m}.txt"
@@ -0,0 +1,92 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Any, overload
4
+
5
+ from databao_context_engine.build_sources.public.api import BuildContextResult, build_all_datasources
6
+ from databao_context_engine.datasource_config.add_config import (
7
+ create_datasource_config_file,
8
+ get_datasource_id_for_config_file,
9
+ )
10
+ from databao_context_engine.datasource_config.check_config import (
11
+ CheckDatasourceConnectionResult,
12
+ check_datasource_connection as check_datasource_connection_internal,
13
+ )
14
+ from databao_context_engine.pluginlib.build_plugin import DatasourceType
15
+ from databao_context_engine.project.datasource_discovery import DatasourceId
16
+ from databao_context_engine.project.layout import ensure_datasource_config_file_doesnt_exist, ensure_project_dir
17
+ from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
18
+
19
+
20
@dataclass
class DatasourceConfigFile:
    """A datasource configuration file, identified by its datasource id and path."""

    # Identifier of the datasource the file configures.
    datasource_id: DatasourceId
    # Path of the configuration file.
    config_file_path: Path
24
+
25
+
26
class DatabaoContextProjectManager:
    """Entry point for managing a project: build, connection checks and config files."""

    project_dir: Path

    def __init__(self, project_dir: Path) -> None:
        """Validate ``project_dir`` with ``ensure_project_dir`` and remember it."""
        ensure_project_dir(project_dir=project_dir)
        self.project_dir = project_dir

    def build_context(
        self, datasource_ids: list[DatasourceId] | None, chunk_embedding_mode: ChunkEmbeddingMode
    ) -> list[BuildContextResult]:
        """Build the context of the project's datasources.

        ``datasource_ids`` is currently ignored: every datasource is built.
        """
        # TODO: Filter which datasources to build by datasource_ids
        build_results = build_all_datasources(
            project_dir=self.project_dir, chunk_embedding_mode=chunk_embedding_mode
        )
        return build_results

    def check_datasource_connection(
        self, datasource_ids: list[DatasourceId] | None
    ) -> list[CheckDatasourceConnectionResult]:
        """Return the connection check results, sorted by datasource id string."""
        results_by_datasource = check_datasource_connection_internal(
            project_dir=self.project_dir, datasource_ids=datasource_ids
        )
        check_results = list(results_by_datasource.values())
        check_results.sort(key=lambda result: str(result.datasource_id))
        return check_results

    def create_datasource_config(
        self,
        datasource_type: DatasourceType,
        datasource_name: str,
        config_content: dict[str, Any],
        overwrite_existing: bool = False,
    ) -> DatasourceConfigFile:
        """Write a datasource config file and return its id together with its path."""
        # TODO: Before creating the datasource, validate the config content based on which plugin will be used
        created_file_path = create_datasource_config_file(
            project_dir=self.project_dir,
            datasource_type=datasource_type,
            datasource_name=datasource_name,
            config_content=config_content,
            overwrite_existing=overwrite_existing,
        )
        created_datasource_id = DatasourceId.from_datasource_config_file_path(created_file_path)

        return DatasourceConfigFile(datasource_id=created_datasource_id, config_file_path=created_file_path)

    @overload
    def datasource_config_exists(self, *, datasource_type: DatasourceType, datasource_name: str) -> bool: ...
    @overload
    def datasource_config_exists(self, *, datasource_id: DatasourceId) -> bool: ...

    def datasource_config_exists(
        self,
        *,
        datasource_type: DatasourceType | None = None,
        datasource_name: str | None = None,
        datasource_id: DatasourceId | None = None,
    ) -> bool:
        """Tell whether a config file already exists for the given datasource.

        The datasource is designated either by ``datasource_id`` or by both
        ``datasource_type`` and ``datasource_name``; the type/name pair takes
        precedence when all three are given.

        Raises:
            ValueError: if neither form of identification is complete.
        """
        has_type_and_name = datasource_type is not None and datasource_name is not None
        if has_type_and_name:
            datasource_id = get_datasource_id_for_config_file(datasource_type, datasource_name)
        elif datasource_id is None:
            raise ValueError("Either datasource_id or both datasource_type and datasource_name must be provided")

        # The layout helper raises ValueError when the file exists; that is
        # translated into the boolean answer here.
        try:
            ensure_datasource_config_file_doesnt_exist(
                project_dir=self.project_dir,
                datasource_id=datasource_id,
            )
        except ValueError:
            return True
        return False
@@ -0,0 +1,85 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from databao_context_engine.datasource_config.datasource_context import (
7
+ DatasourceContext,
8
+ get_all_contexts,
9
+ get_context_header_for_datasource,
10
+ get_datasource_context,
11
+ )
12
+ from databao_context_engine.pluginlib.build_plugin import DatasourceType
13
+ from databao_context_engine.project.datasource_discovery import get_datasource_list
14
+ from databao_context_engine.project.layout import ensure_project_dir
15
+ from databao_context_engine.project.types import Datasource, DatasourceId
16
+ from databao_context_engine.retrieve_embeddings.public.api import retrieve_embeddings
17
+
18
+
19
@dataclass
class ContextSearchResult:
    """One match returned by ``DatabaoContextEngine.search_context``."""

    # Datasource the matching chunk belongs to.
    datasource_id: DatasourceId
    datasource_type: DatasourceType
    # Populated from the retrieval result's cosine distance.
    distance: float
    # Populated from the retrieval result's display text.
    context_result: str
25
+
26
+
27
class DatabaoContextEngine:
    """Read-side entry point: list datasources, read built contexts and search them."""

    project_dir: Path

    def __init__(self, project_dir: Path) -> None:
        """Validate ``project_dir`` and keep the layout returned by ``ensure_project_dir``."""
        self.project_layout = ensure_project_dir(project_dir=project_dir)
        self.project_dir = project_dir

    def get_datasource_list(self) -> list[Datasource]:
        """Return the project's datasources."""
        # TODO: Should this return the list of built datasources rather than the list of datasources within the src folder?
        # Inside the method body this name resolves to the module-level function.
        datasources = get_datasource_list(self.project_dir)
        return datasources

    def get_datasource_context(self, datasource_id: DatasourceId, run_name: str | None = None) -> DatasourceContext:
        """Return the context of one datasource for the given run."""
        return get_datasource_context(
            project_layout=self.project_layout,
            datasource_id=datasource_id,
            run_name=run_name,
        )

    def get_all_contexts(self, run_name: str | None = None) -> list[DatasourceContext]:
        """Return the contexts of the given run."""
        return get_all_contexts(project_layout=self.project_layout, run_name=run_name)

    def get_all_contexts_formatted(self, run_name: str | None = None) -> str:
        """Return every context prefixed by its datasource header, joined by ``os.linesep``."""
        formatted_contexts = []
        for datasource_context in self.get_all_contexts(run_name=run_name):
            header = get_context_header_for_datasource(datasource_context.datasource_id)
            formatted_contexts.append(f"{header}{datasource_context.context}")

        return os.linesep.join(formatted_contexts)

    def search_context(
        self,
        retrieve_text: str,
        run_name: str | None,
        limit: int | None,
        export_to_file: bool,
        datasource_ids: list[DatasourceId] | None = None,
    ) -> list[ContextSearchResult]:
        """Search the built context for ``retrieve_text``.

        ``datasource_ids`` is accepted but not used yet.
        """
        # TODO: Filter with datasource_ids
        # TODO: Remove the need for a run_name

        retrieved = retrieve_embeddings(
            project_layout=self.project_layout,
            retrieve_text=retrieve_text,
            run_name=run_name,
            limit=limit,
            export_to_file=export_to_file,
        )

        # Convert the internal retrieval results into the public result type.
        search_results = []
        for item in retrieved:
            search_results.append(
                ContextSearchResult(
                    datasource_id=item.datasource_id,
                    datasource_type=item.datasource_type,
                    distance=item.cosine_distance,
                    context_result=item.display_text,
                )
            )
        return search_results

    def run_sql(self, datasource_id: DatasourceId, sql: str, params: list[str]) -> dict[str, Any]:
        """Not implemented yet: always raises ``NotImplementedError``."""
        raise NotImplementedError("Running SQL is not supported yet")
File without changes
@@ -0,0 +1,50 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ from databao_context_engine.introspection.property_extract import get_property_list_from_type
5
+ from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, DatasourceType
6
+ from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
7
+ from databao_context_engine.plugins.plugin_loader import get_plugin_for_type
8
+ from databao_context_engine.project.layout import (
9
+ create_datasource_config_file as create_datasource_config_file_internal,
10
+ )
11
+ from databao_context_engine.project.types import DatasourceId
12
+ from databao_context_engine.serialisation.yaml import to_yaml_string
13
+
14
+
15
def get_config_file_structure_for_datasource_type(datasource_type: DatasourceType) -> list[ConfigPropertyDefinition]:
    """Return the config properties expected by the plugin handling ``datasource_type``.

    A plugin implementing ``CustomiseConfigProperties`` provides its own list;
    otherwise the list is derived from a ``BuildDatasourcePlugin``'s config file type.

    Raises:
        ValueError: if the plugin is neither of those two kinds.
    """
    plugin = get_plugin_for_type(datasource_type)

    # Checked first: a customised property list wins over the derived one.
    if isinstance(plugin, CustomiseConfigProperties):
        return plugin.get_config_file_properties()

    if isinstance(plugin, BuildDatasourcePlugin):
        return get_property_list_from_type(plugin.config_file_type)

    raise ValueError(
        f"Impossible to create a config for type {datasource_type.full_type}. The plugin for this type is not a BuildDatasourcePlugin or CustomiseConfigProperties"
    )
26
+
27
+
28
def create_datasource_config_file(
    project_dir: Path,
    datasource_type: DatasourceType,
    datasource_name: str,
    config_content: dict[str, Any],
    overwrite_existing: bool,
) -> Path:
    """Serialise a datasource config to YAML and write it into the project.

    The written content is ``type`` and ``name`` merged with ``config_content``;
    keys in ``config_content`` override those two defaults. Returns the value of
    the layout-level ``create_datasource_config_file`` helper.
    """
    default_entries = {"type": datasource_type.subtype, "name": datasource_name}
    datasource_id = get_datasource_id_for_config_file(datasource_type, datasource_name)
    yaml_content = to_yaml_string(default_entries | config_content)

    return create_datasource_config_file_internal(
        project_dir,
        datasource_id,
        yaml_content,
        overwrite_existing=overwrite_existing,
    )
43
+
44
+
45
def get_datasource_id_for_config_file(datasource_type: DatasourceType, datasource_name: str) -> DatasourceId:
    """Return the id of the ``.yaml`` config file for the given type and name."""
    config_folder = datasource_type.config_folder
    datasource_id = DatasourceId(
        datasource_config_folder=config_folder,
        datasource_name=datasource_name,
        config_file_suffix=".yaml",
    )
    return datasource_id