databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from click import Context
|
|
7
|
+
|
|
8
|
+
from databao_context_engine import (
|
|
9
|
+
ChunkEmbeddingMode,
|
|
10
|
+
DatabaoContextEngine,
|
|
11
|
+
DatabaoContextProjectManager,
|
|
12
|
+
DatasourceId,
|
|
13
|
+
InitErrorReason,
|
|
14
|
+
InitProjectError,
|
|
15
|
+
init_dce_project,
|
|
16
|
+
)
|
|
17
|
+
from databao_context_engine.cli.datasources import add_datasource_config_cli, check_datasource_connection_cli
|
|
18
|
+
from databao_context_engine.cli.info import echo_info
|
|
19
|
+
from databao_context_engine.config.logging import configure_logging
|
|
20
|
+
from databao_context_engine.llm.install import resolve_ollama_bin
|
|
21
|
+
from databao_context_engine.mcp.mcp_runner import McpTransport, run_mcp_server
|
|
22
|
+
from databao_context_engine.storage.migrate import migrate
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@click.group()
@click.option("-v", "--verbose", is_flag=True, help="Enable debug logging")
@click.option("-q", "--quiet", is_flag=True, help="Disable all console logging")
@click.option(
    "-d",
    "--project-dir",
    type=click.STRING,
    help="Location of your Databao Context Engine project",
)
@click.pass_context
def dce(ctx: Context, verbose: bool, quiet: bool, project_dir: str | None) -> None:
    # Root command group. Validates the global flags, resolves the project
    # directory, configures logging, stores the shared settings on the click
    # context for sub-commands and finally runs the storage migrations.
    # (Kept as comments rather than a docstring so the --help output is unchanged.)
    if verbose and quiet:
        click.echo("Arguments --quiet and --verbose can not be used together", err=True)
        # sys.exit rather than the builtin exit(): the latter is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    # Default to the current working directory; otherwise expand a leading "~".
    project_path = Path.cwd() if project_dir is None else Path(project_dir).expanduser()

    configure_logging(verbose=verbose, quiet=quiet, project_dir=project_path)

    # Sub-commands read these values back through ctx.obj.
    ctx.ensure_object(dict)
    ctx.obj["verbose"] = verbose
    ctx.obj["quiet"] = quiet
    ctx.obj["project_dir"] = project_path

    migrate()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dce.command()
@click.pass_context
def info(ctx: Context) -> None:
    """
    Display system-wide information
    """
    # The project directory was stored on the context by the root group.
    project_dir = ctx.obj["project_dir"]
    echo_info(project_dir)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dce.command()
@click.pass_context
def init(ctx: Context) -> None:
    """
    Create an empty Databao Context Engine project
    """
    project_dir = ctx.obj["project_dir"]

    try:
        init_dce_project(project_dir=project_dir)
    except InitProjectError as e:
        # Only a missing project directory is recoverable here; every other
        # initialisation failure is propagated to the caller unchanged.
        if e.reason != InitErrorReason.PROJECT_DIR_DOESNT_EXIST:
            raise e

        should_create_dir = click.confirm(
            f"The directory {project_dir.resolve()} does not exist. Do you want to create it?",
            default=True,
        )
        if not should_create_dir:
            return

        # Create the directory (and any missing parents), then retry once.
        project_dir.mkdir(parents=True, exist_ok=False)
        init_dce_project(project_dir=project_dir)

    click.echo(f"Project initialized successfully at {project_dir.resolve()}")

    # A RuntimeError from the Ollama binary lookup is reported on stderr
    # but does not abort the command.
    try:
        resolve_ollama_bin()
    except RuntimeError as e:
        click.echo(str(e), err=True)

    wants_datasource = click.confirm("\nDo you want to configure a datasource now?")
    if wants_datasource:
        add_datasource_config_cli(project_dir)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dce.group()
def datasource() -> None:
    """
    Manage datasource configurations
    """
    # Pure command group: the sub-commands registered on it do the work.
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@datasource.command(name="add")
@click.pass_context
def add_datasource_config(ctx: Context) -> None:
    """
    Add a new datasource configuration.

    The command will ask all relevant information for that datasource and save it in your Databao Context Engine project.
    """
    # Delegate to the interactive flow, using the project directory stored by the root group.
    project_dir = ctx.obj["project_dir"]
    add_datasource_config_cli(project_dir)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@datasource.command(name="check")
@click.argument(
    "datasources-config-files",
    type=click.STRING,
    nargs=-1,
)
@click.pass_context
def check_datasource_config(ctx: Context, datasources_config_files: tuple[str, ...] | None) -> None:
    """
    Check whether a datasource configuration is valid.

    The configuration is considered as valid if a connection with the datasource can be established.

    By default, all datasources declared in the project will be checked.
    You can explicitly list which datasources to validate by using the [DATASOURCES_CONFIG_FILES] argument. Each argument must be the path to the file within the src folder (e.g: my-folder/my-config.yaml)
    """

    # click passes an empty tuple (never None) for a variadic argument that
    # received no values, so test for emptiness: "nothing listed" must be
    # forwarded as None, not as an empty list of datasource ids.
    datasource_ids = (
        [DatasourceId.from_string_repr(datasource_config_file) for datasource_config_file in datasources_config_files]
        if datasources_config_files
        else None
    )

    check_datasource_connection_cli(ctx.obj["project_dir"], datasource_ids=datasource_ids)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dce.command()
@click.option(
    "-m",
    "--chunk-embedding-mode",
    type=click.Choice(
        ["embeddable_text_only", "generated_description_only", "embeddable_text_and_generated_description"]
    ),
    default="embeddable_text_only",
    help="Choose how chunks will be embedded. If a mode with the generated_description is selected, a local LLM model will be downloaded and used.",
)
@click.pass_context
def build(
    ctx: Context,
    chunk_embedding_mode: Literal[
        "embeddable_text_only", "generated_description_only", "embeddable_text_and_generated_description"
    ],
) -> None:
    """
    Build context for all datasources

    The output of the build command will be saved in a "run" folder in the output directory.

    Internally, this indexes the context to be used by the MCP server and the "retrieve" command.
    """
    project_manager = DatabaoContextProjectManager(project_dir=ctx.obj["project_dir"])

    # The CLI choice is lower-case; the enum is looked up by its upper-cased value.
    embedding_mode = ChunkEmbeddingMode(chunk_embedding_mode.upper())

    # datasource_ids=None: the CLI does not restrict the set of datasources.
    build_results = project_manager.build_context(datasource_ids=None, chunk_embedding_mode=embedding_mode)

    click.echo(f"Build complete. Processed {len(build_results)} datasources.")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@dce.command()
@click.argument(
    "retrieve-text",
    nargs=-1,
    required=True,
)
@click.option(
    "-r",
    "--run-name",
    type=click.STRING,
    help="Build run to use (the run folder name). Defaults to the latest run in the project.",
)
@click.option(
    "-l",
    "--limit",
    type=click.INT,
    help="Maximum number of chunk matches to return.",
)
@click.option(
    "-o",
    "--output-format",
    type=click.Choice(["file", "streamed"]),
    default="file",
    help="The output format [file (default), streamed]",
)
@click.pass_context
def retrieve(
    ctx: Context,
    retrieve_text: tuple[str, ...],
    run_name: str | None,
    limit: int | None,
    output_format: Literal["file", "streamed"],
) -> None:
    """
    Search the project's built context for the most relevant chunks.
    """
    # The query may be given as several shell words; rebuild one string from them.
    query = " ".join(retrieve_text)

    engine = DatabaoContextEngine(project_dir=ctx.obj["project_dir"])

    export_to_file = output_format == "file"
    matches = engine.search_context(
        retrieve_text=query, run_name=run_name, limit=limit, export_to_file=export_to_file
    )

    # Only the "streamed" format prints the matches to the console.
    if output_format != "streamed":
        return

    click.echo("\n".join(match.context_result for match in matches))
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@dce.command()
@click.option(
    "-r",
    "--run-name",
    type=click.STRING,
    help="Name of the build run you want to use (aka. the name of the run folder in your project's output). Defaults to the latest one in the project.",
)
@click.option(
    "-H",
    "--host",
    type=click.STRING,
    help="Host to bind to. Defaults to 127.0.0.1",
)
@click.option(
    "-p",
    "--port",
    type=click.INT,
    help="Port to bind to. Defaults to 8000",
)
@click.option(
    "-t",
    "--transport",
    type=click.Choice(["stdio", "streamable-http"]),
    default="stdio",
    help="Transport to use. Defaults to stdio",
)
@click.pass_context
def mcp(ctx: Context, run_name: str | None, host: str | None, port: int | None, transport: McpTransport) -> None:
    """
    Run Databao Context Engine's MCP server
    """
    project_dir = ctx.obj["project_dir"]

    # With the stdio transport, logging is reconfigured in quiet mode (console
    # handler removed) - presumably so log lines do not mix with the protocol
    # traffic on stdout; confirm against the MCP runner.
    if transport == "stdio":
        configure_logging(verbose=False, quiet=True, project_dir=project_dir)

    run_mcp_server(project_dir=project_dir, run_name=run_name, transport=transport, host=host, port=port)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from databao_context_engine import (
|
|
7
|
+
CheckDatasourceConnectionResult,
|
|
8
|
+
DatabaoContextProjectManager,
|
|
9
|
+
DatasourceConnectionStatus,
|
|
10
|
+
DatasourceId,
|
|
11
|
+
)
|
|
12
|
+
from databao_context_engine.cli.add_datasource_config import add_datasource_config_interactive
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def add_datasource_config_cli(project_dir: Path) -> None:
    """Interactively create a datasource config, then offer to check its connection.

    Args:
        project_dir: Directory of the Databao Context Engine project.
    """
    new_datasource_id = add_datasource_config_interactive(project_dir)

    wants_check = click.confirm("\nDo you want to check the connection to this new datasource?")
    if not wants_check:
        return

    # Restrict the check to the datasource that was just created.
    check_datasource_connection_cli(project_dir, datasource_ids=[new_datasource_id])
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def check_datasource_connection_cli(project_dir: Path, *, datasource_ids: list[DatasourceId] | None) -> None:
    """Check datasource connections for a project and print the outcome.

    Args:
        project_dir: Directory of the Databao Context Engine project.
        datasource_ids: Datasources to check, forwarded unchanged to the
            project manager (``None`` is forwarded as-is).
    """
    project_manager = DatabaoContextProjectManager(project_dir=project_dir)
    check_results = project_manager.check_datasource_connection(datasource_ids=datasource_ids)

    _print_check_datasource_connection_results(check_results)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _print_check_datasource_connection_results(results: list[CheckDatasourceConnectionResult]) -> None:
    """Print the connection-check results to the console.

    For every datasource whose status is INVALID the full error message is
    printed first, followed by one summary block: the number of valid
    datasources, the number of invalid-or-unknown ones, and one summary line
    per datasource.

    Args:
        results: Connection-check results to report. When empty, only
            "No datasource found" is printed.
    """
    if not results:
        click.echo("No datasource found")
        return

    invalid_datasources = [
        result for result in results if result.connection_status == DatasourceConnectionStatus.INVALID
    ]
    valid_count = sum(1 for result in results if result.connection_status == DatasourceConnectionStatus.VALID)
    unknown_count = sum(1 for result in results if result.connection_status == DatasourceConnectionStatus.UNKNOWN)

    # Print all errors
    for check_result in invalid_datasources:
        click.echo(
            f"Error for datasource {check_result.datasource_id!s}:{os.linesep}{check_result.full_message}{os.linesep}"
        )

    # `results` is known to be non-empty here, so the summary always has at least one line.
    results_summary = os.linesep.join(
        f"{check_result.datasource_id!s}: {check_result.format(show_summary_only=True)}" for check_result in results
    )

    click.echo(
        f"Validation completed with {valid_count} valid datasource(s) and {len(invalid_datasources) + unknown_count} invalid (or unknown status) datasource(s)"
        f"{os.linesep}{results_summary}"
    )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from databao_context_engine import DceInfo, get_databao_context_engine_info
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def echo_info(project_dir: Path) -> None:
    """Print the engine, system and project information for ``project_dir``."""
    dce_info = get_databao_context_engine_info(project_dir=project_dir)
    click.echo(_generate_info_string(dce_info))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _generate_info_string(command_info: DceInfo) -> str:
    """Format the engine, OS and project information as a multi-line string.

    Args:
        command_info: Information to display; its ``version``, ``dce_path``
            and ``project_info`` attributes are read.

    Returns:
        The report lines joined with ``os.linesep``: engine version and
        storage directory, a blank line, OS name and architecture, a blank
        line, then the project details (or a "not initialised" notice).
    """
    # Local import: platform.machine() also works where os.uname is missing
    # (e.g. Windows), unlike the os.uname() lookup it replaces.
    import platform

    project_info = command_info.project_info
    resolved_project_path = project_info.project_path.resolve()

    if project_info.is_initialised:
        project_lines = [
            f"Project dir: {resolved_project_path}",
            f"Project ID: {project_info.project_id!s}",
        ]
    else:
        project_lines = [f"Project not initialised at {resolved_project_path}"]

    info_lines = [
        f"Databao context engine version: {command_info.version}",
        f"Databao context engine storage dir: {command_info.dce_path}",
        "",
        f"OS name: {sys.platform}",
        # platform.machine() returns an empty string when the value cannot be determined.
        f"OS architecture: {platform.machine() or 'unknown'}",
        "",
        *project_lines,
    ]

    return os.linesep.join(info_lines)
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Base logging configuration, loaded with yaml.safe_load and passed to
# logging.config.dictConfig by configure_logging().
version: 1
formatters:
  # Formatter that includes the logger name.
  main:
    format: '%(asctime)s %(levelname)s %(name)s: %(message)s'
  # Shorter formatter (no logger name) used by the console handler.
  console:
    format: '%(asctime)s %(levelname)s: %(message)s'
handlers:
  # Console output goes to stdout.
  console:
    class: logging.StreamHandler
    formatter: console
    stream: ext://sys.stdout
loggers:
  # Package logger; does not propagate to the root logger.
  databao_context_engine:
    level: INFO
    propagate: false
    handlers: [ console ]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from logging.config import dictConfig
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from databao_context_engine.project.layout import get_logs_dir, is_project_dir_valid
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def configure_logging(verbose: bool, quiet: bool, project_dir: Path) -> None:
    """Configure logging for the ``databao_context_engine`` logger.

    The base configuration is read from ``log_config.yaml`` next to this
    module and then adjusted before being applied with ``dictConfig``.

    Args:
        verbose: When true, the package logger level is set to DEBUG.
        quiet: When true, the console handler is removed from the package logger.
        project_dir: Project directory; when it is a valid project, a log-file
            handler writing into the project's logs directory is added.
    """
    config_path = Path(__file__).parent.joinpath("log_config.yaml")
    with config_path.open(mode="r") as config_stream:
        settings = yaml.safe_load(config_stream)

    package_logger = settings["loggers"]["databao_context_engine"]

    # File logging is only available once the project directory is valid.
    if is_project_dir_valid(project_dir):
        logs_dir = get_logs_dir(project_dir)
        logs_dir.mkdir(exist_ok=True)

        handler_key = "logFile"
        settings["handlers"][handler_key] = _get_logging_file_handler(logs_dir)
        package_logger["handlers"].append(handler_key)

    if quiet:
        package_logger["handlers"].remove("console")
    if verbose:
        package_logger["level"] = "DEBUG"

    dictConfig(settings)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_logging_file_handler(logs_dir_path: Path) -> dict[str, Any]:
    """Build the ``dictConfig`` entry for the rotating log-file handler.

    Args:
        logs_dir_path: Directory in which the log file is placed.

    Returns:
        A handler definition for ``logging.handlers.RotatingFileHandler``
        using the "main" formatter.
    """
    log_file_path = logs_dir_path.joinpath(_get_current_log_filename())

    handler_config: dict[str, Any] = dict(
        filename=str(log_file_path),
        formatter="main",
        maxBytes=100000000,  # 100MB
        backupCount=12,
    )
    # "class" is a Python keyword, so it cannot be passed as a keyword argument above.
    handler_config["class"] = "logging.handlers.RotatingFileHandler"
    return handler_config
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _get_current_log_filename() -> str:
    """Return the log file name for the current local year and month.

    The name only contains the year and the month, so a new log file
    is started every month.
    """
    now = datetime.now()
    return f"log-{now:%Y-%m}.txt"
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, overload
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.build_sources.public.api import BuildContextResult, build_all_datasources
|
|
6
|
+
from databao_context_engine.datasource_config.add_config import (
|
|
7
|
+
create_datasource_config_file,
|
|
8
|
+
get_datasource_id_for_config_file,
|
|
9
|
+
)
|
|
10
|
+
from databao_context_engine.datasource_config.check_config import (
|
|
11
|
+
CheckDatasourceConnectionResult,
|
|
12
|
+
check_datasource_connection as check_datasource_connection_internal,
|
|
13
|
+
)
|
|
14
|
+
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
15
|
+
from databao_context_engine.project.datasource_discovery import DatasourceId
|
|
16
|
+
from databao_context_engine.project.layout import ensure_datasource_config_file_doesnt_exist, ensure_project_dir
|
|
17
|
+
from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class DatasourceConfigFile:
    """A datasource configuration file together with the id of its datasource."""

    # Identifier of the datasource the config file belongs to.
    datasource_id: DatasourceId
    # Path of the configuration file.
    config_file_path: Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DatabaoContextProjectManager:
    """Entry point for managing a project: building context and handling datasource configs."""

    project_dir: Path

    def __init__(self, project_dir: Path) -> None:
        """Store ``project_dir`` after running ``ensure_project_dir`` on it."""
        ensure_project_dir(project_dir=project_dir)
        self.project_dir = project_dir

    def build_context(
        self, datasource_ids: list[DatasourceId] | None, chunk_embedding_mode: ChunkEmbeddingMode
    ) -> list[BuildContextResult]:
        """Build the context of the project's datasources.

        ``datasource_ids`` is currently ignored: all datasources are built.
        """
        # TODO: Filter which datasources to build by datasource_ids
        return build_all_datasources(chunk_embedding_mode=chunk_embedding_mode, project_dir=self.project_dir)

    def check_datasource_connection(
        self, datasource_ids: list[DatasourceId] | None
    ) -> list[CheckDatasourceConnectionResult]:
        """Check datasource connections and return the results sorted by datasource id string."""
        results_by_id = check_datasource_connection_internal(
            project_dir=self.project_dir, datasource_ids=datasource_ids
        )

        ordered_results = list(results_by_id.values())
        ordered_results.sort(key=lambda result: str(result.datasource_id))
        return ordered_results

    def create_datasource_config(
        self,
        datasource_type: DatasourceType,
        datasource_name: str,
        config_content: dict[str, Any],
        overwrite_existing: bool = False,
    ) -> DatasourceConfigFile:
        """Create a datasource config file and return its id and path."""
        # TODO: Before creating the datasource, validate the config content based on which plugin will be used
        created_path = create_datasource_config_file(
            project_dir=self.project_dir,
            datasource_type=datasource_type,
            datasource_name=datasource_name,
            config_content=config_content,
            overwrite_existing=overwrite_existing,
        )

        created_id = DatasourceId.from_datasource_config_file_path(created_path)
        return DatasourceConfigFile(datasource_id=created_id, config_file_path=created_path)

    @overload
    def datasource_config_exists(self, *, datasource_type: DatasourceType, datasource_name: str) -> bool: ...
    @overload
    def datasource_config_exists(self, *, datasource_id: DatasourceId) -> bool: ...

    def datasource_config_exists(
        self,
        *,
        datasource_type: DatasourceType | None = None,
        datasource_name: str | None = None,
        datasource_id: DatasourceId | None = None,
    ) -> bool:
        """Tell whether a config file already exists for a datasource.

        The datasource is identified either by ``datasource_id`` or by both
        ``datasource_type`` and ``datasource_name``; when type and name are
        both given they take precedence over ``datasource_id``.

        Raises:
            ValueError: If neither form of identification is complete.
        """
        if datasource_type is None or datasource_name is None:
            if datasource_id is None:
                raise ValueError("Either datasource_id or both datasource_type and datasource_name must be provided")
        else:
            datasource_id = get_datasource_id_for_config_file(datasource_type, datasource_name)

        # A ValueError from the "doesn't exist" check is taken to mean the file exists.
        try:
            ensure_datasource_config_file_doesnt_exist(
                project_dir=self.project_dir,
                datasource_id=datasource_id,
            )
        except ValueError:
            return True
        else:
            return False
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.datasource_config.datasource_context import (
|
|
7
|
+
DatasourceContext,
|
|
8
|
+
get_all_contexts,
|
|
9
|
+
get_context_header_for_datasource,
|
|
10
|
+
get_datasource_context,
|
|
11
|
+
)
|
|
12
|
+
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
13
|
+
from databao_context_engine.project.datasource_discovery import get_datasource_list
|
|
14
|
+
from databao_context_engine.project.layout import ensure_project_dir
|
|
15
|
+
from databao_context_engine.project.types import Datasource, DatasourceId
|
|
16
|
+
from databao_context_engine.retrieve_embeddings.public.api import retrieve_embeddings
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ContextSearchResult:
    """One match returned by ``DatabaoContextEngine.search_context``."""

    # Identifier of the datasource the match comes from.
    datasource_id: DatasourceId
    # Type of that datasource.
    datasource_type: DatasourceType
    # Filled from the retrieval result's cosine_distance.
    distance: float
    # Filled from the retrieval result's display_text.
    context_result: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DatabaoContextEngine:
    """Read-side entry point of a project: list datasources, read contexts and search them."""

    project_dir: Path

    def __init__(self, project_dir: Path) -> None:
        """Keep ``project_dir`` and the layout returned by ``ensure_project_dir``."""
        self.project_layout = ensure_project_dir(project_dir=project_dir)
        self.project_dir = project_dir

    def get_datasource_list(self) -> list[Datasource]:
        """Return the datasources found for the project directory."""
        # TODO: Should this return the list of built datasources rather than the list of datasources within the src folder?
        return get_datasource_list(self.project_dir)

    def get_datasource_context(self, datasource_id: DatasourceId, run_name: str | None = None) -> DatasourceContext:
        """Return the context of one datasource, optionally for a given run."""
        return get_datasource_context(
            run_name=run_name, datasource_id=datasource_id, project_layout=self.project_layout
        )

    def get_all_contexts(self, run_name: str | None = None) -> list[DatasourceContext]:
        """Return the contexts of all datasources, optionally for a given run."""
        return get_all_contexts(run_name=run_name, project_layout=self.project_layout)

    def get_all_contexts_formatted(self, run_name: str | None = None) -> str:
        """Return all contexts as one string, each one prefixed with its datasource header."""
        formatted_contexts = []
        for datasource_context in self.get_all_contexts(run_name=run_name):
            header = get_context_header_for_datasource(datasource_context.datasource_id)
            formatted_contexts.append(f"{header}{datasource_context.context}")

        return os.linesep.join(formatted_contexts)

    def search_context(
        self,
        retrieve_text: str,
        run_name: str | None,
        limit: int | None,
        export_to_file: bool,
        datasource_ids: list[DatasourceId] | None = None,
    ) -> list[ContextSearchResult]:
        """Search the built context for ``retrieve_text``.

        ``datasource_ids`` is currently ignored (see the TODO below).
        """
        # TODO: Filter with datasource_ids
        # TODO: Remove the need for a run_name

        raw_results = retrieve_embeddings(
            project_layout=self.project_layout,
            retrieve_text=retrieve_text,
            run_name=run_name,
            limit=limit,
            export_to_file=export_to_file,
        )

        search_results = []
        for raw_result in raw_results:
            search_results.append(
                ContextSearchResult(
                    datasource_id=raw_result.datasource_id,
                    datasource_type=raw_result.datasource_type,
                    distance=raw_result.cosine_distance,
                    context_result=raw_result.display_text,
                )
            )
        return search_results

    def run_sql(self, datasource_id: DatasourceId, sql: str, params: list[str]) -> dict[str, Any]:
        """Not implemented yet: always raises ``NotImplementedError``."""
        raise NotImplementedError("Running SQL is not supported yet")
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from databao_context_engine.introspection.property_extract import get_property_list_from_type
|
|
5
|
+
from databao_context_engine.pluginlib.build_plugin import BuildDatasourcePlugin, DatasourceType
|
|
6
|
+
from databao_context_engine.pluginlib.config import ConfigPropertyDefinition, CustomiseConfigProperties
|
|
7
|
+
from databao_context_engine.plugins.plugin_loader import get_plugin_for_type
|
|
8
|
+
from databao_context_engine.project.layout import (
|
|
9
|
+
create_datasource_config_file as create_datasource_config_file_internal,
|
|
10
|
+
)
|
|
11
|
+
from databao_context_engine.project.types import DatasourceId
|
|
12
|
+
from databao_context_engine.serialisation.yaml import to_yaml_string
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_config_file_structure_for_datasource_type(datasource_type: DatasourceType) -> list[ConfigPropertyDefinition]:
    """Return the config-file property definitions for a datasource type.

    A plugin implementing ``CustomiseConfigProperties`` provides its own
    definitions; for any other ``BuildDatasourcePlugin`` they are derived
    from the plugin's ``config_file_type``.

    Raises:
        ValueError: If the plugin found for the type is neither of those.
    """
    plugin = get_plugin_for_type(datasource_type)

    # CustomiseConfigProperties is tested first, so it wins when a plugin matches both checks.
    if isinstance(plugin, CustomiseConfigProperties):
        return plugin.get_config_file_properties()

    if isinstance(plugin, BuildDatasourcePlugin):
        return get_property_list_from_type(plugin.config_file_type)

    raise ValueError(
        f"Impossible to create a config for type {datasource_type.full_type}. The plugin for this type is not a BuildDatasourcePlugin or CustomiseConfigProperties"
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_datasource_config_file(
    project_dir: Path,
    datasource_type: DatasourceType,
    datasource_name: str,
    config_content: dict[str, Any],
    overwrite_existing: bool,
) -> Path:
    """Write a datasource config file as YAML and return its path.

    The file content is ``type`` (the subtype of ``datasource_type``) and
    ``name`` merged with ``config_content``.
    """
    header_fields = {"type": datasource_type.subtype, "name": datasource_name}
    target_id = get_datasource_id_for_config_file(datasource_type, datasource_name)

    # Right-hand operand wins: a "type" or "name" key in config_content overrides the header.
    yaml_content = to_yaml_string(header_fields | config_content)

    return create_datasource_config_file_internal(
        project_dir,
        target_id,
        yaml_content,
        overwrite_existing=overwrite_existing,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_datasource_id_for_config_file(datasource_type: DatasourceType, datasource_name: str) -> DatasourceId:
    """Build the id of the ``.yaml`` config file for a datasource type and name."""
    config_folder = datasource_type.config_folder
    return DatasourceId(
        config_file_suffix=".yaml",
        datasource_name=datasource_name,
        datasource_config_folder=config_folder,
    )
|