databao-context-engine 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +35 -0
- databao_context_engine/build_sources/__init__.py +0 -0
- databao_context_engine/build_sources/internal/__init__.py +0 -0
- databao_context_engine/build_sources/internal/build_runner.py +111 -0
- databao_context_engine/build_sources/internal/build_service.py +77 -0
- databao_context_engine/build_sources/internal/build_wiring.py +52 -0
- databao_context_engine/build_sources/internal/export_results.py +43 -0
- databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
- databao_context_engine/build_sources/public/__init__.py +0 -0
- databao_context_engine/build_sources/public/api.py +4 -0
- databao_context_engine/cli/__init__.py +0 -0
- databao_context_engine/cli/add_datasource_config.py +130 -0
- databao_context_engine/cli/commands.py +256 -0
- databao_context_engine/cli/datasources.py +64 -0
- databao_context_engine/cli/info.py +32 -0
- databao_context_engine/config/__init__.py +0 -0
- databao_context_engine/config/log_config.yaml +16 -0
- databao_context_engine/config/logging.py +43 -0
- databao_context_engine/databao_context_project_manager.py +92 -0
- databao_context_engine/databao_engine.py +85 -0
- databao_context_engine/datasource_config/__init__.py +0 -0
- databao_context_engine/datasource_config/add_config.py +50 -0
- databao_context_engine/datasource_config/check_config.py +131 -0
- databao_context_engine/datasource_config/datasource_context.py +60 -0
- databao_context_engine/event_journal/__init__.py +0 -0
- databao_context_engine/event_journal/writer.py +29 -0
- databao_context_engine/generate_configs_schemas.py +92 -0
- databao_context_engine/init_project.py +18 -0
- databao_context_engine/introspection/__init__.py +0 -0
- databao_context_engine/introspection/property_extract.py +202 -0
- databao_context_engine/llm/__init__.py +0 -0
- databao_context_engine/llm/config.py +20 -0
- databao_context_engine/llm/descriptions/__init__.py +0 -0
- databao_context_engine/llm/descriptions/ollama.py +21 -0
- databao_context_engine/llm/descriptions/provider.py +10 -0
- databao_context_engine/llm/embeddings/__init__.py +0 -0
- databao_context_engine/llm/embeddings/ollama.py +37 -0
- databao_context_engine/llm/embeddings/provider.py +13 -0
- databao_context_engine/llm/errors.py +16 -0
- databao_context_engine/llm/factory.py +61 -0
- databao_context_engine/llm/install.py +227 -0
- databao_context_engine/llm/runtime.py +73 -0
- databao_context_engine/llm/service.py +159 -0
- databao_context_engine/main.py +19 -0
- databao_context_engine/mcp/__init__.py +0 -0
- databao_context_engine/mcp/all_results_tool.py +5 -0
- databao_context_engine/mcp/mcp_runner.py +16 -0
- databao_context_engine/mcp/mcp_server.py +63 -0
- databao_context_engine/mcp/retrieve_tool.py +22 -0
- databao_context_engine/pluginlib/__init__.py +0 -0
- databao_context_engine/pluginlib/build_plugin.py +107 -0
- databao_context_engine/pluginlib/config.py +37 -0
- databao_context_engine/pluginlib/plugin_utils.py +68 -0
- databao_context_engine/plugins/__init__.py +0 -0
- databao_context_engine/plugins/athena_db_plugin.py +12 -0
- databao_context_engine/plugins/base_db_plugin.py +45 -0
- databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/__init__.py +0 -0
- databao_context_engine/plugins/databases/athena_introspector.py +101 -0
- databao_context_engine/plugins/databases/base_introspector.py +144 -0
- databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
- databao_context_engine/plugins/databases/database_chunker.py +69 -0
- databao_context_engine/plugins/databases/databases_types.py +114 -0
- databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
- databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
- databao_context_engine/plugins/databases/introspection_scope.py +74 -0
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
- databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
- databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
- databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
- databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
- databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/mssql_db_plugin.py +12 -0
- databao_context_engine/plugins/mysql_db_plugin.py +12 -0
- databao_context_engine/plugins/parquet_plugin.py +32 -0
- databao_context_engine/plugins/plugin_loader.py +110 -0
- databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
- databao_context_engine/plugins/resources/__init__.py +0 -0
- databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
- databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
- databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
- databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
- databao_context_engine/project/__init__.py +0 -0
- databao_context_engine/project/datasource_discovery.py +141 -0
- databao_context_engine/project/info.py +44 -0
- databao_context_engine/project/init_project.py +102 -0
- databao_context_engine/project/layout.py +127 -0
- databao_context_engine/project/project_config.py +32 -0
- databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
- databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
- databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
- databao_context_engine/project/runs.py +39 -0
- databao_context_engine/project/types.py +134 -0
- databao_context_engine/retrieve_embeddings/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
- databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
- databao_context_engine/retrieve_embeddings/public/api.py +3 -0
- databao_context_engine/serialisation/__init__.py +0 -0
- databao_context_engine/serialisation/yaml.py +35 -0
- databao_context_engine/services/__init__.py +0 -0
- databao_context_engine/services/chunk_embedding_service.py +104 -0
- databao_context_engine/services/embedding_shard_resolver.py +64 -0
- databao_context_engine/services/factories.py +88 -0
- databao_context_engine/services/models.py +12 -0
- databao_context_engine/services/persistence_service.py +61 -0
- databao_context_engine/services/run_name_policy.py +8 -0
- databao_context_engine/services/table_name_policy.py +15 -0
- databao_context_engine/storage/__init__.py +0 -0
- databao_context_engine/storage/connection.py +32 -0
- databao_context_engine/storage/exceptions/__init__.py +0 -0
- databao_context_engine/storage/exceptions/exceptions.py +6 -0
- databao_context_engine/storage/migrate.py +127 -0
- databao_context_engine/storage/migrations/V01__init.sql +63 -0
- databao_context_engine/storage/models.py +51 -0
- databao_context_engine/storage/repositories/__init__.py +0 -0
- databao_context_engine/storage/repositories/chunk_repository.py +130 -0
- databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
- databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
- databao_context_engine/storage/repositories/embedding_repository.py +113 -0
- databao_context_engine/storage/repositories/factories.py +35 -0
- databao_context_engine/storage/repositories/run_repository.py +157 -0
- databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
- databao_context_engine/storage/transaction.py +14 -0
- databao_context_engine/system/__init__.py +0 -0
- databao_context_engine/system/properties.py +13 -0
- databao_context_engine/templating/__init__.py +0 -0
- databao_context_engine/templating/renderer.py +29 -0
- databao_context_engine-0.1.1.dist-info/METADATA +186 -0
- databao_context_engine-0.1.1.dist-info/RECORD +135 -0
- databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
- databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from io import BufferedReader
|
|
3
|
+
from typing import Any, TypedDict
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.pluginlib.build_plugin import BuildFilePlugin, EmbeddableChunk
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FileChunk(TypedDict):
    """One chunk of a text file, as produced by InternalUnstructuredFilesPlugin._chunk_file."""

    # Index of the chunk's first word within the whole file's word list
    # (a word offset, not an ordinal chunk number).
    chunk_index: int
    # The chunk's words re-joined with single spaces.
    chunk_content: str
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class InternalUnstructuredFilesPlugin(BuildFilePlugin):
    """Build plugin that turns plain-text files (txt/md) into embeddable chunks.

    The file is split on whitespace into words, which are grouped into
    fixed-size windows that overlap, so text cut at a chunk boundary also
    appears at the start of the next chunk.
    """

    id = "jetbrains/unstructured_files"
    name = "Unstructured Files Plugin"

    _SUPPORTED_FILES_EXTENSIONS = {"txt", "md"}

    _DEFAULT_MAX_TOKENS = 300
    _DEFAULT_TOKENS_OVERLAP = 50

    def __init__(self, max_tokens: int | None = None, tokens_overlap: int | None = None):
        """
        :param max_tokens: maximum number of words per chunk (defaults to 300)
        :param tokens_overlap: number of words shared between consecutive chunks (defaults to 50)
        :raises ValueError: if the overlap is not smaller than the chunk size
        """
        self.max_tokens = max_tokens or self._DEFAULT_MAX_TOKENS
        self.tokens_overlap = tokens_overlap or self._DEFAULT_TOKENS_OVERLAP

        # Bug fix: with tokens_overlap >= max_tokens the start index in
        # _chunk_file would never advance, looping forever. Fail fast instead.
        if self.tokens_overlap >= self.max_tokens:
            raise ValueError(
                f"tokens_overlap ({self.tokens_overlap}) must be smaller than max_tokens ({self.max_tokens})"
            )

    def supported_types(self) -> set[str]:
        """Return the full datasource types handled by this plugin (e.g. "files/txt")."""
        return {f"files/{extension}" for extension in self._SUPPORTED_FILES_EXTENSIONS}

    def build_file_context(self, full_type: str, file_name: str, file_buffer: BufferedReader) -> Any:
        """Read the whole file as UTF-8 text and return its chunked representation."""
        file_content = self._read_file(file_buffer)

        return {
            "chunks": self._chunk_file(file_content),
        }

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        """Convert every file chunk produced by build_file_context into an EmbeddableChunk."""
        return [self._create_embeddable_chunk_from_file_chunk(file_chunk) for file_chunk in context["chunks"]]

    def _create_embeddable_chunk_from_file_chunk(self, file_chunk: FileChunk) -> EmbeddableChunk:
        # The chunk text is embedded; the full FileChunk dict is stored as content.
        return EmbeddableChunk(
            embeddable_text=file_chunk["chunk_content"],
            content=file_chunk,
        )

    def _read_file(self, file_buffer: BufferedReader) -> str:
        file_bytes = file_buffer.read()
        return file_bytes.decode("utf-8")

    def _chunk_file(self, file_content: str) -> list[FileChunk]:
        """Split *file_content* into overlapping windows of at most max_tokens words.

        chunk_index is the word offset of the chunk's first word, not an
        ordinal chunk number.
        """
        words_list = re.split(r"\s+", file_content)

        chunks = []

        chunk_start_index = 0
        number_of_words = len(words_list)
        while chunk_start_index < number_of_words:
            chunk_end_index = min(number_of_words, chunk_start_index + self.max_tokens)
            chunks.append(
                FileChunk(
                    chunk_index=chunk_start_index,
                    chunk_content=" ".join(words_list[chunk_start_index:chunk_end_index]),
                )
            )
            # Step the window back by the overlap, unless all words are consumed.
            chunk_start_index = (
                (chunk_end_index - self.tokens_overlap) if chunk_end_index < number_of_words else number_of_words
            )

        return chunks
|
|
File without changes
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
|
|
7
|
+
from databao_context_engine.pluginlib.build_plugin import DatasourceType
|
|
8
|
+
from databao_context_engine.project.layout import get_source_dir
|
|
9
|
+
from databao_context_engine.project.types import (
|
|
10
|
+
DatasourceDescriptor,
|
|
11
|
+
DatasourceKind,
|
|
12
|
+
PreparedConfig,
|
|
13
|
+
PreparedDatasource,
|
|
14
|
+
PreparedFile,
|
|
15
|
+
DatasourceId,
|
|
16
|
+
Datasource,
|
|
17
|
+
)
|
|
18
|
+
from databao_context_engine.templating.renderer import render_template
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_datasource_list(project_dir: Path) -> list[Datasource]:
    """Return every valid datasource discovered under the project's src/ directory.

    Sources that fail preparation are logged and skipped instead of aborting
    the whole listing.
    """
    datasources: list[Datasource] = []
    for descriptor in discover_datasources(project_dir=project_dir):
        try:
            prepared = prepare_source(descriptor)
        except Exception as e:
            logger.debug(str(e), exc_info=True, stack_info=True)
            logger.info(f"Invalid source at ({descriptor.path}): {str(e)}")
        else:
            datasources.append(
                Datasource(
                    id=DatasourceId.from_datasource_config_file_path(descriptor.path),
                    type=prepared.datasource_type,
                )
            )

    return datasources
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def discover_datasources(project_dir: Path) -> list[DatasourceDescriptor]:
    """
    Scan the project's src/ directory and return all discovered sources.

    Rules:
    - Each first-level directory under src/ is treated as a main_type
    - Unsupported or unreadable entries are skipped.
    - The returned list is sorted by directory and then filename
    """
    src = get_source_dir(project_dir)
    if not src.exists() or not src.is_dir():
        raise ValueError(f"src directory does not exist in {project_dir}")

    def by_name(entry: Path) -> str:
        # Case-insensitive ordering for stable results across filesystems.
        return entry.name.lower()

    found: list[DatasourceDescriptor] = []
    for type_dir in sorted((entry for entry in src.iterdir() if entry.is_dir()), key=by_name):
        candidates = (entry for entry in type_dir.iterdir() if _is_datasource_file(entry))
        for candidate in sorted(candidates, key=by_name):
            descriptor = load_datasource_descriptor(candidate)
            if descriptor is not None:
                found.append(descriptor)

    return found
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _is_datasource_file(p: Path) -> bool:
|
|
67
|
+
# ignore backup files
|
|
68
|
+
return p.is_file() and not p.suffix.endswith("~")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_datasource_descriptors(project_dir: Path, datasource_ids: list[DatasourceId]) -> list[DatasourceDescriptor]:
    """Resolve each datasource id to the descriptor of its config file under src/.

    :raises ValueError: if src/ is missing, or if any id has no config file on disk.
    """
    src = get_source_dir(project_dir)
    if not src.exists() or not src.is_dir():
        raise ValueError(f"src directory does not exist in {project_dir}")

    datasources: list[DatasourceDescriptor] = []
    for datasource_id in datasource_ids:
        config_file_path = src.joinpath(datasource_id.relative_path_to_config_file())
        if not config_file_path.is_file():
            raise ValueError(f"Datasource config file not found: {config_file_path}")

        # load_datasource_descriptor returns None for unsupported entries
        # (e.g. extension-less files); those are silently skipped here.
        datasource = load_datasource_descriptor(config_file_path)
        if datasource is not None:
            datasources.append(datasource)

    return datasources
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def load_datasource_descriptor(path: Path) -> DatasourceDescriptor | None:
    """
    Load a single file with src/<parent_name>/ into a DatasourceDescriptor
    """
    if not path.is_file():
        return None

    main_type = path.parent.name
    extension = path.suffix.lower().lstrip(".")
    resolved = path.resolve()

    # Anything under src/files/ is raw file content, whatever its extension.
    if main_type == "files":
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.FILE)

    # YAML elsewhere is datasource configuration.
    if extension in {"yaml", "yml"}:
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.CONFIG)

    # Any other file that at least has an extension is treated as raw content.
    if extension:
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.FILE)

    logger.debug("Skipping file without extension: %s", path)
    return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def prepare_source(datasource: DatasourceDescriptor) -> PreparedDatasource:
    """
    Convert a discovered datasource into a prepared datasource ready for plugin execution

    :raises ValueError: if a config datasource does not declare a string 'type' key.
    """
    if datasource.kind is DatasourceKind.FILE:
        # For raw files the subtype is simply the (lowercased) file extension.
        file_subtype = datasource.path.suffix.lower().lstrip(".")
        return PreparedFile(
            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=file_subtype),
            path=datasource.path,
        )

    else:
        config = _parse_config_file(datasource.path)

        subtype = config.get("type")
        if not subtype or not isinstance(subtype, str):
            # Bug fix: the original passed logging-style "%s" arguments to
            # ValueError, which stored a (template, path) tuple instead of an
            # interpolated message. Use an f-string so the path is readable.
            raise ValueError(f"Config missing 'type' at {datasource.path} - skipping")

        return PreparedConfig(
            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=subtype),
            path=datasource.path,
            config=config,
            datasource_name=datasource.path.stem,
        )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _parse_config_file(file_path: Path) -> dict[Any, Any]:
    """Render the file's content as a template, then parse the result as YAML.

    Returns an empty dict for an empty file (yaml.safe_load yields None there).
    """
    rendered_file = render_template(file_path.read_text())

    return yaml.safe_load(rendered_file) or {}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from importlib.metadata import version
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from databao_context_engine.project.layout import validate_project_dir
|
|
7
|
+
from databao_context_engine.system.properties import get_dce_path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(kw_only=True, frozen=True)
class DceProjectInfo:
    """Initialisation state of a single project directory."""

    # Directory that was inspected (not necessarily an initialised project).
    project_path: Path
    # True when the directory contains a valid project layout.
    is_initialised: bool
    # Project id read from the config file; None when not initialised.
    project_id: UUID | None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(kw_only=True, frozen=True)
class DceInfo:
    """Engine-level information together with the current project's state."""

    # Installed version of the databao_context_engine package.
    version: str
    # Filesystem location used by the engine (see get_dce_path).
    dce_path: Path

    project_info: DceProjectInfo
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_databao_context_engine_info(project_dir: Path) -> DceInfo:
    """Collect engine version, engine path, and project state into one DceInfo."""
    project_info = _get_project_info(project_dir)

    return DceInfo(
        version=get_dce_version(),
        dce_path=get_dce_path(),
        project_info=project_info,
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _get_project_info(project_dir: Path) -> DceProjectInfo:
    """Inspect *project_dir* and report whether it is an initialised project."""
    layout = validate_project_dir(project_dir)

    # An uninitialised directory has no layout and therefore no project id.
    if layout is None:
        return DceProjectInfo(project_path=project_dir, is_initialised=False, project_id=None)

    return DceProjectInfo(
        project_path=project_dir,
        is_initialised=True,
        project_id=layout.read_config_file().project_id,
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_dce_version() -> str:
    """Return the installed version string of the databao_context_engine package.

    Raises importlib.metadata.PackageNotFoundError when the package is not installed.
    """
    return version("databao_context_engine")
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.project.layout import (
|
|
6
|
+
get_config_file,
|
|
7
|
+
get_deprecated_config_file,
|
|
8
|
+
get_examples_dir,
|
|
9
|
+
get_logs_dir,
|
|
10
|
+
get_source_dir,
|
|
11
|
+
)
|
|
12
|
+
from databao_context_engine.project.project_config import ProjectConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class InitErrorReason(Enum):
    """Machine-readable reason codes carried by InitProjectError."""

    PROJECT_DIR_DOESNT_EXIST = "PROJECT_DIR_DOESNT_EXIST"
    PROJECT_DIR_NOT_DIRECTORY = "PROJECT_DIR_NOT_DIRECTORY"
    PROJECT_DIR_ALREADY_INITIALISED = "PROJECT_DIR_ALREADY_INITIALISED"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class InitProjectError(Exception):
    """Raised when a directory cannot be initialised as a project.

    The ``reason`` attribute carries a machine-readable error code; the
    exception message (possibly empty) is meant for humans.
    """

    reason: InitErrorReason

    def __init__(self, reason: InitErrorReason, message: str | None):
        super().__init__("" if message is None else message)
        self.reason = reason
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def init_project_dir(project_dir: Path) -> Path:
    """Initialise *project_dir* as a Databao Context Engine project and return it.

    Raises InitProjectError when the directory cannot be initialised.
    """
    _ProjectCreator(project_dir=project_dir).create()
    return project_dir
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class _ProjectCreator:
    """Creates the on-disk layout (src/, logs/, examples/, config file) of a new project."""

    def __init__(self, project_dir: Path):
        self.project_dir = project_dir
        self.deprecated_config_file = get_deprecated_config_file(project_dir)
        self.config_file = get_config_file(project_dir)
        self.src_dir = get_source_dir(project_dir)
        self.examples_dir = get_examples_dir(project_dir)
        self.logs_dir = get_logs_dir(project_dir)

    def create(self):
        """Validate the target directory, then create all project artifacts."""
        self.ensure_can_init_project()

        self.create_default_src_dir()
        self.create_logs_dir()
        self.create_examples_dir()
        self.create_dce_config_file()

    def ensure_can_init_project(self) -> bool:
        """Raise InitProjectError when the directory is missing, not a directory, or already initialised."""
        if not self.project_dir.exists():
            raise InitProjectError(
                message=f"{self.project_dir.resolve()} does not exist", reason=InitErrorReason.PROJECT_DIR_DOESNT_EXIST
            )

        if not self.project_dir.is_dir():
            raise InitProjectError(
                message=f"{self.project_dir.resolve()} is not a directory",
                reason=InitErrorReason.PROJECT_DIR_NOT_DIRECTORY,
            )

        if self.config_file.is_file() or self.deprecated_config_file.is_file():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains a config file. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        if self.src_dir.is_dir():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains a src directory. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        # Bug fix: the original tested examples_dir.is_file(), which missed an
        # existing examples *directory* (the common case) and let copytree fail
        # later with FileExistsError. Test for any existing entry instead.
        if self.examples_dir.exists():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains an examples dir. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        return True

    def create_default_src_dir(self) -> None:
        """Create src/ with the default per-main-type subdirectories."""
        self.src_dir.mkdir(parents=False, exist_ok=False)

        self.src_dir.joinpath("databases").mkdir(parents=False, exist_ok=False)
        self.src_dir.joinpath("files").mkdir(parents=False, exist_ok=False)

    def create_logs_dir(self) -> None:
        self.logs_dir.mkdir(exist_ok=True)

    def create_examples_dir(self) -> None:
        """Copy the bundled example sources into the new project."""
        examples_to_copy = Path(__file__).parent.joinpath("resources").joinpath("examples")

        shutil.copytree(str(examples_to_copy), str(self.examples_dir))

    def create_dce_config_file(self) -> None:
        """Create the config file and persist a fresh ProjectConfig (new project id)."""
        self.config_file.touch()
        ProjectConfig().save(self.config_file)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from databao_context_engine.project.project_config import ProjectConfig
|
|
6
|
+
from databao_context_engine.project.types import DatasourceId
|
|
7
|
+
|
|
8
|
+
# Well-known entry names inside a project directory.
SOURCE_FOLDER_NAME = "src"
OUTPUT_FOLDER_NAME = "output"
EXAMPLES_FOLDER_NAME = "examples"
LOGS_FOLDER_NAME = "logs"
# Legacy config file name, still accepted with a deprecation warning.
DEPRECATED_CONFIG_FILE_NAME = "nemory.ini"
CONFIG_FILE_NAME = "dce.ini"
ALL_RESULTS_FILE_NAME = "all_results.yaml"


logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class ProjectLayout:
    """A validated project directory together with its resolved config file path."""

    project_dir: Path
    config_file: Path

    def read_config_file(self) -> ProjectConfig:
        """Parse and return the project's configuration from disk."""
        return ProjectConfig.from_file(self.config_file)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def ensure_project_dir(project_dir: Path) -> ProjectLayout:
    """Return the project's layout, raising ValueError when the directory is not a valid project."""
    return _ProjectValidator(project_dir).ensure_project_dir_valid()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def is_project_dir_valid(project_dir: Path) -> bool:
    """Return True when *project_dir* contains an initialised project layout."""
    return validate_project_dir(project_dir) is not None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validate_project_dir(project_dir: Path) -> ProjectLayout | None:
    """Return the project's layout, or None when the directory is not a valid project."""
    return _ProjectValidator(project_dir).validate()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_source_dir(project_dir: Path) -> Path:
    """Return the src/ path inside *project_dir* (the path is not required to exist)."""
    return project_dir / SOURCE_FOLDER_NAME
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_output_dir(project_dir: Path) -> Path:
    """Return the output/ path inside *project_dir* (the path is not required to exist)."""
    return project_dir / OUTPUT_FOLDER_NAME
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_examples_dir(project_dir: Path) -> Path:
    """Return the examples/ path inside *project_dir* (the path is not required to exist)."""
    return project_dir / EXAMPLES_FOLDER_NAME
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_deprecated_config_file(project_dir: Path) -> Path:
    """Return the legacy config file path inside *project_dir*."""
    return project_dir / DEPRECATED_CONFIG_FILE_NAME
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_config_file(project_dir: Path) -> Path:
    """Return the config file path inside *project_dir*."""
    return project_dir / CONFIG_FILE_NAME
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_logs_dir(project_dir: Path) -> Path:
    """Return the logs/ path inside *project_dir* (the path is not required to exist)."""
    return project_dir / LOGS_FOLDER_NAME
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
    """Return the config file path for *datasource_id*, raising ValueError when it already exists."""
    target = get_source_dir(project_dir) / datasource_id.relative_path_to_config_file()

    if target.is_file():
        raise ValueError(f"A config file already exists for {str(datasource_id)}")

    return target
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def create_datasource_config_file(
    project_dir: Path, datasource_id: DatasourceId, config_content: str, overwrite_existing: bool
) -> Path:
    """Write *config_content* to the datasource's config file and return its path.

    When overwrite_existing is False, a ValueError is raised if the file
    already exists. Parent directories are created as needed.
    """
    if not overwrite_existing:
        ensure_datasource_config_file_doesnt_exist(project_dir, datasource_id)

    target = get_source_dir(project_dir) / datasource_id.relative_path_to_config_file()
    target.parent.mkdir(parents=True, exist_ok=True)

    target.write_text(config_content)

    return target
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class _ProjectValidator:
    """Checks whether a directory contains an initialised project layout (config file + src/)."""

    def __init__(self, project_dir: Path):
        self.project_dir = project_dir
        # Calls the *method* below, which shadows the module-level
        # get_config_file() helper; the method resolves the config file that
        # actually exists on disk (or None).
        self.config_file = self.get_config_file()

    def ensure_project_dir_valid(self) -> ProjectLayout:
        """Return the layout, raising ValueError for each way the directory can be invalid."""
        if not self.project_dir.is_dir():
            raise ValueError(f"The current project directory is not valid: {self.project_dir.resolve()}")

        if self.config_file is None:
            raise ValueError(
                f"The current project directory has not been initialised. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
            )

        if not self.is_src_valid():
            raise ValueError(
                f"The current project directory has not been initialised. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
            )

        return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)

    def validate(self) -> ProjectLayout | None:
        """Non-raising variant of ensure_project_dir_valid: None when invalid."""
        if self.config_file is not None and self.is_src_valid():
            return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)
        return None

    def is_src_valid(self) -> bool:
        # src/ only needs to exist as a directory; its content is not checked here.
        return get_source_dir(self.project_dir).is_dir()

    def get_config_file(self) -> Path | None:
        """Return the resolved config file on disk, preferring the deprecated name (with a warning)."""
        deprecated_config_file = get_deprecated_config_file(self.project_dir)
        if deprecated_config_file.is_file():
            logger.warning(
                f"{DEPRECATED_CONFIG_FILE_NAME} project config file is deprecated, please rename this file to {CONFIG_FILE_NAME}"
            )
            return deprecated_config_file.resolve()
        config_file = get_config_file(self.project_dir)
        if config_file.is_file():
            return config_file.resolve()
        return None
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import configparser
|
|
2
|
+
import uuid
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ProjectConfig:
|
|
7
|
+
# Since we're supporting Python 3.10, we have to add a section
|
|
8
|
+
# configparser.UNNAMED_SECTION was only introduced in Python 3.13
|
|
9
|
+
_DEFAULT_SECTION = "DEFAULT"
|
|
10
|
+
_PROJECT_ID_PROPERTY_NAME = "project-id"
|
|
11
|
+
|
|
12
|
+
project_id: uuid.UUID
|
|
13
|
+
|
|
14
|
+
@staticmethod
|
|
15
|
+
def from_file(project_config_file: Path) -> "ProjectConfig":
|
|
16
|
+
with open(project_config_file, "r") as file_stream:
|
|
17
|
+
config = configparser.ConfigParser()
|
|
18
|
+
config.read_file(file_stream)
|
|
19
|
+
|
|
20
|
+
return ProjectConfig(
|
|
21
|
+
project_id=uuid.UUID(config[ProjectConfig._DEFAULT_SECTION][ProjectConfig._PROJECT_ID_PROPERTY_NAME])
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
def __init__(self, project_id: uuid.UUID | None = None):
|
|
25
|
+
self.project_id = project_id or uuid.uuid4()
|
|
26
|
+
|
|
27
|
+
def save(self, project_config_file: Path) -> None:
|
|
28
|
+
config = configparser.ConfigParser()
|
|
29
|
+
config[self._DEFAULT_SECTION][self._PROJECT_ID_PROPERTY_NAME] = str(self.project_id)
|
|
30
|
+
|
|
31
|
+
with open(project_config_file, "w") as file_stream:
|
|
32
|
+
config.write(file_stream)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Project Notes — September 2025
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
This project integrates database introspection, dbt metadata, and document embeddings to create a unified data context
|
|
6
|
+
engine.
|
|
7
|
+
|
|
8
|
+
## Goals
|
|
9
|
+
|
|
10
|
+
- Enable users to search and reason about their data environment in natural language.
|
|
11
|
+
- Extend support to plain text and PDF documents to capture unstructured knowledge.
|
|
12
|
+
- Provide consistent output structure under the `/output/run-<timestamp>` folders.
|
|
13
|
+
|
|
14
|
+
## Next Steps
|
|
15
|
+
|
|
16
|
+
1. Finish the file ingestion subsystem (loader, service, exporter).
|
|
17
|
+
2. Add PDF parsing and chunking support.
|
|
18
|
+
3. Implement caching for large document embeddings.
|
|
19
|
+
4. Evaluate chunking strategies for better recall during querying.
|
|
20
|
+
|
|
21
|
+
## Open Questions
|
|
22
|
+
|
|
23
|
+
- Should we embed entire paragraphs or split by sentence boundaries?
|
|
24
|
+
- How do we handle private/sensitive data inside document chunks?
|
|
25
|
+
- Could the system use source control metadata (e.g., Git history) to enrich document context?
|
|
26
|
+
|
|
27
|
+
## Summary
|
|
28
|
+
|
|
29
|
+
Once completed, the system will allow querying across databases, dbt models, and raw files — a single interface for
|
|
30
|
+
exploring data knowledge.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
The Vision: To empower developers worldwide with a seamless AI‑powered collaborative code editor that blends real‑time collaboration with intelligent assistance, debugging, and predictive coding suggestions.
|
|
2
|
+
The project, dubbed "CoDev‑AI," is an open‑source, web‑based platform built on a microservices architecture that allows plug‑ins, language models, and cloud storage to interoperate efficiently.
|
|
3
|
+
|
|
4
|
+
At its core, the editor is built on the powerful CodeMirror engine, which offers a lightweight, feature‑rich API for syntax highlighting, bracket matching, and refactoring. Users can open a shared workspace, add collaborators, and start writing code together. The AI layer sits beneath the UI, listening to every keystroke, providing real‑time code completions based on a fine‑tuned GPT‑4 architecture, as well as contextual insights into project dependencies.
|
|
5
|
+
|
|
6
|
+
The architecture is designed to scale horizontally. Microservices such as the "CodeAssist" service, "CollabSync," and "RepoSync" run in separate containers, each exposing a RESTful API. The frontend communicates through a WebSocket channel for low‑latency updates, while the services process requests asynchronously to keep the UI fluid.
|
|
7
|
+
|
|
8
|
+
The data layer stores all changes in an event‑sourced database, enabling audit trails and replay of entire sessions. Each change is tagged with a unique ID, timestamp, and the user responsible. This approach guarantees that any rollback or conflict resolution is trivial.
|
|
9
|
+
|
|
10
|
+
The core AI capabilities are integrated via the "CodeAssist" microservice. It leverages a fine‑tuned transformer model trained on millions of lines of open‑source code, and supports multiple languages, including JavaScript, Python, Rust, Go, and TypeScript. The model offers features such as autocomplete, context‑aware refactoring, and automated unit‑test generation. Moreover, it can analyze the entire repository to spot deprecated APIs, suggest migration paths, and produce a compliance report.
|
|
11
|
+
|
|
12
|
+
The collaboration engine is built on CRDTs (Conflict‑Free Replicated Data Types), ensuring eventual consistency across all clients without locking mechanisms. Users can chat, annotate, and review code inline, with all changes instantly visible to the entire team. A sophisticated permission system allows owners to define roles, read/write access, and integrate with existing identity providers like SAML and OAuth2.
|
|
13
|
+
|
|
14
|
+
Beyond the editor, the platform includes a build pipeline runner that automatically triggers when a merge request is opened. It compiles, runs tests, collects coverage data, and reports metrics to the UI. This feature encourages best practices by providing instant feedback on code quality.
|
|
15
|
+
|
|
16
|
+
Future roadmap highlights the addition of a visual debugger that can step through TypeScript transpilation, highlight stack traces, and enable live breakpoints. Additionally, the team is exploring a self‑hosted variant that ships as a single Docker image, appealing to enterprises with stricter compliance requirements.
|
|
17
|
+
|
|
18
|
+
The project is hosted on GitHub under a permissive MIT license. Contributions are welcomed in the form of pull requests for new features, bug fixes, or educational material. A dedicated discussion forum guides developers in onboarding, code best practices, and architectural design. The team maintains a comprehensive documentation portal featuring tutorials, sample projects, and a robust FAQ.
|
|
19
|
+
|
|
20
|
+
In summary, CoDev‑AI aspires to transform the software development landscape by bridging creativity with artificial intelligence, redefining how developers collaborate, and enabling seamless collaboration across globally distributed teams.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from databao_context_engine.project.layout import ProjectLayout, get_output_dir
|
|
4
|
+
from databao_context_engine.storage.connection import open_duckdb_connection
|
|
5
|
+
from databao_context_engine.storage.repositories.factories import create_run_repository
|
|
6
|
+
from databao_context_engine.storage.repositories.run_repository import RunRepository
|
|
7
|
+
from databao_context_engine.system.properties import get_db_path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def resolve_run_name(*, project_layout: ProjectLayout, run_name: str | None) -> str:
    """Resolve ``run_name`` against the runs recorded for this project.

    Reads the project id from the project's config file, opens the engine
    database, and delegates the actual lookup to
    ``resolve_run_name_from_repo``.
    """
    config = project_layout.read_config_file()
    pid = str(config.project_id)

    with open_duckdb_connection(get_db_path()) as connection:
        repo = create_run_repository(connection)
        return resolve_run_name_from_repo(run_repository=repo, project_id=pid, run_name=run_name)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def resolve_run_name_from_repo(*, run_repository: RunRepository, project_id: str, run_name: str | None) -> str:
    """Return the canonical run name for ``project_id``.

    When ``run_name`` is None the most recent run is used; otherwise the
    named run is looked up directly.

    Raises:
        LookupError: if no matching run exists for the project.
    """
    if run_name is not None:
        # Explicit name given: verify it actually exists for this project.
        match = run_repository.get_by_run_name(project_id=project_id, run_name=run_name)
        if match is None:
            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
        return match.run_name

    # No name given: fall back to the latest recorded run.
    newest = run_repository.get_latest_run_for_project(project_id=project_id)
    if newest is None:
        raise LookupError(f"No runs found for project '{project_id}'. Run a build first.")
    return newest.run_name
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_run_dir(project_dir: Path, run_name: str) -> Path:
    """Return the directory for ``run_name`` under the project's output folder.

    Raises:
        ValueError: when that directory does not exist on disk.
    """
    candidate = get_output_dir(project_dir) / run_name
    if candidate.is_dir():
        return candidate
    raise ValueError(
        f"The run with name {run_name} doesn't exist in the project. [project_dir: {project_dir.resolve()}]"
    )
|