databao-context-engine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. databao_context_engine/__init__.py +35 -0
  2. databao_context_engine/build_sources/__init__.py +0 -0
  3. databao_context_engine/build_sources/internal/__init__.py +0 -0
  4. databao_context_engine/build_sources/internal/build_runner.py +111 -0
  5. databao_context_engine/build_sources/internal/build_service.py +77 -0
  6. databao_context_engine/build_sources/internal/build_wiring.py +52 -0
  7. databao_context_engine/build_sources/internal/export_results.py +43 -0
  8. databao_context_engine/build_sources/internal/plugin_execution.py +74 -0
  9. databao_context_engine/build_sources/public/__init__.py +0 -0
  10. databao_context_engine/build_sources/public/api.py +4 -0
  11. databao_context_engine/cli/__init__.py +0 -0
  12. databao_context_engine/cli/add_datasource_config.py +130 -0
  13. databao_context_engine/cli/commands.py +256 -0
  14. databao_context_engine/cli/datasources.py +64 -0
  15. databao_context_engine/cli/info.py +32 -0
  16. databao_context_engine/config/__init__.py +0 -0
  17. databao_context_engine/config/log_config.yaml +16 -0
  18. databao_context_engine/config/logging.py +43 -0
  19. databao_context_engine/databao_context_project_manager.py +92 -0
  20. databao_context_engine/databao_engine.py +85 -0
  21. databao_context_engine/datasource_config/__init__.py +0 -0
  22. databao_context_engine/datasource_config/add_config.py +50 -0
  23. databao_context_engine/datasource_config/check_config.py +131 -0
  24. databao_context_engine/datasource_config/datasource_context.py +60 -0
  25. databao_context_engine/event_journal/__init__.py +0 -0
  26. databao_context_engine/event_journal/writer.py +29 -0
  27. databao_context_engine/generate_configs_schemas.py +92 -0
  28. databao_context_engine/init_project.py +18 -0
  29. databao_context_engine/introspection/__init__.py +0 -0
  30. databao_context_engine/introspection/property_extract.py +202 -0
  31. databao_context_engine/llm/__init__.py +0 -0
  32. databao_context_engine/llm/config.py +20 -0
  33. databao_context_engine/llm/descriptions/__init__.py +0 -0
  34. databao_context_engine/llm/descriptions/ollama.py +21 -0
  35. databao_context_engine/llm/descriptions/provider.py +10 -0
  36. databao_context_engine/llm/embeddings/__init__.py +0 -0
  37. databao_context_engine/llm/embeddings/ollama.py +37 -0
  38. databao_context_engine/llm/embeddings/provider.py +13 -0
  39. databao_context_engine/llm/errors.py +16 -0
  40. databao_context_engine/llm/factory.py +61 -0
  41. databao_context_engine/llm/install.py +227 -0
  42. databao_context_engine/llm/runtime.py +73 -0
  43. databao_context_engine/llm/service.py +159 -0
  44. databao_context_engine/main.py +19 -0
  45. databao_context_engine/mcp/__init__.py +0 -0
  46. databao_context_engine/mcp/all_results_tool.py +5 -0
  47. databao_context_engine/mcp/mcp_runner.py +16 -0
  48. databao_context_engine/mcp/mcp_server.py +63 -0
  49. databao_context_engine/mcp/retrieve_tool.py +22 -0
  50. databao_context_engine/pluginlib/__init__.py +0 -0
  51. databao_context_engine/pluginlib/build_plugin.py +107 -0
  52. databao_context_engine/pluginlib/config.py +37 -0
  53. databao_context_engine/pluginlib/plugin_utils.py +68 -0
  54. databao_context_engine/plugins/__init__.py +0 -0
  55. databao_context_engine/plugins/athena_db_plugin.py +12 -0
  56. databao_context_engine/plugins/base_db_plugin.py +45 -0
  57. databao_context_engine/plugins/clickhouse_db_plugin.py +15 -0
  58. databao_context_engine/plugins/databases/__init__.py +0 -0
  59. databao_context_engine/plugins/databases/athena_introspector.py +101 -0
  60. databao_context_engine/plugins/databases/base_introspector.py +144 -0
  61. databao_context_engine/plugins/databases/clickhouse_introspector.py +162 -0
  62. databao_context_engine/plugins/databases/database_chunker.py +69 -0
  63. databao_context_engine/plugins/databases/databases_types.py +114 -0
  64. databao_context_engine/plugins/databases/duckdb_introspector.py +325 -0
  65. databao_context_engine/plugins/databases/introspection_model_builder.py +270 -0
  66. databao_context_engine/plugins/databases/introspection_scope.py +74 -0
  67. databao_context_engine/plugins/databases/introspection_scope_matcher.py +103 -0
  68. databao_context_engine/plugins/databases/mssql_introspector.py +433 -0
  69. databao_context_engine/plugins/databases/mysql_introspector.py +338 -0
  70. databao_context_engine/plugins/databases/postgresql_introspector.py +428 -0
  71. databao_context_engine/plugins/databases/snowflake_introspector.py +287 -0
  72. databao_context_engine/plugins/duckdb_db_plugin.py +12 -0
  73. databao_context_engine/plugins/mssql_db_plugin.py +12 -0
  74. databao_context_engine/plugins/mysql_db_plugin.py +12 -0
  75. databao_context_engine/plugins/parquet_plugin.py +32 -0
  76. databao_context_engine/plugins/plugin_loader.py +110 -0
  77. databao_context_engine/plugins/postgresql_db_plugin.py +12 -0
  78. databao_context_engine/plugins/resources/__init__.py +0 -0
  79. databao_context_engine/plugins/resources/parquet_chunker.py +23 -0
  80. databao_context_engine/plugins/resources/parquet_introspector.py +154 -0
  81. databao_context_engine/plugins/snowflake_db_plugin.py +12 -0
  82. databao_context_engine/plugins/unstructured_files_plugin.py +68 -0
  83. databao_context_engine/project/__init__.py +0 -0
  84. databao_context_engine/project/datasource_discovery.py +141 -0
  85. databao_context_engine/project/info.py +44 -0
  86. databao_context_engine/project/init_project.py +102 -0
  87. databao_context_engine/project/layout.py +127 -0
  88. databao_context_engine/project/project_config.py +32 -0
  89. databao_context_engine/project/resources/examples/src/databases/example_postgres.yaml +7 -0
  90. databao_context_engine/project/resources/examples/src/files/documentation.md +30 -0
  91. databao_context_engine/project/resources/examples/src/files/notes.txt +20 -0
  92. databao_context_engine/project/runs.py +39 -0
  93. databao_context_engine/project/types.py +134 -0
  94. databao_context_engine/retrieve_embeddings/__init__.py +0 -0
  95. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  96. databao_context_engine/retrieve_embeddings/internal/export_results.py +12 -0
  97. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +34 -0
  98. databao_context_engine/retrieve_embeddings/internal/retrieve_service.py +68 -0
  99. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +29 -0
  100. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  101. databao_context_engine/retrieve_embeddings/public/api.py +3 -0
  102. databao_context_engine/serialisation/__init__.py +0 -0
  103. databao_context_engine/serialisation/yaml.py +35 -0
  104. databao_context_engine/services/__init__.py +0 -0
  105. databao_context_engine/services/chunk_embedding_service.py +104 -0
  106. databao_context_engine/services/embedding_shard_resolver.py +64 -0
  107. databao_context_engine/services/factories.py +88 -0
  108. databao_context_engine/services/models.py +12 -0
  109. databao_context_engine/services/persistence_service.py +61 -0
  110. databao_context_engine/services/run_name_policy.py +8 -0
  111. databao_context_engine/services/table_name_policy.py +15 -0
  112. databao_context_engine/storage/__init__.py +0 -0
  113. databao_context_engine/storage/connection.py +32 -0
  114. databao_context_engine/storage/exceptions/__init__.py +0 -0
  115. databao_context_engine/storage/exceptions/exceptions.py +6 -0
  116. databao_context_engine/storage/migrate.py +127 -0
  117. databao_context_engine/storage/migrations/V01__init.sql +63 -0
  118. databao_context_engine/storage/models.py +51 -0
  119. databao_context_engine/storage/repositories/__init__.py +0 -0
  120. databao_context_engine/storage/repositories/chunk_repository.py +130 -0
  121. databao_context_engine/storage/repositories/datasource_run_repository.py +136 -0
  122. databao_context_engine/storage/repositories/embedding_model_registry_repository.py +87 -0
  123. databao_context_engine/storage/repositories/embedding_repository.py +113 -0
  124. databao_context_engine/storage/repositories/factories.py +35 -0
  125. databao_context_engine/storage/repositories/run_repository.py +157 -0
  126. databao_context_engine/storage/repositories/vector_search_repository.py +63 -0
  127. databao_context_engine/storage/transaction.py +14 -0
  128. databao_context_engine/system/__init__.py +0 -0
  129. databao_context_engine/system/properties.py +13 -0
  130. databao_context_engine/templating/__init__.py +0 -0
  131. databao_context_engine/templating/renderer.py +29 -0
  132. databao_context_engine-0.1.1.dist-info/METADATA +186 -0
  133. databao_context_engine-0.1.1.dist-info/RECORD +135 -0
  134. databao_context_engine-0.1.1.dist-info/WHEEL +4 -0
  135. databao_context_engine-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,68 @@
1
+ import re
2
+ from io import BufferedReader
3
+ from typing import Any, TypedDict
4
+
5
+ from databao_context_engine.pluginlib.build_plugin import BuildFilePlugin, EmbeddableChunk
6
+
7
+
8
class FileChunk(TypedDict):
    """One chunk of a plain-text file, as produced by the word-window chunker."""

    # Index of the chunk's first word within the whole file's word list
    # (not a sequential chunk counter) — see _chunk_file.
    chunk_index: int
    # The chunk's text: the windowed words re-joined with single spaces.
    chunk_content: str
11
+
12
+
13
class InternalUnstructuredFilesPlugin(BuildFilePlugin):
    """Build plugin that chunks plain text files (txt/md) into overlapping word windows.

    "Tokens" here are whitespace-separated words, not model tokens.
    """

    id = "jetbrains/unstructured_files"
    name = "Unstructured Files Plugin"

    _SUPPORTED_FILES_EXTENSIONS = {"txt", "md"}

    _DEFAULT_MAX_TOKENS = 300
    _DEFAULT_TOKENS_OVERLAP = 50

    def __init__(self, max_tokens: int | None = None, tokens_overlap: int | None = None):
        """max_tokens: window size in words; tokens_overlap: words shared between consecutive chunks."""
        self.max_tokens = max_tokens or self._DEFAULT_MAX_TOKENS
        self.tokens_overlap = tokens_overlap or self._DEFAULT_TOKENS_OVERLAP
        # Bug fix: an overlap >= window size previously made _chunk_file loop
        # forever, because the start index could never advance.
        if self.tokens_overlap >= self.max_tokens:
            raise ValueError("tokens_overlap must be smaller than max_tokens")

    def supported_types(self) -> set[str]:
        """Full type identifiers handled by this plugin (e.g. "files/txt")."""
        return {f"files/{extension}" for extension in self._SUPPORTED_FILES_EXTENSIONS}

    def build_file_context(self, full_type: str, file_name: str, file_buffer: BufferedReader) -> Any:
        """Read the whole file as UTF-8 and return a context dict with its chunks."""
        file_content = self._read_file(file_buffer)

        return {
            "chunks": self._chunk_file(file_content),
        }

    def divide_context_into_chunks(self, context: Any) -> list[EmbeddableChunk]:
        """Wrap every file chunk from the context into an EmbeddableChunk."""
        return [self._create_embeddable_chunk_from_file_chunk(file_chunk) for file_chunk in context["chunks"]]

    def _create_embeddable_chunk_from_file_chunk(self, file_chunk: FileChunk) -> EmbeddableChunk:
        # The chunk's text doubles as the embeddable text; the whole chunk dict is kept as content.
        return EmbeddableChunk(
            embeddable_text=file_chunk["chunk_content"],
            content=file_chunk,
        )

    def _read_file(self, file_buffer: BufferedReader) -> str:
        file_bytes = file_buffer.read()
        return file_bytes.decode("utf-8")

    def _chunk_file(self, file_content: str) -> list[FileChunk]:
        """Split file_content into overlapping word-window chunks.

        chunk_index is the index of the chunk's first word in the file's word list.
        """
        # Bug fix: str.split() drops the empty tokens that re.split(r"\s+", ...)
        # produced for leading/trailing whitespace, which previously padded
        # chunks with empty "words" (and made a whitespace-only file yield one
        # empty chunk).
        words_list = file_content.split()

        chunks: list[FileChunk] = []

        chunk_start_index = 0
        number_of_words = len(words_list)
        while chunk_start_index < number_of_words:
            chunk_end_index = min(number_of_words, chunk_start_index + self.max_tokens)
            chunks.append(
                FileChunk(
                    chunk_index=chunk_start_index,
                    chunk_content=" ".join(words_list[chunk_start_index:chunk_end_index]),
                )
            )
            # Step forward by (window - overlap); jump straight to the end once
            # the final window has been emitted.
            chunk_start_index = (
                (chunk_end_index - self.tokens_overlap) if chunk_end_index < number_of_words else number_of_words
            )

        return chunks
File without changes
@@ -0,0 +1,141 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ import yaml
6
+
7
+ from databao_context_engine.pluginlib.build_plugin import DatasourceType
8
+ from databao_context_engine.project.layout import get_source_dir
9
+ from databao_context_engine.project.types import (
10
+ DatasourceDescriptor,
11
+ DatasourceKind,
12
+ PreparedConfig,
13
+ PreparedDatasource,
14
+ PreparedFile,
15
+ DatasourceId,
16
+ Datasource,
17
+ )
18
+ from databao_context_engine.templating.renderer import render_template
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def get_datasource_list(project_dir: Path) -> list[Datasource]:
    """Return one Datasource per valid source found under the project's src/ directory.

    Sources that fail preparation are logged and skipped rather than aborting the scan.
    """
    datasources: list[Datasource] = []
    for descriptor in discover_datasources(project_dir=project_dir):
        try:
            prepared = prepare_source(descriptor)
        except Exception as e:
            logger.debug(str(e), exc_info=True, stack_info=True)
            logger.info(f"Invalid source at ({descriptor.path}): {str(e)}")
            continue

        datasource = Datasource(
            id=DatasourceId.from_datasource_config_file_path(descriptor.path),
            type=prepared.datasource_type,
        )
        datasources.append(datasource)

    return datasources
41
+
42
+
43
def discover_datasources(project_dir: Path) -> list[DatasourceDescriptor]:
    """
    Scan the project's src/ directory and return all discovered sources.

    Rules:
    - Each first-level directory under src/ is treated as a main_type
    - Unsupported or unreadable entries are skipped.
    - The returned list is sorted by directory and then filename
    """
    src = get_source_dir(project_dir)
    if not src.exists() or not src.is_dir():
        raise ValueError(f"src directory does not exist in {project_dir}")

    found: list[DatasourceDescriptor] = []
    main_dirs = sorted((entry for entry in src.iterdir() if entry.is_dir()), key=lambda entry: entry.name.lower())
    for main_dir in main_dirs:
        candidates = sorted(
            (entry for entry in main_dir.iterdir() if _is_datasource_file(entry)),
            key=lambda entry: entry.name.lower(),
        )
        for candidate in candidates:
            descriptor = load_datasource_descriptor(candidate)
            if descriptor is not None:
                found.append(descriptor)

    return found
64
+
65
+
66
+ def _is_datasource_file(p: Path) -> bool:
67
+ # ignore backup files
68
+ return p.is_file() and not p.suffix.endswith("~")
69
+
70
+
71
def get_datasource_descriptors(project_dir: Path, datasource_ids: list[DatasourceId]) -> list[DatasourceDescriptor]:
    """Load the descriptors for an explicit list of datasource ids.

    Raises ValueError when the src directory or any referenced config file is missing.
    """
    src = get_source_dir(project_dir)
    if not src.exists() or not src.is_dir():
        raise ValueError(f"src directory does not exist in {project_dir}")

    descriptors: list[DatasourceDescriptor] = []
    for datasource_id in datasource_ids:
        config_path = src.joinpath(datasource_id.relative_path_to_config_file())
        if not config_path.is_file():
            raise ValueError(f"Datasource config file not found: {config_path}")

        descriptor = load_datasource_descriptor(config_path)
        if descriptor is not None:
            descriptors.append(descriptor)

    return descriptors
87
+
88
+
89
def load_datasource_descriptor(path: Path) -> DatasourceDescriptor | None:
    """
    Build a DatasourceDescriptor for a single file located at src/<parent_name>/.

    Returns None when the path is not a regular file or has no extension.
    """
    if not path.is_file():
        return None

    main_type = path.parent.name
    resolved = path.resolve()
    extension = path.suffix.lower().lstrip(".")

    # Anything inside the "files" directory is raw file content, regardless of extension.
    if main_type == "files":
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.FILE)

    # YAML at the top of another main_type directory is a datasource config.
    if extension in ("yaml", "yml"):
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.CONFIG)

    # Any other extension is treated as a plain file datasource.
    if extension:
        return DatasourceDescriptor(path=resolved, main_type=main_type, kind=DatasourceKind.FILE)

    logger.debug("Skipping file without extension: %s", path)
    return None
110
+
111
+
112
def prepare_source(datasource: DatasourceDescriptor) -> PreparedDatasource:
    """
    Convert a discovered datasource into a prepared datasource ready for plugin execution.

    FILE datasources derive their subtype from the file extension; CONFIG datasources
    derive it from the mandatory 'type' key of the parsed YAML config.

    Raises ValueError when a config file lacks a string 'type' entry.
    """
    if datasource.kind is DatasourceKind.FILE:
        file_subtype = datasource.path.suffix.lower().lstrip(".")
        return PreparedFile(
            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=file_subtype),
            path=datasource.path,
        )

    else:
        config = _parse_config_file(datasource.path)

        subtype = config.get("type")
        if not subtype or not isinstance(subtype, str):
            # Bug fix: previously passed logging-style "%s" args to ValueError, so the
            # placeholder was never formatted and the exception carried a tuple.
            raise ValueError(f"Config missing 'type' at {datasource.path} - skipping")

        return PreparedConfig(
            datasource_type=DatasourceType.from_main_and_subtypes(main_type=datasource.main_type, subtype=subtype),
            path=datasource.path,
            config=config,
            # The file name (without extension) doubles as the datasource name.
            datasource_name=datasource.path.stem,
        )
136
+
137
+
138
def _parse_config_file(file_path: Path) -> dict[Any, Any]:
    """Render the file through the template engine, then parse it as YAML.

    An empty or falsy YAML document is normalised to an empty dict.
    """
    raw_text = file_path.read_text()
    rendered = render_template(raw_text)
    return yaml.safe_load(rendered) or {}
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+ from importlib.metadata import version
3
+ from pathlib import Path
4
+ from uuid import UUID
5
+
6
+ from databao_context_engine.project.layout import validate_project_dir
7
+ from databao_context_engine.system.properties import get_dce_path
8
+
9
+
10
+ @dataclass(kw_only=True, frozen=True)
11
+ class DceProjectInfo:
12
+ project_path: Path
13
+ is_initialised: bool
14
+ project_id: UUID | None
15
+
16
+
17
+ @dataclass(kw_only=True, frozen=True)
18
+ class DceInfo:
19
+ version: str
20
+ dce_path: Path
21
+
22
+ project_info: DceProjectInfo
23
+
24
+
25
def get_databao_context_engine_info(project_dir: Path) -> DceInfo:
    """Assemble engine-level and project-level information into a single DceInfo."""
    project_info = _get_project_info(project_dir)
    return DceInfo(
        version=get_dce_version(),
        dce_path=get_dce_path(),
        project_info=project_info,
    )
31
+
32
+
33
def _get_project_info(project_dir: Path) -> DceProjectInfo:
    """Describe the initialisation state of project_dir; project_id only when initialised."""
    layout = validate_project_dir(project_dir)

    if layout is None:
        return DceProjectInfo(project_path=project_dir, is_initialised=False, project_id=None)

    return DceProjectInfo(
        project_path=project_dir,
        is_initialised=True,
        project_id=layout.read_config_file().project_id,
    )
41
+
42
+
43
def get_dce_version() -> str:
    """Installed version string of the databao_context_engine distribution."""
    distribution_name = "databao_context_engine"
    return version(distribution_name)
@@ -0,0 +1,102 @@
1
+ import shutil
2
+ from enum import Enum
3
+ from pathlib import Path
4
+
5
+ from databao_context_engine.project.layout import (
6
+ get_config_file,
7
+ get_deprecated_config_file,
8
+ get_examples_dir,
9
+ get_logs_dir,
10
+ get_source_dir,
11
+ )
12
+ from databao_context_engine.project.project_config import ProjectConfig
13
+
14
+
15
class InitErrorReason(Enum):
    # Machine-readable causes for InitProjectError, letting callers (e.g. the CLI)
    # map a failure to a user-facing message or exit code.
    PROJECT_DIR_DOESNT_EXIST = "PROJECT_DIR_DOESNT_EXIST"
    PROJECT_DIR_NOT_DIRECTORY = "PROJECT_DIR_NOT_DIRECTORY"
    PROJECT_DIR_ALREADY_INITIALISED = "PROJECT_DIR_ALREADY_INITIALISED"
19
+
20
+
21
class InitProjectError(Exception):
    """Raised when a project directory cannot be initialised; `reason` carries the category."""

    reason: InitErrorReason

    def __init__(self, reason: InitErrorReason, message: str | None):
        # A missing message still yields a well-formed Exception (empty string).
        super().__init__(message or "")
        self.reason = reason
28
+
29
+
30
def init_project_dir(project_dir: Path) -> Path:
    """Initialise a Databao Context Engine project inside project_dir and return it.

    Raises InitProjectError when the directory is unusable or already initialised.
    """
    _ProjectCreator(project_dir=project_dir).create()
    return project_dir
35
+
36
+
37
class _ProjectCreator:
    """Creates the standard Databao Context Engine project layout inside an existing directory.

    Layout created: src/ (with databases/ and files/ subfolders), logs/, examples/
    (copied from the packaged resources) and the dce.ini config file.
    """

    def __init__(self, project_dir: Path):
        self.project_dir = project_dir
        self.deprecated_config_file = get_deprecated_config_file(project_dir)
        self.config_file = get_config_file(project_dir)
        self.src_dir = get_source_dir(project_dir)
        self.examples_dir = get_examples_dir(project_dir)
        self.logs_dir = get_logs_dir(project_dir)

    def create(self):
        """Validate preconditions, then materialise the layout.

        Validation runs first so a refused init leaves no partial layout behind.
        """
        self.ensure_can_init_project()

        self.create_default_src_dir()
        self.create_logs_dir()
        self.create_examples_dir()
        self.create_dce_config_file()

    def ensure_can_init_project(self) -> bool:
        """Raise InitProjectError when project_dir is unusable or already initialised."""
        if not self.project_dir.exists():
            raise InitProjectError(
                message=f"{self.project_dir.resolve()} does not exist", reason=InitErrorReason.PROJECT_DIR_DOESNT_EXIST
            )

        if not self.project_dir.is_dir():
            raise InitProjectError(
                message=f"{self.project_dir.resolve()} is not a directory",
                reason=InitErrorReason.PROJECT_DIR_NOT_DIRECTORY,
            )

        if self.config_file.is_file() or self.deprecated_config_file.is_file():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains a config file. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        if self.src_dir.is_dir():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains a src directory. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        # Bug fix: previously checked `is_file()`, which missed an existing examples
        # *directory* and let shutil.copytree fail later with a raw FileExistsError.
        if self.examples_dir.exists():
            raise InitProjectError(
                message=f"Can't initialise a Databao Context Engine project in a folder that already contains an examples dir. [project_dir: {self.project_dir.resolve()}]",
                reason=InitErrorReason.PROJECT_DIR_ALREADY_INITIALISED,
            )

        return True

    def create_default_src_dir(self) -> None:
        # parents=False / exist_ok=False: validation already guaranteed a clean slate,
        # so any failure here is a genuine race or bug and should surface.
        self.src_dir.mkdir(parents=False, exist_ok=False)

        self.src_dir.joinpath("databases").mkdir(parents=False, exist_ok=False)
        self.src_dir.joinpath("files").mkdir(parents=False, exist_ok=False)

    def create_logs_dir(self) -> None:
        # A pre-existing logs dir is harmless, so exist_ok=True.
        self.logs_dir.mkdir(exist_ok=True)

    def create_examples_dir(self) -> None:
        # Example sources ship inside the package under resources/examples.
        examples_to_copy = Path(__file__).parent.joinpath("resources").joinpath("examples")

        shutil.copytree(str(examples_to_copy), str(self.examples_dir))

    def create_dce_config_file(self) -> None:
        self.config_file.touch()
        # Writes a freshly generated project id into the config file.
        ProjectConfig().save(self.config_file)
@@ -0,0 +1,127 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+
5
+ from databao_context_engine.project.project_config import ProjectConfig
6
+ from databao_context_engine.project.types import DatasourceId
7
+
8
# Well-known entry names inside a Databao Context Engine project directory.
SOURCE_FOLDER_NAME = "src"  # datasource definitions live here
OUTPUT_FOLDER_NAME = "output"  # build outputs, one sub-folder per run
EXAMPLES_FOLDER_NAME = "examples"
LOGS_FOLDER_NAME = "logs"
DEPRECATED_CONFIG_FILE_NAME = "nemory.ini"  # legacy config name, still read with a warning
CONFIG_FILE_NAME = "dce.ini"
ALL_RESULTS_FILE_NAME = "all_results.yaml"


logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass(frozen=True)
class ProjectLayout:
    """A validated project directory together with the config file found inside it."""

    # Root of the project.
    project_dir: Path
    # Resolved path to dce.ini (or the deprecated nemory.ini).
    config_file: Path

    def read_config_file(self) -> ProjectConfig:
        """Parse and return the project's configuration file."""
        return ProjectConfig.from_file(self.config_file)
27
+
28
+
29
def ensure_project_dir(project_dir: Path) -> ProjectLayout:
    """Validate project_dir and return its layout; raises ValueError when invalid."""
    validator = _ProjectValidator(project_dir)
    return validator.ensure_project_dir_valid()
31
+
32
+
33
def is_project_dir_valid(project_dir: Path) -> bool:
    """True when project_dir contains a config file and a src directory."""
    layout = validate_project_dir(project_dir)
    return layout is not None
35
+
36
+
37
def validate_project_dir(project_dir: Path) -> ProjectLayout | None:
    """Return the project's layout, or None when project_dir is not an initialised project."""
    validator = _ProjectValidator(project_dir)
    return validator.validate()
39
+
40
+
41
def get_source_dir(project_dir: Path) -> Path:
    """Path of the project's src/ folder (not guaranteed to exist)."""
    return project_dir / SOURCE_FOLDER_NAME
43
+
44
+
45
def get_output_dir(project_dir: Path) -> Path:
    """Path of the project's output/ folder (not guaranteed to exist)."""
    return project_dir / OUTPUT_FOLDER_NAME
47
+
48
+
49
def get_examples_dir(project_dir: Path) -> Path:
    """Path of the project's examples/ folder (not guaranteed to exist)."""
    return project_dir / EXAMPLES_FOLDER_NAME
51
+
52
+
53
def get_deprecated_config_file(project_dir: Path) -> Path:
    """Path of the legacy nemory.ini config file (not guaranteed to exist)."""
    return project_dir / DEPRECATED_CONFIG_FILE_NAME
55
+
56
+
57
def get_config_file(project_dir: Path) -> Path:
    """Path of the project's dce.ini config file (not guaranteed to exist)."""
    return project_dir / CONFIG_FILE_NAME
59
+
60
+
61
def get_logs_dir(project_dir: Path) -> Path:
    """Path of the project's logs/ folder (not guaranteed to exist)."""
    return project_dir / LOGS_FOLDER_NAME
63
+
64
+
65
def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
    """Return the config-file path for datasource_id, raising if a file already exists there."""
    config_file = get_source_dir(project_dir) / datasource_id.relative_path_to_config_file()

    if config_file.is_file():
        raise ValueError(f"A config file already exists for {str(datasource_id)}")

    return config_file
72
+
73
+
74
def create_datasource_config_file(
    project_dir: Path, datasource_id: DatasourceId, config_content: str, overwrite_existing: bool
) -> Path:
    """Write config_content to the datasource's config file and return its path.

    With overwrite_existing=False, a pre-existing file makes this raise ValueError.
    Parent directories are created as needed.
    """
    if not overwrite_existing:
        ensure_datasource_config_file_doesnt_exist(project_dir, datasource_id)

    target = get_source_dir(project_dir) / datasource_id.relative_path_to_config_file()
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(config_content)

    return target
86
+
87
+
88
class _ProjectValidator:
    """Checks that a directory is an initialised project: a config file plus a src/ dir."""

    def __init__(self, project_dir: Path):
        self.project_dir = project_dir
        # Resolved config file, or None when neither dce.ini nor the deprecated
        # nemory.ini exists in project_dir.
        self.config_file = self.get_config_file()

    def ensure_project_dir_valid(self) -> ProjectLayout:
        """Return the layout or raise ValueError naming the first failed check."""
        if not self.project_dir.is_dir():
            raise ValueError(f"The current project directory is not valid: {self.project_dir.resolve()}")

        if self.config_file is None:
            raise ValueError(
                f"The current project directory has not been initialised. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
            )

        if not self.is_src_valid():
            raise ValueError(
                f"The current project directory has not been initialised. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
            )

        return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)

    def validate(self) -> ProjectLayout | None:
        """Non-raising variant of ensure_project_dir_valid: None when invalid."""
        if self.config_file is not None and self.is_src_valid():
            return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)
        return None

    def is_src_valid(self) -> bool:
        return get_source_dir(self.project_dir).is_dir()

    # NOTE: this method shadows the module-level get_config_file() by name; the
    # call below still resolves to the module function (class scope is not a
    # closure for method bodies).
    def get_config_file(self) -> Path | None:
        """Prefer the deprecated config file (with a warning) over dce.ini; None when neither exists."""
        deprecated_config_file = get_deprecated_config_file(self.project_dir)
        if deprecated_config_file.is_file():
            logger.warning(
                f"{DEPRECATED_CONFIG_FILE_NAME} project config file is deprecated, please rename this file to {CONFIG_FILE_NAME}"
            )
            return deprecated_config_file.resolve()
        config_file = get_config_file(self.project_dir)
        if config_file.is_file():
            return config_file.resolve()
        return None
@@ -0,0 +1,32 @@
1
+ import configparser
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+
6
+ class ProjectConfig:
7
+ # Since we're supporting Python 3.10, we have to add a section
8
+ # configparser.UNNAMED_SECTION was only introduced in Python 3.13
9
+ _DEFAULT_SECTION = "DEFAULT"
10
+ _PROJECT_ID_PROPERTY_NAME = "project-id"
11
+
12
+ project_id: uuid.UUID
13
+
14
+ @staticmethod
15
+ def from_file(project_config_file: Path) -> "ProjectConfig":
16
+ with open(project_config_file, "r") as file_stream:
17
+ config = configparser.ConfigParser()
18
+ config.read_file(file_stream)
19
+
20
+ return ProjectConfig(
21
+ project_id=uuid.UUID(config[ProjectConfig._DEFAULT_SECTION][ProjectConfig._PROJECT_ID_PROPERTY_NAME])
22
+ )
23
+
24
+ def __init__(self, project_id: uuid.UUID | None = None):
25
+ self.project_id = project_id or uuid.uuid4()
26
+
27
+ def save(self, project_config_file: Path) -> None:
28
+ config = configparser.ConfigParser()
29
+ config[self._DEFAULT_SECTION][self._PROJECT_ID_PROPERTY_NAME] = str(self.project_id)
30
+
31
+ with open(project_config_file, "w") as file_stream:
32
+ config.write(file_stream)
@@ -0,0 +1,7 @@
1
+ type: postgres
2
+ connection:
3
+ # your database connection configurations
4
+ host: localhost
5
+ port: 5432
6
+ user: username
7
+ password: password
@@ -0,0 +1,30 @@
1
+ # Project Notes — September 2025
2
+
3
+ ## Overview
4
+
5
+ This project integrates database introspection, dbt metadata, and document embeddings to create a unified data context
6
+ engine.
7
+
8
+ ## Goals
9
+
10
+ - Enable users to search and reason about their data environment in natural language.
11
+ - Extend support to plain text and PDF documents to capture unstructured knowledge.
12
+ - Provide consistent output structure under the `/output/run-<timestamp>` folders.
13
+
14
+ ## Next Steps
15
+
16
+ 1. Finish the file ingestion subsystem (loader, service, exporter).
17
+ 2. Add PDF parsing and chunking support.
18
+ 3. Implement caching for large document embeddings.
19
+ 4. Evaluate chunking strategies for better recall during querying.
20
+
21
+ ## Open Questions
22
+
23
+ - Should we embed entire paragraphs or split by sentence boundaries?
24
+ - How do we handle private/sensitive data inside document chunks?
25
+ - Could the system use source control metadata (e.g., Git history) to enrich document context?
26
+
27
+ ## Summary
28
+
29
+ Once completed, the system will allow querying across databases, dbt models, and raw files — a single interface for
30
+ exploring data knowledge.
@@ -0,0 +1,20 @@
1
+ The Vision: To empower developers worldwide with a seamless AI‑powered collaborative code editor that blends real‑time collaboration with intelligent assistance, debugging, and predictive coding suggestions.
2
+ The project, dubbed "CoDev‑AI," is an open‑source, web‑based platform built on a microservices architecture that allows plug‑ins, language models, and cloud storage to interoperate efficiently.
3
+
4
+ At its core, the editor is built on the powerful CodeMirror engine, which offers a lightweight, feature‑rich API for syntax highlighting, bracket matching, and refactoring. Users can open a shared workspace, add collaborators, and start writing code together. The AI layer sits beneath the UI, listening to every keystroke, providing real‑time code completions based on a fine‑tuned GPT‑4 architecture, as well as contextual insights into project dependencies.
5
+
6
+ The architecture is designed to scale horizontally. Microservices such as the "CodeAssist" service, "CollabSync," and "RepoSync" run in separate containers, each exposing a RESTful API. The frontend communicates through a WebSocket channel for low‑latency updates, while the services process requests asynchronously to keep the UI fluid.
7
+
8
+ The data layer stores all changes in an event‑sourced database, enabling audit trails and replay of entire sessions. Each change is tagged with a unique ID, timestamp, and the user responsible. This approach guarantees that any rollback or conflict resolution is trivial.
9
+
10
+ The core AI capabilities are integrated via the "CodeAssist" microservice. It leverages a fine‑tuned transformer model trained on millions of lines of open‑source code, and supports multiple languages, including JavaScript, Python, Rust, Go, and TypeScript. The model offers features such as autocomplete, context‑aware refactoring, and automated unit‑test generation. Moreover, it can analyze the entire repository to spot deprecated APIs, suggest migration paths, and produce a compliance report.
11
+
12
+ The collaboration engine is built on CRDTs (Conflict‑Free Replicated Data Types), ensuring eventual consistency across all clients without locking mechanisms. Users can chat, annotate, and review code inline, all changes instantly visible to the entire team. A sophisticated permission system allows owners to define roles, read/write access, and integrate with existing identity providers like SAML and OAuth2.
13
+
14
+ Beyond the editor, the platform includes a build pipeline runner that automatically triggers when a merge request is opened. It compiles, runs tests, collects coverage data, and reports metrics to the UI. This feature encourages best practices by providing instant feedback on code quality.
15
+
16
+ Future roadmap highlights the addition of a visual debugger that can step through TypeScript transpilation, highlight stack traces, and enable live breakpoints. Additionally, the team is exploring a self‑hosted variant that ships as a single Docker image, appealing to enterprises with stricter compliance requirements.
17
+
18
+ The project is hosted on GitHub under a permissive MIT license. Contributions are welcomed in the form of pull requests for new features, bug fixes, or educational material. A dedicated discussion forum guides developers in onboarding, code best practices, and architectural design. The team maintains a comprehensive documentation portal featuring tutorials, sample projects, and a robust FAQ.
19
+
20
+ In summary, CoDev‑AI aspires to transform the software development landscape — bridging creativity with artificial intelligence, reshaping how developers collaborate, and enabling seamless collaboration across globally distributed teams.
@@ -0,0 +1,39 @@
1
+ from pathlib import Path
2
+
3
+ from databao_context_engine.project.layout import ProjectLayout, get_output_dir
4
+ from databao_context_engine.storage.connection import open_duckdb_connection
5
+ from databao_context_engine.storage.repositories.factories import create_run_repository
6
+ from databao_context_engine.storage.repositories.run_repository import RunRepository
7
+ from databao_context_engine.system.properties import get_db_path
8
+
9
+
10
def resolve_run_name(*, project_layout: ProjectLayout, run_name: str | None) -> str:
    """Resolve run_name against the runs recorded for this project (latest when None)."""
    project_id = str(project_layout.read_config_file().project_id)

    with open_duckdb_connection(get_db_path()) as conn:
        repo = create_run_repository(conn)
        return resolve_run_name_from_repo(run_repository=repo, project_id=project_id, run_name=run_name)
17
+
18
+
19
def resolve_run_name_from_repo(*, run_repository: RunRepository, project_id: str, run_name: str | None) -> str:
    """Return a run name known to exist for project_id.

    When run_name is None the project's latest run is used. Raises LookupError
    when nothing matches.
    """
    if run_name is not None:
        run = run_repository.get_by_run_name(project_id=project_id, run_name=run_name)
        if run is None:
            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
        return run.run_name

    latest = run_repository.get_latest_run_for_project(project_id=project_id)
    if latest is None:
        raise LookupError(f"No runs found for project '{project_id}'. Run a build first.")
    return latest.run_name
30
+
31
+
32
def get_run_dir(project_dir: Path, run_name: str) -> Path:
    """Return output/<run_name> for the project, raising ValueError when it doesn't exist."""
    candidate = get_output_dir(project_dir).joinpath(run_name)
    if candidate.is_dir():
        return candidate

    raise ValueError(
        f"The run with name {run_name} doesn't exist in the project. [project_dir: {project_dir.resolve()}]"
    )