databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. databao_context_engine/__init__.py +32 -7
  2. databao_context_engine/build_sources/__init__.py +4 -0
  3. databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
  4. databao_context_engine/build_sources/build_service.py +53 -0
  5. databao_context_engine/build_sources/build_wiring.py +82 -0
  6. databao_context_engine/build_sources/export_results.py +41 -0
  7. databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
  8. databao_context_engine/cli/add_datasource_config.py +49 -44
  9. databao_context_engine/cli/commands.py +40 -55
  10. databao_context_engine/cli/info.py +3 -2
  11. databao_context_engine/databao_context_engine.py +127 -0
  12. databao_context_engine/databao_context_project_manager.py +147 -30
  13. databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
  14. databao_context_engine/datasources/datasource_context.py +90 -0
  15. databao_context_engine/datasources/datasource_discovery.py +143 -0
  16. databao_context_engine/datasources/types.py +194 -0
  17. databao_context_engine/generate_configs_schemas.py +4 -5
  18. databao_context_engine/init_project.py +25 -3
  19. databao_context_engine/introspection/property_extract.py +76 -57
  20. databao_context_engine/llm/__init__.py +10 -0
  21. databao_context_engine/llm/api.py +57 -0
  22. databao_context_engine/llm/descriptions/ollama.py +1 -3
  23. databao_context_engine/llm/errors.py +2 -8
  24. databao_context_engine/llm/factory.py +5 -2
  25. databao_context_engine/llm/install.py +26 -30
  26. databao_context_engine/llm/runtime.py +3 -5
  27. databao_context_engine/llm/service.py +1 -3
  28. databao_context_engine/mcp/mcp_runner.py +4 -2
  29. databao_context_engine/mcp/mcp_server.py +9 -11
  30. databao_context_engine/plugin_loader.py +110 -0
  31. databao_context_engine/pluginlib/build_plugin.py +12 -29
  32. databao_context_engine/pluginlib/config.py +16 -2
  33. databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
  34. databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
  35. databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
  36. databao_context_engine/plugins/databases/base_introspector.py +11 -12
  37. databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
  38. databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
  39. databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
  40. databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
  41. databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
  42. databao_context_engine/plugins/databases/introspection_scope.py +11 -9
  43. databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
  44. databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
  45. databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
  46. databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
  47. databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
  48. databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
  49. databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
  50. databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
  51. databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
  52. databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
  53. databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
  54. databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
  55. databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
  56. databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
  57. databao_context_engine/plugins/duckdb_tools.py +18 -0
  58. databao_context_engine/plugins/files/__init__.py +0 -0
  59. databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
  60. databao_context_engine/plugins/plugin_loader.py +58 -52
  61. databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
  62. databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
  63. databao_context_engine/project/info.py +34 -2
  64. databao_context_engine/project/init_project.py +16 -7
  65. databao_context_engine/project/layout.py +14 -15
  66. databao_context_engine/retrieve_embeddings/__init__.py +3 -0
  67. databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
  68. databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
  69. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
  70. databao_context_engine/serialization/__init__.py +0 -0
  71. databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
  72. databao_context_engine/services/chunk_embedding_service.py +23 -11
  73. databao_context_engine/services/factories.py +1 -46
  74. databao_context_engine/services/persistence_service.py +11 -11
  75. databao_context_engine/storage/connection.py +11 -7
  76. databao_context_engine/storage/exceptions/exceptions.py +2 -2
  77. databao_context_engine/storage/migrate.py +3 -5
  78. databao_context_engine/storage/migrations/V01__init.sql +6 -31
  79. databao_context_engine/storage/models.py +2 -23
  80. databao_context_engine/storage/repositories/chunk_repository.py +16 -12
  81. databao_context_engine/storage/repositories/factories.py +1 -12
  82. databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
  83. databao_context_engine/system/properties.py +4 -2
  84. databao_context_engine-0.1.5.dist-info/METADATA +228 -0
  85. databao_context_engine-0.1.5.dist-info/RECORD +135 -0
  86. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
  87. databao_context_engine/build_sources/internal/build_service.py +0 -77
  88. databao_context_engine/build_sources/internal/build_wiring.py +0 -52
  89. databao_context_engine/build_sources/internal/export_results.py +0 -43
  90. databao_context_engine/build_sources/public/api.py +0 -4
  91. databao_context_engine/databao_engine.py +0 -85
  92. databao_context_engine/datasource_config/add_config.py +0 -50
  93. databao_context_engine/datasource_config/datasource_context.py +0 -60
  94. databao_context_engine/mcp/all_results_tool.py +0 -5
  95. databao_context_engine/mcp/retrieve_tool.py +0 -22
  96. databao_context_engine/plugins/databases/athena_introspector.py +0 -101
  97. databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
  98. databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
  99. databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
  100. databao_context_engine/project/datasource_discovery.py +0 -141
  101. databao_context_engine/project/runs.py +0 -39
  102. databao_context_engine/project/types.py +0 -134
  103. databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
  104. databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
  105. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
  106. databao_context_engine/retrieve_embeddings/public/api.py +0 -3
  107. databao_context_engine/services/run_name_policy.py +0 -8
  108. databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
  109. databao_context_engine/storage/repositories/run_repository.py +0 -157
  110. databao_context_engine-0.1.1.dist-info/METADATA +0 -186
  111. databao_context_engine-0.1.1.dist-info/RECORD +0 -135
  112. /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
  113. /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
  114. /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
  115. /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
  116. /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
  117. /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
  118. {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0
databao_context_engine/project/layout.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 from pathlib import Path
 
 from databao_context_engine.project.project_config import ProjectConfig
-from databao_context_engine.project.types import DatasourceId
 
 SOURCE_FOLDER_NAME = "src"
 OUTPUT_FOLDER_NAME = "output"
@@ -25,6 +24,14 @@ class ProjectLayout:
     def read_config_file(self) -> ProjectConfig:
         return ProjectConfig.from_file(self.config_file)
 
+    @property
+    def src_dir(self) -> Path:
+        return get_source_dir(self.project_dir)
+
+    @property
+    def output_dir(self) -> Path:
+        return get_output_dir(self.project_dir)
+
 
 def ensure_project_dir(project_dir: Path) -> ProjectLayout:
     return _ProjectValidator(project_dir).ensure_project_dir_valid()
@@ -62,22 +69,14 @@ def get_logs_dir(project_dir: Path) -> Path:
     return project_dir.joinpath(LOGS_FOLDER_NAME)
 
 
-def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
-
-    if config_file.is_file():
-        raise ValueError(f"A config file already exists for {str(datasource_id)}")
-
-    return config_file
-
-
 def create_datasource_config_file(
-    project_dir: Path, datasource_id: DatasourceId, config_content: str, overwrite_existing: bool
+    project_layout: ProjectLayout, datasource_relative_name: str, config_content: str, overwrite_existing: bool
 ) -> Path:
+    config_file = project_layout.src_dir / datasource_relative_name
     if not overwrite_existing:
-        ensure_datasource_config_file_doesnt_exist(project_dir, datasource_id)
+        if config_file.is_file():
+            raise ValueError(f"A config file already exists {config_file}")
 
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
     config_file.parent.mkdir(parents=True, exist_ok=True)
 
     config_file.write_text(config_content)
@@ -96,12 +95,12 @@ class _ProjectValidator:
 
         if self.config_file is None:
             raise ValueError(
-                f"The current project directory has not been initialised. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
+                f"The current project directory has not been initialized. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
            )
 
         if not self.is_src_valid():
             raise ValueError(
-                f"The current project directory has not been initialised. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
+                f"The current project directory has not been initialized. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
             )
 
         return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)
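The reworked create_datasource_config_file is driven from a validated ProjectLayout and a path relative to src/ instead of the old (project_dir, DatasourceId) pair. A minimal usage sketch; the project path, file name, and YAML content below are hypothetical, not taken from the package:

    from pathlib import Path

    from databao_context_engine.project.layout import create_datasource_config_file, ensure_project_dir

    # Hypothetical project directory; ensure_project_dir validates that the
    # config file and the src/ directory exist before returning a layout.
    layout = ensure_project_dir(Path("./my_project"))

    # Writes src/my_postgres.yaml, raising ValueError if the file already exists.
    create_datasource_config_file(layout, "my_postgres.yaml", "type: postgresql\n", overwrite_existing=False)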
databao_context_engine/retrieve_embeddings/__init__.py
@@ -0,0 +1,3 @@
+from databao_context_engine.retrieve_embeddings.retrieve_wiring import retrieve_embeddings
+
+__all__ = ["retrieve_embeddings"]
databao_context_engine/retrieve_embeddings/retrieve_runner.py
@@ -0,0 +1,17 @@
+import logging
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+
+logger = logging.getLogger(__name__)
+
+
+def retrieve(
+    *,
+    retrieve_service: RetrieveService,
+    text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None = None,
+) -> list[VectorSearchResult]:
+    return retrieve_service.retrieve(text=text, limit=limit, datasource_ids=datasource_ids)
databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py}
@@ -1,10 +1,9 @@
 import logging
 from collections.abc import Sequence
 
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.project.runs import resolve_run_name_from_repo
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
-from databao_context_engine.storage.repositories.run_repository import RunRepository
 from databao_context_engine.storage.repositories.vector_search_repository import (
     VectorSearchRepository,
     VectorSearchResult,
@@ -17,52 +16,46 @@ class RetrieveService:
     def __init__(
         self,
         *,
-        run_repo: RunRepository,
         vector_search_repo: VectorSearchRepository,
         shard_resolver: EmbeddingShardResolver,
         provider: EmbeddingProvider,
     ):
-        self._run_repo = run_repo
         self._shard_resolver = shard_resolver
         self._provider = provider
         self._vector_search_repo = vector_search_repo
 
     def retrieve(
-        self, *, project_id: str, text: str, run_name: str, limit: int | None = None
+        self, *, text: str, limit: int | None = None, datasource_ids: list[DatasourceId] | None = None
     ) -> list[VectorSearchResult]:
         if limit is None:
             limit = 10
 
-        run = self._run_repo.get_by_run_name(project_id=project_id, run_name=run_name)
-        if run is None:
-            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
-
         table_name, dimension = self._shard_resolver.resolve(
             embedder=self._provider.embedder, model_id=self._provider.model_id
         )
 
         retrieve_vec: Sequence[float] = self._provider.embed(text)
 
-        logger.debug(f"Retrieving display texts for run {run.run_id} in table {table_name}")
+        logger.debug(f"Retrieving display texts in table {table_name}")
 
         search_results = self._vector_search_repo.get_display_texts_by_similarity(
             table_name=table_name,
-            run_id=run.run_id,
             retrieve_vec=retrieve_vec,
             dimension=dimension,
             limit=limit,
+            datasource_ids=datasource_ids,
         )
 
-        logger.debug(f"Retrieved {len(search_results)} display texts for run {run.run_id} in table {table_name}")
+        logger.debug(f"Retrieved {len(search_results)} display texts in table {table_name}")
 
         if logger.isEnabledFor(logging.DEBUG):
-            closest_result = min(search_results, key=lambda result: result.cosine_distance)
-            logger.debug(f"Best result: ({closest_result.cosine_distance}, {closest_result.embeddable_text})")
+            if search_results:
+                closest_result = min(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Best result: ({closest_result.cosine_distance}, {closest_result.embeddable_text})")
 
-            farthest_result = max(search_results, key=lambda result: result.cosine_distance)
-            logger.debug(f"Worst result: ({farthest_result.cosine_distance}, {farthest_result.embeddable_text})")
+                farthest_result = max(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Worst result: ({farthest_result.cosine_distance}, {farthest_result.embeddable_text})")
+            else:
+                logger.debug("No results found")
 
         return search_results
-
-    def resolve_run_name(self, *, project_id: str, run_name: str | None) -> str:
-        return resolve_run_name_from_repo(run_repository=self._run_repo, project_id=project_id, run_name=run_name)
databao_context_engine/retrieve_embeddings/retrieve_wiring.py
@@ -0,0 +1,46 @@
+from duckdb import DuckDBPyConnection
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
+from databao_context_engine.llm.factory import create_ollama_embedding_provider, create_ollama_service
+from databao_context_engine.project.layout import ProjectLayout
+from databao_context_engine.retrieve_embeddings.retrieve_runner import retrieve
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.services.factories import create_shard_resolver
+from databao_context_engine.storage.connection import open_duckdb_connection
+from databao_context_engine.storage.repositories.factories import create_vector_search_repository
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+from databao_context_engine.system.properties import get_db_path
+
+
+def retrieve_embeddings(
+    project_layout: ProjectLayout,
+    retrieve_text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None,
+) -> list[VectorSearchResult]:
+    with open_duckdb_connection(get_db_path(project_layout.project_dir)) as conn:
+        ollama_service = create_ollama_service()
+        embedding_provider = create_ollama_embedding_provider(ollama_service)
+        retrieve_service = _create_retrieve_service(conn, embedding_provider=embedding_provider)
+        return retrieve(
+            retrieve_service=retrieve_service,
+            text=retrieve_text,
+            limit=limit,
+            datasource_ids=datasource_ids,
+        )
+
+
+def _create_retrieve_service(
+    conn: DuckDBPyConnection,
+    *,
+    embedding_provider: EmbeddingProvider,
+) -> RetrieveService:
+    vector_search_repo = create_vector_search_repository(conn)
+    shard_resolver = create_shard_resolver(conn)
+
+    return RetrieveService(
+        vector_search_repo=vector_search_repo,
+        shard_resolver=shard_resolver,
+        provider=embedding_provider,
+    )
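With the run tables gone, retrieval in 0.1.5 needs only a project layout, the query text, and an optional datasource filter. A minimal end-to-end sketch; the project path and query are hypothetical, and it assumes a local Ollama with the default embedding model, since the wiring calls create_ollama_service():

    from pathlib import Path

    from databao_context_engine.project.layout import ensure_project_dir
    from databao_context_engine.retrieve_embeddings import retrieve_embeddings

    layout = ensure_project_dir(Path("./my_project"))  # hypothetical project directory

    # datasource_ids=None searches all datasources; limit=None falls back to 10 inside RetrieveService.
    results = retrieve_embeddings(layout, "tables with customer revenue", limit=5, datasource_ids=None)
    for result in results:
        print(result.cosine_distance, result.embeddable_text)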
databao_context_engine/serialization/__init__.py
File without changes
databao_context_engine/{serialisation → serialization}/yaml.py
@@ -7,17 +7,17 @@ from yaml import Node, SafeDumper
 def default_representer(dumper: SafeDumper, data: object) -> Node:
     if isinstance(data, Mapping):
         return dumper.represent_dict(data)
-    elif hasattr(data, "__dict__"):
-        # Doesn't serialise "private" attributes (that starts with an _)
+    if hasattr(data, "__dict__"):
+        # Doesn't serialize "private" attributes (that starts with an _)
         data_public_attributes = {key: value for key, value in data.__dict__.items() if not key.startswith("_")}
         if data_public_attributes:
             return dumper.represent_dict(data_public_attributes)
-        else:
-            # If there is no public attributes, we default to the string representation
-            return dumper.represent_str(str(data))
-    else:
+
+        # If there is no public attributes, we default to the string representation
         return dumper.represent_str(str(data))
 
+    return dumper.represent_str(str(data))
+
 
 # Registers our default representer only once, when that file is imported
 yaml.add_multi_representer(object, default_representer, Dumper=SafeDumper)
databao_context_engine/services/chunk_embedding_service.py
@@ -5,7 +5,7 @@ from typing import cast
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
 from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
-from databao_context_engine.serialisation.yaml import to_yaml_string
+from databao_context_engine.serialization.yaml import to_yaml_string
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.models import ChunkEmbedding
 from databao_context_engine.services.persistence_service import PersistenceService
@@ -14,9 +14,22 @@ logger = logging.getLogger(__name__)
 
 
 class ChunkEmbeddingMode(Enum):
+    """Mode controlling how chunks are embedded."""
+
     EMBEDDABLE_TEXT_ONLY = "EMBEDDABLE_TEXT_ONLY"
+    """
+    The embedding is generated only from the string defined by the plugin as embeddable for a chunk.
+    """
+
     GENERATED_DESCRIPTION_ONLY = "GENERATED_DESCRIPTION_ONLY"
+    """
+    The embedding is generated only from a description of the chunk generated by a LLM.
+    """
+
     EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION = "EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION"
+    """
+    The embedding is generated from both the embeddable string of the chunk and the description of the chunk generated by a LLM.
+    """
 
     def should_generate_description(self) -> bool:
         return self in (
@@ -44,26 +57,24 @@ class ChunkEmbeddingService:
         if self._chunk_embedding_mode.should_generate_description() and description_provider is None:
             raise ValueError("A DescriptionProvider must be provided when generating descriptions")
 
-    def embed_chunks(self, *, datasource_run_id: int, chunks: list[EmbeddableChunk], result: str) -> None:
-        """
-        Turn plugin chunks into persisted chunks and embeddings
+    def embed_chunks(self, *, chunks: list[EmbeddableChunk], result: str, full_type: str, datasource_id: str) -> None:
+        """Turn plugin chunks into persisted chunks and embeddings.
 
         Flow:
-        1) Embed each chunk into an embedded vector
-        2) Get or create embedding table for the appropriate model and embedding dimensions
-        3) Persist chunks and embeddings vectors in a single transaction
+        1) Embed each chunk into an embedded vector.
+        2) Get or create embedding table for the appropriate model and embedding dimensions.
+        3) Persist chunks and embeddings vectors in a single transaction.
         """
-
         if not chunks:
             return
 
         logger.debug(
-            f"Embedding {len(chunks)} chunks for datasource run {datasource_run_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
+            f"Embedding {len(chunks)} chunks for datasource {datasource_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
         )
 
         enriched_embeddings: list[ChunkEmbedding] = []
         for chunk in chunks:
-            chunk_display_text = to_yaml_string(chunk.content)
+            chunk_display_text = chunk.content if isinstance(chunk.content, str) else to_yaml_string(chunk.content)
 
             generated_description = ""
             match self._chunk_embedding_mode:
@@ -98,7 +109,8 @@
             )
 
         self._persistence_service.write_chunks_and_embeddings(
-            datasource_run_id=datasource_run_id,
             chunk_embeddings=enriched_embeddings,
             table_name=table_name,
+            full_type=full_type,
+            datasource_id=datasource_id,
         )
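For callers, the visible change is that embed_chunks is now keyed by full_type and datasource_id rather than a datasource_run_id. A hypothetical call site; the service wiring, chunk list, and argument values are illustrative only:

    # service is an already-wired ChunkEmbeddingService; plugin_chunks and
    # result_yaml would come from a plugin build. All values are illustrative.
    service.embed_chunks(
        chunks=plugin_chunks,
        result=result_yaml,
        full_type="databases/postgresql",
        datasource_id="my_postgres",
    )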
databao_context_engine/services/factories.py
@@ -1,20 +1,15 @@
-from _duckdb import DuckDBPyConnection
+from duckdb import DuckDBPyConnection
 
-from databao_context_engine.build_sources.internal.build_service import BuildService
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.retrieve_embeddings.internal.retrieve_service import RetrieveService
 from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode, ChunkEmbeddingService
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.persistence_service import PersistenceService
 from databao_context_engine.services.table_name_policy import TableNamePolicy
 from databao_context_engine.storage.repositories.factories import (
     create_chunk_repository,
-    create_datasource_run_repository,
     create_embedding_repository,
     create_registry_repository,
-    create_run_repository,
-    create_vector_search_repository,
 )
 
 
@@ -46,43 +41,3 @@ def create_chunk_embedding_service(
         description_provider=description_provider,
         chunk_embedding_mode=chunk_embedding_mode,
     )
-
-
-def create_build_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-    description_provider: DescriptionProvider | None,
-    chunk_embedding_mode: ChunkEmbeddingMode,
-) -> BuildService:
-    run_repo = create_run_repository(conn)
-    datasource_run_repo = create_datasource_run_repository(conn)
-    chunk_embedding_service = create_chunk_embedding_service(
-        conn,
-        embedding_provider=embedding_provider,
-        description_provider=description_provider,
-        chunk_embedding_mode=chunk_embedding_mode,
-    )
-
-    return BuildService(
-        run_repo=run_repo,
-        datasource_run_repo=datasource_run_repo,
-        chunk_embedding_service=chunk_embedding_service,
-    )
-
-
-def create_retrieve_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-) -> RetrieveService:
-    run_repo = create_run_repository(conn)
-    vector_search_repo = create_vector_search_repository(conn)
-    shard_resolver = create_shard_resolver(conn)
-
-    return RetrieveService(
-        run_repo=run_repo,
-        vector_search_repo=vector_search_repo,
-        shard_resolver=shard_resolver,
-        provider=embedding_provider,
-    )
databao_context_engine/services/persistence_service.py
@@ -24,11 +24,13 @@ class PersistenceService:
         self._dim = dim
 
     def write_chunks_and_embeddings(
-        self, *, datasource_run_id: int, chunk_embeddings: list[ChunkEmbedding], table_name: str
+        self, *, chunk_embeddings: list[ChunkEmbedding], table_name: str, full_type: str, datasource_id: str
     ):
-        """
-        Atomically persist chunks and their vectors.
-        Returns the number of embeddings written.
+        """Atomically persist chunks and their vectors.
+
+        Raises:
+            ValueError: If chunk_embeddings is an empty list.
+
         """
         if not chunk_embeddings:
             raise ValueError("chunk_embeddings must be a non-empty list")
@@ -36,21 +38,19 @@ class PersistenceService:
         with transaction(self._conn):
             for chunk_embedding in chunk_embeddings:
                 chunk_dto = self.create_chunk(
-                    datasource_run_id=datasource_run_id,
+                    full_type=full_type,
+                    datasource_id=datasource_id,
                     embeddable_text=chunk_embedding.chunk.embeddable_text,
                     display_text=chunk_embedding.display_text,
-                    generated_description=chunk_embedding.generated_description,
                 )
                 self.create_embedding(table_name=table_name, chunk_id=chunk_dto.chunk_id, vec=chunk_embedding.vec)
 
-    def create_chunk(
-        self, *, datasource_run_id: int, embeddable_text: str, display_text: str, generated_description: str
-    ) -> ChunkDTO:
+    def create_chunk(self, *, full_type: str, datasource_id: str, embeddable_text: str, display_text: str) -> ChunkDTO:
         return self._chunk_repo.create(
-            datasource_run_id=datasource_run_id,
+            full_type=full_type,
+            datasource_id=datasource_id,
            embeddable_text=embeddable_text,
            display_text=display_text,
-            generated_description=generated_description,
         )
 
     def create_embedding(self, *, table_name: str, chunk_id: int, vec: Sequence[float]):
databao_context_engine/storage/connection.py
@@ -1,24 +1,28 @@
 import logging
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Iterator
 
 import duckdb
-
-from databao_context_engine.system.properties import get_db_path
+from duckdb import DuckDBPyConnection
 
 logger = logging.getLogger(__name__)
 
 
 @contextmanager
-def open_duckdb_connection(db_path: str | Path | None = None):
-    """
-    Open a DuckDB connection with vector search enabled and close on exist.
-    Loads the vss extension and enables HNSW experimental persistence.
+def open_duckdb_connection(db_path: str | Path) -> Iterator[DuckDBPyConnection]:
+    """Open a DuckDB connection with vector search enabled and close on exist.
+
+    It also loads the vss extension and enables HNSW experimental persistence on the DuckDB.
 
     Usage:
         with open_duckdb_connection() as conn:
+
+    Yields:
+        The opened DuckDB connection.
+
     """
-    path = str(db_path or get_db_path())
+    path = str(db_path)
     conn = duckdb.connect(path)
     logger.debug(f"Connected to DuckDB database at {path}")
 
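Because db_path is no longer optional, each caller now resolves the database location itself; the wiring modules pass get_db_path(project_layout.project_dir). A minimal sketch with a hypothetical path:

    from pathlib import Path

    from databao_context_engine.storage.connection import open_duckdb_connection

    # Hypothetical database file; in the engine the real path comes from get_db_path().
    with open_duckdb_connection(Path("./my_project/output/databao.duckdb")) as conn:
        print(conn.execute("SELECT count(*) FROM chunk").fetchone())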
databao_context_engine/storage/exceptions/exceptions.py
@@ -1,6 +1,6 @@
 class RepositoryError(Exception):
-    """Base exception for repository errors"""
+    """Base exception for repository errors."""
 
 
 class IntegrityError(RepositoryError):
-    """Raised when a DB constraint is violated"""
+    """Raised when a DB constraint is violated."""
databao_context_engine/storage/migrate.py
@@ -8,12 +8,10 @@ from typing import LiteralString
 
 import duckdb
 
-from databao_context_engine.system.properties import get_db_path
-
 logger = logging.getLogger(__name__)
 
 
-def migrate(db_path: str | Path | None = None, migration_files: list[Path] | None = None) -> None:
+def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
     if migration_files is None:
         migration_files = [
             migration
@@ -21,7 +19,7 @@ def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
             if isinstance(migration, Path) and ".sql" == migration.suffix
         ]
 
-    db = Path(db_path or get_db_path()).expanduser().resolve()
+    db = Path(db_path).expanduser().resolve()
     db.parent.mkdir(parents=True, exist_ok=True)
     logger.debug("Running migrations on database: %s", db)
 
@@ -71,7 +69,7 @@ class _Migration:
 def _create_migration(file: Path) -> _Migration:
     query_bytes = file.read_bytes()
     query = query_bytes.decode("utf-8")
-    checksum = hashlib.md5(query_bytes).hexdigest()
+    checksum = hashlib.md5(query_bytes, usedforsecurity=False).hexdigest()
     version = _extract_version_from_name(file.name)
     return _Migration(name=file.name, version=version, checksum=checksum, query=query)
 
databao_context_engine/storage/migrations/V01__init.sql
@@ -2,36 +2,15 @@ INSTALL vss;
 LOAD vss;
 SET hnsw_enable_experimental_persistence = true;
 
-CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
-CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
 CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;
 
-CREATE TABLE IF NOT EXISTS run (
-    run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
-    project_id TEXT NOT NULL,
-    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    ended_at TIMESTAMP,
-    nemory_version TEXT,
-    run_name TEXT NOT NULL,
-);
-
-CREATE TABLE IF NOT EXISTS datasource_run (
-    datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
-    run_id BIGINT NOT NULL REFERENCES run(run_id),
-    plugin TEXT NOT NULL,
-    source_id TEXT NOT NULL,
-    storage_directory TEXT NOT NULL,
-    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    full_type TEXT NOT NULL,
-);
-
 CREATE TABLE IF NOT EXISTS chunk (
-    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
-    datasource_run_id BIGINT NOT NULL REFERENCES datasource_run(datasource_run_id),
-    embeddable_text TEXT NOT NULL,
-    display_text TEXT,
-    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    generated_description TEXT,
+    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
+    full_type TEXT NOT NULL,
+    datasource_id TEXT NOT NULL,
+    embeddable_text TEXT NOT NULL,
+    display_text TEXT,
+    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
 
 CREATE TABLE IF NOT EXISTS embedding_model_registry (
@@ -57,7 +36,3 @@ OR IGNORE INTO
     embedding_model_registry(embedder, model_id, dim, table_name)
 VALUES
     ('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');
-
-CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
-CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
-CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);
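With run and datasource_run dropped, each chunk row is self-describing, so lookups filter directly on the new columns. A hypothetical query against the 0.1.5 schema; the identifier values are illustrative:

    # conn is an open DuckDB connection to the project database.
    rows = conn.execute(
        """
        SELECT chunk_id, embeddable_text
        FROM chunk
        WHERE datasource_id = ? AND full_type = ?
        """,
        ["my_postgres", "databases/postgresql"],
    ).fetchall()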
databao_context_engine/storage/models.py
@@ -4,34 +4,13 @@ from datetime import datetime
 from typing import Optional
 
 
-@dataclass(frozen=True)
-class RunDTO:
-    run_id: int
-    run_name: str
-    project_id: str
-    started_at: datetime
-    ended_at: Optional[datetime]
-    nemory_version: str
-
-
-@dataclass(frozen=True)
-class DatasourceRunDTO:
-    datasource_run_id: int
-    run_id: int
-    plugin: str
-    full_type: str
-    source_id: str
-    storage_directory: str
-    created_at: datetime
-
-
 @dataclass(frozen=True)
 class ChunkDTO:
     chunk_id: int
-    datasource_run_id: int
+    full_type: str
+    datasource_id: str
     embeddable_text: str
     display_text: Optional[str]
-    generated_description: str
     created_at: datetime
 
 
databao_context_engine/storage/repositories/chunk_repository.py
@@ -14,22 +14,22 @@ class ChunkRepository:
     def create(
         self,
         *,
-        datasource_run_id: int,
+        full_type: str,
+        datasource_id: str,
         embeddable_text: str,
         display_text: Optional[str],
-        generated_description: str,
     ) -> ChunkDTO:
         try:
             row = self._conn.execute(
                 """
                 INSERT INTO
-                    chunk(datasource_run_id, embeddable_text, display_text, generated_description)
+                    chunk(full_type, datasource_id, embeddable_text, display_text)
                 VALUES
                     (?, ?, ?, ?)
                 RETURNING
                     *
                 """,
-                [datasource_run_id, embeddable_text, display_text, generated_description],
+                [full_type, datasource_id, embeddable_text, display_text],
             ).fetchone()
             if row is None:
                 raise RuntimeError("chunk creation returned no object")
@@ -55,22 +55,26 @@
         self,
         chunk_id: int,
         *,
+        full_type: Optional[str] = None,
+        datasource_id: Optional[str] = None,
         embeddable_text: Optional[str] = None,
         display_text: Optional[str] = None,
-        generated_description: Optional[str] = None,
     ) -> Optional[ChunkDTO]:
         sets: list[Any] = []
         params: list[Any] = []
 
+        if full_type is not None:
+            sets.append("full_type = ?")
+            params.append(full_type)
+        if datasource_id is not None:
+            sets.append("datasource_id = ?")
+            params.append(datasource_id)
         if embeddable_text is not None:
             sets.append("embeddable_text = ?")
             params.append(embeddable_text)
         if display_text is not None:
             sets.append("display_text = ?")
             params.append(display_text)
-        if generated_description is not None:
-            sets.append("generated_description = ?")
-            params.append(generated_description)
 
         if not sets:
             return self.get(chunk_id)
@@ -119,12 +123,12 @@
 
     @staticmethod
     def _row_to_dto(row: Tuple) -> ChunkDTO:
-        chunk_id, datasource_run_id, embeddable_text, display_text, created_at, generated_description = row
+        chunk_id, full_type, datasource_id, embeddable_text, display_text, created_at = row
         return ChunkDTO(
             chunk_id=int(chunk_id),
-            datasource_run_id=int(datasource_run_id),
-            embeddable_text=str(embeddable_text),
+            full_type=full_type,
+            datasource_id=datasource_id,
+            embeddable_text=embeddable_text,
             display_text=display_text,
-            generated_description=str(generated_description),
             created_at=created_at,
         )