databao-context-engine 0.1.2__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. databao_context_engine/__init__.py +18 -6
  2. databao_context_engine/build_sources/__init__.py +4 -0
  3. databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +27 -23
  4. databao_context_engine/build_sources/build_service.py +53 -0
  5. databao_context_engine/build_sources/build_wiring.py +84 -0
  6. databao_context_engine/build_sources/export_results.py +41 -0
  7. databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +3 -7
  8. databao_context_engine/cli/add_datasource_config.py +41 -15
  9. databao_context_engine/cli/commands.py +12 -43
  10. databao_context_engine/cli/info.py +3 -2
  11. databao_context_engine/databao_context_engine.py +137 -0
  12. databao_context_engine/databao_context_project_manager.py +96 -6
  13. databao_context_engine/datasources/add_config.py +34 -0
  14. databao_context_engine/{datasource_config → datasources}/check_config.py +18 -7
  15. databao_context_engine/datasources/datasource_context.py +93 -0
  16. databao_context_engine/{project → datasources}/datasource_discovery.py +17 -16
  17. databao_context_engine/{project → datasources}/types.py +64 -15
  18. databao_context_engine/init_project.py +25 -3
  19. databao_context_engine/introspection/property_extract.py +67 -53
  20. databao_context_engine/llm/errors.py +2 -8
  21. databao_context_engine/llm/install.py +13 -20
  22. databao_context_engine/llm/service.py +1 -3
  23. databao_context_engine/mcp/mcp_runner.py +4 -2
  24. databao_context_engine/mcp/mcp_server.py +10 -10
  25. databao_context_engine/plugin_loader.py +111 -0
  26. databao_context_engine/pluginlib/build_plugin.py +25 -9
  27. databao_context_engine/pluginlib/config.py +16 -2
  28. databao_context_engine/plugins/base_db_plugin.py +5 -2
  29. databao_context_engine/plugins/databases/athena_introspector.py +85 -22
  30. databao_context_engine/plugins/databases/base_introspector.py +5 -3
  31. databao_context_engine/plugins/databases/clickhouse_introspector.py +22 -11
  32. databao_context_engine/plugins/databases/duckdb_introspector.py +3 -5
  33. databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
  34. databao_context_engine/plugins/databases/introspection_scope.py +11 -9
  35. databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
  36. databao_context_engine/plugins/databases/mssql_introspector.py +26 -17
  37. databao_context_engine/plugins/databases/mysql_introspector.py +23 -12
  38. databao_context_engine/plugins/databases/postgresql_introspector.py +2 -2
  39. databao_context_engine/plugins/databases/snowflake_introspector.py +43 -10
  40. databao_context_engine/plugins/duckdb_tools.py +18 -0
  41. databao_context_engine/plugins/plugin_loader.py +43 -42
  42. databao_context_engine/plugins/resources/parquet_introspector.py +7 -19
  43. databao_context_engine/project/info.py +34 -2
  44. databao_context_engine/project/init_project.py +16 -7
  45. databao_context_engine/project/layout.py +3 -3
  46. databao_context_engine/retrieve_embeddings/__init__.py +3 -0
  47. databao_context_engine/retrieve_embeddings/{internal/export_results.py → export_results.py} +2 -2
  48. databao_context_engine/retrieve_embeddings/{internal/retrieve_runner.py → retrieve_runner.py} +5 -9
  49. databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +3 -17
  50. databao_context_engine/retrieve_embeddings/retrieve_wiring.py +49 -0
  51. databao_context_engine/{serialisation → serialization}/yaml.py +1 -1
  52. databao_context_engine/services/chunk_embedding_service.py +23 -11
  53. databao_context_engine/services/factories.py +1 -46
  54. databao_context_engine/services/persistence_service.py +11 -11
  55. databao_context_engine/storage/connection.py +11 -7
  56. databao_context_engine/storage/exceptions/exceptions.py +2 -2
  57. databao_context_engine/storage/migrate.py +2 -4
  58. databao_context_engine/storage/migrations/V01__init.sql +6 -31
  59. databao_context_engine/storage/models.py +2 -23
  60. databao_context_engine/storage/repositories/chunk_repository.py +16 -12
  61. databao_context_engine/storage/repositories/factories.py +1 -12
  62. databao_context_engine/storage/repositories/vector_search_repository.py +8 -13
  63. databao_context_engine/system/properties.py +4 -2
  64. databao_context_engine-0.1.4.dev1.dist-info/METADATA +75 -0
  65. databao_context_engine-0.1.4.dev1.dist-info/RECORD +125 -0
  66. {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/WHEEL +1 -1
  67. databao_context_engine/build_sources/internal/build_service.py +0 -77
  68. databao_context_engine/build_sources/internal/build_wiring.py +0 -52
  69. databao_context_engine/build_sources/internal/export_results.py +0 -43
  70. databao_context_engine/build_sources/public/api.py +0 -4
  71. databao_context_engine/databao_engine.py +0 -85
  72. databao_context_engine/datasource_config/__init__.py +0 -0
  73. databao_context_engine/datasource_config/add_config.py +0 -50
  74. databao_context_engine/datasource_config/datasource_context.py +0 -60
  75. databao_context_engine/mcp/all_results_tool.py +0 -5
  76. databao_context_engine/mcp/retrieve_tool.py +0 -22
  77. databao_context_engine/project/runs.py +0 -39
  78. databao_context_engine/retrieve_embeddings/internal/__init__.py +0 -0
  79. databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
  80. databao_context_engine/retrieve_embeddings/public/__init__.py +0 -0
  81. databao_context_engine/retrieve_embeddings/public/api.py +0 -3
  82. databao_context_engine/serialisation/__init__.py +0 -0
  83. databao_context_engine/services/run_name_policy.py +0 -8
  84. databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
  85. databao_context_engine/storage/repositories/run_repository.py +0 -157
  86. databao_context_engine-0.1.2.dist-info/METADATA +0 -187
  87. databao_context_engine-0.1.2.dist-info/RECORD +0 -135
  88. /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
  89. /databao_context_engine/{build_sources/public → serialization}/__init__.py +0 -0
  90. {databao_context_engine-0.1.2.dist-info → databao_context_engine-0.1.4.dev1.dist-info}/entry_points.txt +0 -0
@@ -5,7 +5,7 @@ from typing import cast
5
5
  from databao_context_engine.llm.descriptions.provider import DescriptionProvider
6
6
  from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
7
7
  from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
8
- from databao_context_engine.serialisation.yaml import to_yaml_string
8
+ from databao_context_engine.serialization.yaml import to_yaml_string
9
9
  from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
10
10
  from databao_context_engine.services.models import ChunkEmbedding
11
11
  from databao_context_engine.services.persistence_service import PersistenceService
@@ -14,9 +14,22 @@ logger = logging.getLogger(__name__)
14
14
 
15
15
 
16
16
  class ChunkEmbeddingMode(Enum):
17
+ """Mode controlling how chunks are embedded."""
18
+
17
19
  EMBEDDABLE_TEXT_ONLY = "EMBEDDABLE_TEXT_ONLY"
20
+ """
21
+ The embedding is generated only from the string defined by the plugin as embeddable for a chunk.
22
+ """
23
+
18
24
  GENERATED_DESCRIPTION_ONLY = "GENERATED_DESCRIPTION_ONLY"
25
+ """
26
+ The embedding is generated only from a description of the chunk generated by an LLM.
27
+ """
28
+
19
29
  EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION = "EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION"
30
+ """
31
+ The embedding is generated from both the embeddable string of the chunk and the description of the chunk generated by an LLM.
32
+ """
20
33
 
21
34
  def should_generate_description(self) -> bool:
22
35
  return self in (
@@ -44,26 +57,24 @@ class ChunkEmbeddingService:
44
57
  if self._chunk_embedding_mode.should_generate_description() and description_provider is None:
45
58
  raise ValueError("A DescriptionProvider must be provided when generating descriptions")
46
59
 
47
- def embed_chunks(self, *, datasource_run_id: int, chunks: list[EmbeddableChunk], result: str) -> None:
48
- """
49
- Turn plugin chunks into persisted chunks and embeddings
60
+ def embed_chunks(self, *, chunks: list[EmbeddableChunk], result: str, full_type: str, datasource_id: str) -> None:
61
+ """Turn plugin chunks into persisted chunks and embeddings.
50
62
 
51
63
  Flow:
52
- 1) Embed each chunk into an embedded vector
53
- 2) Get or create embedding table for the appropriate model and embedding dimensions
54
- 3) Persist chunks and embeddings vectors in a single transaction
64
+ 1) Embed each chunk into an embedded vector.
65
+ 2) Get or create embedding table for the appropriate model and embedding dimensions.
66
+ 3) Persist chunks and embeddings vectors in a single transaction.
55
67
  """
56
-
57
68
  if not chunks:
58
69
  return
59
70
 
60
71
  logger.debug(
61
- f"Embedding {len(chunks)} chunks for datasource run {datasource_run_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
72
+ f"Embedding {len(chunks)} chunks for datasource {datasource_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
62
73
  )
63
74
 
64
75
  enriched_embeddings: list[ChunkEmbedding] = []
65
76
  for chunk in chunks:
66
- chunk_display_text = to_yaml_string(chunk.content)
77
+ chunk_display_text = chunk.content if isinstance(chunk.content, str) else to_yaml_string(chunk.content)
67
78
 
68
79
  generated_description = ""
69
80
  match self._chunk_embedding_mode:
@@ -98,7 +109,8 @@ class ChunkEmbeddingService:
98
109
  )
99
110
 
100
111
  self._persistence_service.write_chunks_and_embeddings(
101
- datasource_run_id=datasource_run_id,
102
112
  chunk_embeddings=enriched_embeddings,
103
113
  table_name=table_name,
114
+ full_type=full_type,
115
+ datasource_id=datasource_id,
104
116
  )
@@ -1,20 +1,15 @@
1
- from _duckdb import DuckDBPyConnection
1
+ from duckdb import DuckDBPyConnection
2
2
 
3
- from databao_context_engine.build_sources.internal.build_service import BuildService
4
3
  from databao_context_engine.llm.descriptions.provider import DescriptionProvider
5
4
  from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
6
- from databao_context_engine.retrieve_embeddings.internal.retrieve_service import RetrieveService
7
5
  from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode, ChunkEmbeddingService
8
6
  from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
9
7
  from databao_context_engine.services.persistence_service import PersistenceService
10
8
  from databao_context_engine.services.table_name_policy import TableNamePolicy
11
9
  from databao_context_engine.storage.repositories.factories import (
12
10
  create_chunk_repository,
13
- create_datasource_run_repository,
14
11
  create_embedding_repository,
15
12
  create_registry_repository,
16
- create_run_repository,
17
- create_vector_search_repository,
18
13
  )
19
14
 
20
15
 
@@ -46,43 +41,3 @@ def create_chunk_embedding_service(
46
41
  description_provider=description_provider,
47
42
  chunk_embedding_mode=chunk_embedding_mode,
48
43
  )
49
-
50
-
51
- def create_build_service(
52
- conn: DuckDBPyConnection,
53
- *,
54
- embedding_provider: EmbeddingProvider,
55
- description_provider: DescriptionProvider | None,
56
- chunk_embedding_mode: ChunkEmbeddingMode,
57
- ) -> BuildService:
58
- run_repo = create_run_repository(conn)
59
- datasource_run_repo = create_datasource_run_repository(conn)
60
- chunk_embedding_service = create_chunk_embedding_service(
61
- conn,
62
- embedding_provider=embedding_provider,
63
- description_provider=description_provider,
64
- chunk_embedding_mode=chunk_embedding_mode,
65
- )
66
-
67
- return BuildService(
68
- run_repo=run_repo,
69
- datasource_run_repo=datasource_run_repo,
70
- chunk_embedding_service=chunk_embedding_service,
71
- )
72
-
73
-
74
- def create_retrieve_service(
75
- conn: DuckDBPyConnection,
76
- *,
77
- embedding_provider: EmbeddingProvider,
78
- ) -> RetrieveService:
79
- run_repo = create_run_repository(conn)
80
- vector_search_repo = create_vector_search_repository(conn)
81
- shard_resolver = create_shard_resolver(conn)
82
-
83
- return RetrieveService(
84
- run_repo=run_repo,
85
- vector_search_repo=vector_search_repo,
86
- shard_resolver=shard_resolver,
87
- provider=embedding_provider,
88
- )
@@ -24,11 +24,13 @@ class PersistenceService:
24
24
  self._dim = dim
25
25
 
26
26
  def write_chunks_and_embeddings(
27
- self, *, datasource_run_id: int, chunk_embeddings: list[ChunkEmbedding], table_name: str
27
+ self, *, chunk_embeddings: list[ChunkEmbedding], table_name: str, full_type: str, datasource_id: str
28
28
  ):
29
- """
30
- Atomically persist chunks and their vectors.
31
- Returns the number of embeddings written.
29
+ """Atomically persist chunks and their vectors.
30
+
31
+ Raises:
32
+ ValueError: If chunk_embeddings is an empty list.
33
+
32
34
  """
33
35
  if not chunk_embeddings:
34
36
  raise ValueError("chunk_embeddings must be a non-empty list")
@@ -36,21 +38,19 @@ class PersistenceService:
36
38
  with transaction(self._conn):
37
39
  for chunk_embedding in chunk_embeddings:
38
40
  chunk_dto = self.create_chunk(
39
- datasource_run_id=datasource_run_id,
41
+ full_type=full_type,
42
+ datasource_id=datasource_id,
40
43
  embeddable_text=chunk_embedding.chunk.embeddable_text,
41
44
  display_text=chunk_embedding.display_text,
42
- generated_description=chunk_embedding.generated_description,
43
45
  )
44
46
  self.create_embedding(table_name=table_name, chunk_id=chunk_dto.chunk_id, vec=chunk_embedding.vec)
45
47
 
46
- def create_chunk(
47
- self, *, datasource_run_id: int, embeddable_text: str, display_text: str, generated_description: str
48
- ) -> ChunkDTO:
48
+ def create_chunk(self, *, full_type: str, datasource_id: str, embeddable_text: str, display_text: str) -> ChunkDTO:
49
49
  return self._chunk_repo.create(
50
- datasource_run_id=datasource_run_id,
50
+ full_type=full_type,
51
+ datasource_id=datasource_id,
51
52
  embeddable_text=embeddable_text,
52
53
  display_text=display_text,
53
- generated_description=generated_description,
54
54
  )
55
55
 
56
56
  def create_embedding(self, *, table_name: str, chunk_id: int, vec: Sequence[float]):
@@ -1,24 +1,28 @@
1
1
  import logging
2
2
  from contextlib import contextmanager
3
3
  from pathlib import Path
4
+ from typing import Iterator
4
5
 
5
6
  import duckdb
6
-
7
- from databao_context_engine.system.properties import get_db_path
7
+ from duckdb import DuckDBPyConnection
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
11
 
12
12
  @contextmanager
13
- def open_duckdb_connection(db_path: str | Path | None = None):
14
- """
15
- Open a DuckDB connection with vector search enabled and close on exit.
16
- Loads the vss extension and enables HNSW experimental persistence.
13
+ def open_duckdb_connection(db_path: str | Path) -> Iterator[DuckDBPyConnection]:
14
+ """Open a DuckDB connection with vector search enabled and close on exit.
15
+
16
+ It also loads the vss extension and enables HNSW experimental persistence on the DuckDB.
17
17
 
18
18
  Usage:
19
19
  with open_duckdb_connection() as conn:
20
+
21
+ Yields:
22
+ The opened DuckDB connection.
23
+
20
24
  """
21
- path = str(db_path or get_db_path())
25
+ path = str(db_path)
22
26
  conn = duckdb.connect(path)
23
27
  logger.debug(f"Connected to DuckDB database at {path}")
24
28
 
@@ -1,6 +1,6 @@
1
1
  class RepositoryError(Exception):
2
- """Base exception for repository errors"""
2
+ """Base exception for repository errors."""
3
3
 
4
4
 
5
5
  class IntegrityError(RepositoryError):
6
- """Raised when a DB constraint is violated"""
6
+ """Raised when a DB constraint is violated."""
@@ -8,12 +8,10 @@ from typing import LiteralString
8
8
 
9
9
  import duckdb
10
10
 
11
- from databao_context_engine.system.properties import get_db_path
12
-
13
11
  logger = logging.getLogger(__name__)
14
12
 
15
13
 
16
- def migrate(db_path: str | Path | None = None, migration_files: list[Path] | None = None) -> None:
14
+ def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
17
15
  if migration_files is None:
18
16
  migration_files = [
19
17
  migration
@@ -21,7 +19,7 @@ def migrate(db_path: str | Path | None = None, migration_files: list[Path] | Non
21
19
  if isinstance(migration, Path) and ".sql" == migration.suffix
22
20
  ]
23
21
 
24
- db = Path(db_path or get_db_path()).expanduser().resolve()
22
+ db = Path(db_path).expanduser().resolve()
25
23
  db.parent.mkdir(parents=True, exist_ok=True)
26
24
  logger.debug("Running migrations on database: %s", db)
27
25
 
@@ -2,36 +2,15 @@ INSTALL vss;
2
2
  LOAD vss;
3
3
  SET hnsw_enable_experimental_persistence = true;
4
4
 
5
- CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
6
- CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
7
5
  CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;
8
6
 
9
- CREATE TABLE IF NOT EXISTS run (
10
- run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
11
- project_id TEXT NOT NULL,
12
- started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
13
- ended_at TIMESTAMP,
14
- nemory_version TEXT,
15
- run_name TEXT NOT NULL,
16
- );
17
-
18
- CREATE TABLE IF NOT EXISTS datasource_run (
19
- datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
20
- run_id BIGINT NOT NULL REFERENCES run(run_id),
21
- plugin TEXT NOT NULL,
22
- source_id TEXT NOT NULL,
23
- storage_directory TEXT NOT NULL,
24
- created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
25
- full_type TEXT NOT NULL,
26
- );
27
-
28
7
  CREATE TABLE IF NOT EXISTS chunk (
29
- chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
30
- datasource_run_id BIGINT NOT NULL REFERENCES datasource_run(datasource_run_id),
31
- embeddable_text TEXT NOT NULL,
32
- display_text TEXT,
33
- created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
34
- generated_description TEXT,
8
+ chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
9
+ full_type TEXT NOT NULL,
10
+ datasource_id TEXT NOT NULL,
11
+ embeddable_text TEXT NOT NULL,
12
+ display_text TEXT,
13
+ created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
35
14
  );
36
15
 
37
16
  CREATE TABLE IF NOT EXISTS embedding_model_registry (
@@ -57,7 +36,3 @@ OR IGNORE INTO
57
36
  embedding_model_registry(embedder, model_id, dim, table_name)
58
37
  VALUES
59
38
  ('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');
60
-
61
- CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
62
- CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
63
- CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);
@@ -4,34 +4,13 @@ from datetime import datetime
4
4
  from typing import Optional
5
5
 
6
6
 
7
- @dataclass(frozen=True)
8
- class RunDTO:
9
- run_id: int
10
- run_name: str
11
- project_id: str
12
- started_at: datetime
13
- ended_at: Optional[datetime]
14
- nemory_version: str
15
-
16
-
17
- @dataclass(frozen=True)
18
- class DatasourceRunDTO:
19
- datasource_run_id: int
20
- run_id: int
21
- plugin: str
22
- full_type: str
23
- source_id: str
24
- storage_directory: str
25
- created_at: datetime
26
-
27
-
28
7
  @dataclass(frozen=True)
29
8
  class ChunkDTO:
30
9
  chunk_id: int
31
- datasource_run_id: int
10
+ full_type: str
11
+ datasource_id: str
32
12
  embeddable_text: str
33
13
  display_text: Optional[str]
34
- generated_description: str
35
14
  created_at: datetime
36
15
 
37
16
 
@@ -14,22 +14,22 @@ class ChunkRepository:
14
14
  def create(
15
15
  self,
16
16
  *,
17
- datasource_run_id: int,
17
+ full_type: str,
18
+ datasource_id: str,
18
19
  embeddable_text: str,
19
20
  display_text: Optional[str],
20
- generated_description: str,
21
21
  ) -> ChunkDTO:
22
22
  try:
23
23
  row = self._conn.execute(
24
24
  """
25
25
  INSERT INTO
26
- chunk(datasource_run_id, embeddable_text, display_text, generated_description)
26
+ chunk(full_type, datasource_id, embeddable_text, display_text)
27
27
  VALUES
28
28
  (?, ?, ?, ?)
29
29
  RETURNING
30
30
  *
31
31
  """,
32
- [datasource_run_id, embeddable_text, display_text, generated_description],
32
+ [full_type, datasource_id, embeddable_text, display_text],
33
33
  ).fetchone()
34
34
  if row is None:
35
35
  raise RuntimeError("chunk creation returned no object")
@@ -55,22 +55,26 @@ class ChunkRepository:
55
55
  self,
56
56
  chunk_id: int,
57
57
  *,
58
+ full_type: Optional[str] = None,
59
+ datasource_id: Optional[str] = None,
58
60
  embeddable_text: Optional[str] = None,
59
61
  display_text: Optional[str] = None,
60
- generated_description: Optional[str] = None,
61
62
  ) -> Optional[ChunkDTO]:
62
63
  sets: list[Any] = []
63
64
  params: list[Any] = []
64
65
 
66
+ if full_type is not None:
67
+ sets.append("full_type = ?")
68
+ params.append(full_type)
69
+ if datasource_id is not None:
70
+ sets.append("datasource_id = ?")
71
+ params.append(datasource_id)
65
72
  if embeddable_text is not None:
66
73
  sets.append("embeddable_text = ?")
67
74
  params.append(embeddable_text)
68
75
  if display_text is not None:
69
76
  sets.append("display_text = ?")
70
77
  params.append(display_text)
71
- if generated_description is not None:
72
- sets.append("generated_description = ?")
73
- params.append(generated_description)
74
78
 
75
79
  if not sets:
76
80
  return self.get(chunk_id)
@@ -119,12 +123,12 @@ class ChunkRepository:
119
123
 
120
124
  @staticmethod
121
125
  def _row_to_dto(row: Tuple) -> ChunkDTO:
122
- chunk_id, datasource_run_id, embeddable_text, display_text, created_at, generated_description = row
126
+ chunk_id, full_type, datasource_id, embeddable_text, display_text, created_at = row
123
127
  return ChunkDTO(
124
128
  chunk_id=int(chunk_id),
125
- datasource_run_id=int(datasource_run_id),
126
- embeddable_text=str(embeddable_text),
129
+ full_type=full_type,
130
+ datasource_id=datasource_id,
131
+ embeddable_text=embeddable_text,
127
132
  display_text=display_text,
128
- generated_description=str(generated_description),
129
133
  created_at=created_at,
130
134
  )
@@ -1,24 +1,13 @@
1
- from _duckdb import DuckDBPyConnection
1
+ from duckdb import DuckDBPyConnection
2
2
 
3
- from databao_context_engine.services.run_name_policy import RunNamePolicy
4
3
  from databao_context_engine.storage.repositories.chunk_repository import ChunkRepository
5
- from databao_context_engine.storage.repositories.datasource_run_repository import DatasourceRunRepository
6
4
  from databao_context_engine.storage.repositories.embedding_model_registry_repository import (
7
5
  EmbeddingModelRegistryRepository,
8
6
  )
9
7
  from databao_context_engine.storage.repositories.embedding_repository import EmbeddingRepository
10
- from databao_context_engine.storage.repositories.run_repository import RunRepository
11
8
  from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchRepository
12
9
 
13
10
 
14
- def create_run_repository(conn: DuckDBPyConnection) -> RunRepository:
15
- return RunRepository(conn, run_name_policy=RunNamePolicy())
16
-
17
-
18
- def create_datasource_run_repository(conn: DuckDBPyConnection) -> DatasourceRunRepository:
19
- return DatasourceRunRepository(conn)
20
-
21
-
22
11
  def create_chunk_repository(conn: DuckDBPyConnection) -> ChunkRepository:
23
12
  return ChunkRepository(conn)
24
13
 
@@ -3,8 +3,8 @@ from dataclasses import dataclass
3
3
 
4
4
  import duckdb
5
5
 
6
+ from databao_context_engine.datasources.types import DatasourceId
6
7
  from databao_context_engine.pluginlib.build_plugin import DatasourceType
7
- from databao_context_engine.project.types import DatasourceId
8
8
 
9
9
 
10
10
  @dataclass(kw_only=True, frozen=True)
@@ -23,32 +23,27 @@ class VectorSearchRepository:
23
23
  self._conn = conn
24
24
 
25
25
  def get_display_texts_by_similarity(
26
- self, *, table_name: str, run_id: int, retrieve_vec: Sequence[float], dimension: int, limit: int
26
+ self, *, table_name: str, retrieve_vec: Sequence[float], dimension: int, limit: int
27
27
  ) -> list[VectorSearchResult]:
28
- """
29
- Read only similarity search on a specific embedding shard table.
30
- Returns the display text for the closest matches in a given run
31
- """
28
+ """Read-only similarity search on a specific embedding shard table."""
32
29
  rows = self._conn.execute(
33
30
  f"""
34
31
  SELECT
35
32
  COALESCE(c.display_text, c.embeddable_text) AS display_text,
36
- c.embeddable_text AS embeddable_text,
33
+ c.embeddable_text,
37
34
  array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) AS cosine_distance,
38
- dr.full_type,
39
- dr.source_id,
35
+ c.full_type,
36
+ c.datasource_id,
40
37
  FROM
41
38
  {table_name} e
42
39
  JOIN chunk c ON e.chunk_id = c.chunk_id
43
- JOIN datasource_run dr ON c.datasource_run_id = dr.datasource_run_id
44
40
  WHERE
45
- dr.run_id = ?
46
- AND cosine_distance < ?
41
+ cosine_distance < ?
47
42
  ORDER BY
48
43
  array_cosine_distance(e.vec, CAST(? AS FLOAT[{dimension}])) ASC
49
44
  LIMIT ?
50
45
  """,
51
- [list(retrieve_vec), run_id, self._DEFAULT_DISTANCE_THRESHOLD, list(retrieve_vec), limit],
46
+ [list(retrieve_vec), self._DEFAULT_DISTANCE_THRESHOLD, list(retrieve_vec), limit],
52
47
  ).fetchall()
53
48
 
54
49
  return [
@@ -1,6 +1,8 @@
1
1
  import os
2
2
  from pathlib import Path
3
3
 
4
+ from databao_context_engine.project.layout import get_output_dir
5
+
4
6
  # it's private, so it doesn't get imported directly. This value is mocked in tests
5
7
  _dce_path = Path(os.getenv("DATABAO_CONTEXT_ENGINE_PATH") or "~/.dce").expanduser().resolve()
6
8
 
@@ -9,5 +11,5 @@ def get_dce_path() -> Path:
9
11
  return _dce_path
10
12
 
11
13
 
12
- def get_db_path() -> Path:
13
- return get_dce_path() / "dce.duckdb"
14
+ def get_db_path(project_dir: Path) -> Path:
15
+ return get_output_dir(project_dir) / "dce.duckdb"
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.3
2
+ Name: databao-context-engine
3
+ Version: 0.1.4.dev1
4
+ Summary: Semantic context for your LLMs — generated automatically
5
+ Requires-Dist: click>=8.3.0
6
+ Requires-Dist: duckdb>=1.4.3
7
+ Requires-Dist: pyyaml>=6.0.3
8
+ Requires-Dist: requests>=2.32.5
9
+ Requires-Dist: mcp>=1.23.3
10
+ Requires-Dist: pydantic>=2.12.4
11
+ Requires-Dist: jinja2>=3.1.6
12
+ Requires-Dist: pyathena>=3.25.0 ; extra == 'athena'
13
+ Requires-Dist: clickhouse-connect>=0.10.0 ; extra == 'clickhouse'
14
+ Requires-Dist: mssql-python>=1.0.0 ; extra == 'mssql'
15
+ Requires-Dist: pymysql>=1.1.2 ; extra == 'mysql'
16
+ Requires-Dist: asyncpg>=0.31.0 ; extra == 'postgresql'
17
+ Requires-Dist: snowflake-connector-python>=4.2.0 ; extra == 'snowflake'
18
+ Requires-Python: >=3.12
19
+ Provides-Extra: athena
20
+ Provides-Extra: clickhouse
21
+ Provides-Extra: mssql
22
+ Provides-Extra: mysql
23
+ Provides-Extra: postgresql
24
+ Provides-Extra: snowflake
25
+ Description-Content-Type: text/markdown
26
+
27
+ [![official project](https://jb.gg/badges/official.svg)](https://confluence.jetbrains.com/display/ALL/JetBrains+on+GitHub)
28
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/JetBrains/databao-context-engine/blob/main/LICENSE)
29
+
30
+ [//]: # ([![PyPI version]&#40;https://img.shields.io/pypi/v/databao-context-engine.svg&#41;]&#40;https://pypi.org/project/databao-context-engine&#41;)
31
+
32
+ [//]: # ([![Python versions]&#40;https://img.shields.io/pypi/pyversions/databao-context-engine.svg&#41;]&#40;https://pypi.org/project/databao-context-engine/&#41;)
33
+
34
+
35
+ <h1 align="center">Databao Context Engine</h1>
36
+ <p align="center">
37
+ <b>Semantic context for your LLMs — generated automatically.</b><br/>
38
+ No more copying schemas. No manual documentation. Just accurate answers.
39
+ </p>
40
+ <p align="center">
41
+ <a href="https://databao.app">Website</a>
42
+
43
+ [//]: # (•)
44
+
45
+ [//]: # ( <a href="#quickstart">Quickstart</a> •)
46
+
47
+ [//]: # ( <a href="#supported-data-sources">Data Sources</a> •)
48
+
49
+ [//]: # ( <a href="#contributing">Contributing</a>)
50
+ </p>
51
+
52
+ ---
53
+
54
+ ## What is Databao Context Engine?
55
+
56
+ Databao Context Engine **automatically generates governed semantic context** from your databases, BI tools, documents, and spreadsheets.
57
+
58
+ Integrate it with any LLM to deliver **accurate, context-aware answers** — without copying schemas or writing documentation by hand.
59
+
60
+ ```
61
+ Your data sources → Context Engine → Unified semantic graph → Any LLM
62
+ ```
63
+
64
+ ## Why choose Databao Context Engine?
65
+
66
+ | Feature | What it means for you |
67
+ |----------------------------|----------------------------------------------------------------|
68
+ | **Auto-generated context** | Extracts schemas, relationships, and semantics automatically |
69
+ | **Runs locally** | Your data never leaves your environment |
70
+ | **MCP integration** | Works with Claude Desktop, Cursor, and any MCP-compatible tool |
71
+ | **Multiple sources** | Databases, dbt projects, spreadsheets, documents |
72
+ | **Built-in benchmarks** | Measure and improve context quality over time |
73
+ | **LLM agnostic** | OpenAI, Anthropic, Ollama, Gemini — use any model |
74
+ | **Governed & versioned** | Track, version, and share context across your team |
75
+ | **Dynamic or static** | Serve context via MCP server or export as artifact |