databao-context-engine 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databao_context_engine/__init__.py +32 -7
- databao_context_engine/build_sources/__init__.py +4 -0
- databao_context_engine/build_sources/{internal/build_runner.py → build_runner.py} +31 -27
- databao_context_engine/build_sources/build_service.py +53 -0
- databao_context_engine/build_sources/build_wiring.py +82 -0
- databao_context_engine/build_sources/export_results.py +41 -0
- databao_context_engine/build_sources/{internal/plugin_execution.py → plugin_execution.py} +11 -18
- databao_context_engine/cli/add_datasource_config.py +49 -44
- databao_context_engine/cli/commands.py +40 -55
- databao_context_engine/cli/info.py +3 -2
- databao_context_engine/databao_context_engine.py +127 -0
- databao_context_engine/databao_context_project_manager.py +147 -30
- databao_context_engine/{datasource_config → datasources}/check_config.py +31 -23
- databao_context_engine/datasources/datasource_context.py +90 -0
- databao_context_engine/datasources/datasource_discovery.py +143 -0
- databao_context_engine/datasources/types.py +194 -0
- databao_context_engine/generate_configs_schemas.py +4 -5
- databao_context_engine/init_project.py +25 -3
- databao_context_engine/introspection/property_extract.py +76 -57
- databao_context_engine/llm/__init__.py +10 -0
- databao_context_engine/llm/api.py +57 -0
- databao_context_engine/llm/descriptions/ollama.py +1 -3
- databao_context_engine/llm/errors.py +2 -8
- databao_context_engine/llm/factory.py +5 -2
- databao_context_engine/llm/install.py +26 -30
- databao_context_engine/llm/runtime.py +3 -5
- databao_context_engine/llm/service.py +1 -3
- databao_context_engine/mcp/mcp_runner.py +4 -2
- databao_context_engine/mcp/mcp_server.py +9 -11
- databao_context_engine/plugin_loader.py +110 -0
- databao_context_engine/pluginlib/build_plugin.py +12 -29
- databao_context_engine/pluginlib/config.py +16 -2
- databao_context_engine/plugins/{athena_db_plugin.py → databases/athena/athena_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/athena/athena_introspector.py +161 -0
- databao_context_engine/plugins/{base_db_plugin.py → databases/base_db_plugin.py} +6 -5
- databao_context_engine/plugins/databases/base_introspector.py +11 -12
- databao_context_engine/plugins/{clickhouse_db_plugin.py → databases/clickhouse/clickhouse_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{clickhouse_introspector.py → clickhouse/clickhouse_introspector.py} +24 -16
- databao_context_engine/plugins/databases/duckdb/duckdb_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/{duckdb_introspector.py → duckdb/duckdb_introspector.py} +7 -12
- databao_context_engine/plugins/databases/introspection_model_builder.py +1 -1
- databao_context_engine/plugins/databases/introspection_scope.py +11 -9
- databao_context_engine/plugins/databases/introspection_scope_matcher.py +2 -5
- databao_context_engine/plugins/{mssql_db_plugin.py → databases/mssql/mssql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mssql_introspector.py → mssql/mssql_introspector.py} +29 -21
- databao_context_engine/plugins/{mysql_db_plugin.py → databases/mysql/mysql_db_plugin.py} +3 -3
- databao_context_engine/plugins/databases/{mysql_introspector.py → mysql/mysql_introspector.py} +26 -15
- databao_context_engine/plugins/databases/postgresql/__init__.py +0 -0
- databao_context_engine/plugins/databases/postgresql/postgresql_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{postgresql_introspector.py → postgresql/postgresql_introspector.py} +11 -18
- databao_context_engine/plugins/databases/snowflake/__init__.py +0 -0
- databao_context_engine/plugins/databases/snowflake/snowflake_db_plugin.py +15 -0
- databao_context_engine/plugins/databases/{snowflake_introspector.py → snowflake/snowflake_introspector.py} +49 -17
- databao_context_engine/plugins/databases/sqlite/__init__.py +0 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_db_plugin.py +12 -0
- databao_context_engine/plugins/databases/sqlite/sqlite_introspector.py +241 -0
- databao_context_engine/plugins/duckdb_tools.py +18 -0
- databao_context_engine/plugins/files/__init__.py +0 -0
- databao_context_engine/plugins/{unstructured_files_plugin.py → files/unstructured_files_plugin.py} +1 -1
- databao_context_engine/plugins/plugin_loader.py +58 -52
- databao_context_engine/plugins/resources/parquet_introspector.py +8 -20
- databao_context_engine/plugins/{parquet_plugin.py → resources/parquet_plugin.py} +1 -3
- databao_context_engine/project/info.py +34 -2
- databao_context_engine/project/init_project.py +16 -7
- databao_context_engine/project/layout.py +14 -15
- databao_context_engine/retrieve_embeddings/__init__.py +3 -0
- databao_context_engine/retrieve_embeddings/retrieve_runner.py +17 -0
- databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py} +12 -19
- databao_context_engine/retrieve_embeddings/retrieve_wiring.py +46 -0
- databao_context_engine/serialization/__init__.py +0 -0
- databao_context_engine/{serialisation → serialization}/yaml.py +6 -6
- databao_context_engine/services/chunk_embedding_service.py +23 -11
- databao_context_engine/services/factories.py +1 -46
- databao_context_engine/services/persistence_service.py +11 -11
- databao_context_engine/storage/connection.py +11 -7
- databao_context_engine/storage/exceptions/exceptions.py +2 -2
- databao_context_engine/storage/migrate.py +3 -5
- databao_context_engine/storage/migrations/V01__init.sql +6 -31
- databao_context_engine/storage/models.py +2 -23
- databao_context_engine/storage/repositories/chunk_repository.py +16 -12
- databao_context_engine/storage/repositories/factories.py +1 -12
- databao_context_engine/storage/repositories/vector_search_repository.py +23 -16
- databao_context_engine/system/properties.py +4 -2
- databao_context_engine-0.1.5.dist-info/METADATA +228 -0
- databao_context_engine-0.1.5.dist-info/RECORD +135 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/WHEEL +1 -1
- databao_context_engine/build_sources/internal/build_service.py +0 -77
- databao_context_engine/build_sources/internal/build_wiring.py +0 -52
- databao_context_engine/build_sources/internal/export_results.py +0 -43
- databao_context_engine/build_sources/public/api.py +0 -4
- databao_context_engine/databao_engine.py +0 -85
- databao_context_engine/datasource_config/add_config.py +0 -50
- databao_context_engine/datasource_config/datasource_context.py +0 -60
- databao_context_engine/mcp/all_results_tool.py +0 -5
- databao_context_engine/mcp/retrieve_tool.py +0 -22
- databao_context_engine/plugins/databases/athena_introspector.py +0 -101
- databao_context_engine/plugins/duckdb_db_plugin.py +0 -12
- databao_context_engine/plugins/postgresql_db_plugin.py +0 -12
- databao_context_engine/plugins/snowflake_db_plugin.py +0 -12
- databao_context_engine/project/datasource_discovery.py +0 -141
- databao_context_engine/project/runs.py +0 -39
- databao_context_engine/project/types.py +0 -134
- databao_context_engine/retrieve_embeddings/internal/export_results.py +0 -12
- databao_context_engine/retrieve_embeddings/internal/retrieve_runner.py +0 -34
- databao_context_engine/retrieve_embeddings/internal/retrieve_wiring.py +0 -29
- databao_context_engine/retrieve_embeddings/public/api.py +0 -3
- databao_context_engine/services/run_name_policy.py +0 -8
- databao_context_engine/storage/repositories/datasource_run_repository.py +0 -136
- databao_context_engine/storage/repositories/run_repository.py +0 -157
- databao_context_engine-0.1.1.dist-info/METADATA +0 -186
- databao_context_engine-0.1.1.dist-info/RECORD +0 -135
- /databao_context_engine/{build_sources/internal → datasources}/__init__.py +0 -0
- /databao_context_engine/{build_sources/public → plugins/databases/athena}/__init__.py +0 -0
- /databao_context_engine/{datasource_config → plugins/databases/clickhouse}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/internal → plugins/databases/duckdb}/__init__.py +0 -0
- /databao_context_engine/{retrieve_embeddings/public → plugins/databases/mssql}/__init__.py +0 -0
- /databao_context_engine/{serialisation → plugins/databases/mysql}/__init__.py +0 -0
- {databao_context_engine-0.1.1.dist-info → databao_context_engine-0.1.5.dist-info}/entry_points.txt +0 -0

databao_context_engine/project/layout.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass
 from pathlib import Path
 
 from databao_context_engine.project.project_config import ProjectConfig
-from databao_context_engine.project.types import DatasourceId
 
 SOURCE_FOLDER_NAME = "src"
 OUTPUT_FOLDER_NAME = "output"
@@ -25,6 +24,14 @@ class ProjectLayout:
     def read_config_file(self) -> ProjectConfig:
         return ProjectConfig.from_file(self.config_file)
 
+    @property
+    def src_dir(self) -> Path:
+        return get_source_dir(self.project_dir)
+
+    @property
+    def output_dir(self) -> Path:
+        return get_output_dir(self.project_dir)
+
 
 def ensure_project_dir(project_dir: Path) -> ProjectLayout:
     return _ProjectValidator(project_dir).ensure_project_dir_valid()
@@ -62,22 +69,14 @@ def get_logs_dir(project_dir: Path) -> Path:
     return project_dir.joinpath(LOGS_FOLDER_NAME)
 
 
-def ensure_datasource_config_file_doesnt_exist(project_dir: Path, datasource_id: DatasourceId) -> Path:
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
-
-    if config_file.is_file():
-        raise ValueError(f"A config file already exists for {str(datasource_id)}")
-
-    return config_file
-
-
 def create_datasource_config_file(
-
+    project_layout: ProjectLayout, datasource_relative_name: str, config_content: str, overwrite_existing: bool
 ) -> Path:
+    config_file = project_layout.src_dir / datasource_relative_name
     if not overwrite_existing:
-
+        if config_file.is_file():
+            raise ValueError(f"A config file already exists {config_file}")
 
-    config_file = get_source_dir(project_dir).joinpath(datasource_id.relative_path_to_config_file())
     config_file.parent.mkdir(parents=True, exist_ok=True)
 
     config_file.write_text(config_content)
@@ -96,12 +95,12 @@ class _ProjectValidator:
 
         if self.config_file is None:
             raise ValueError(
-                f"The current project directory has not been
+                f"The current project directory has not been initialized. It should contain a config file. [project_dir: {self.project_dir.resolve()}]"
             )
 
         if not self.is_src_valid():
             raise ValueError(
-                f"The current project directory has not been
+                f"The current project directory has not been initialized. It should contain a src directory. [project_dir: {self.project_dir.resolve()}]"
             )
 
         return ProjectLayout(project_dir=self.project_dir, config_file=self.config_file)

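The new src_dir and output_dir properties make ProjectLayout the single owner of path resolution, and config-file creation now takes the layout plus a plain relative name instead of a DatasourceId. A minimal usage sketch, assuming an already-initialized project directory (paths and config content are hypothetical):

    from pathlib import Path

    from databao_context_engine.project.layout import (
        create_datasource_config_file,
        ensure_project_dir,
    )

    layout = ensure_project_dir(Path("./my_project"))  # raises ValueError if not initialized
    config_file = create_datasource_config_file(
        layout,
        "databases/my_postgres.yaml",      # hypothetical name, relative to src/
        "type: databases/postgresql\n",    # hypothetical config content
        False,                             # overwrite_existing
    )
    print(config_file, layout.output_dir)
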
databao_context_engine/retrieve_embeddings/retrieve_runner.py

@@ -0,0 +1,17 @@
+import logging
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+
+logger = logging.getLogger(__name__)
+
+
+def retrieve(
+    *,
+    retrieve_service: RetrieveService,
+    text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None = None,
+) -> list[VectorSearchResult]:
+    return retrieve_service.retrieve(text=text, limit=limit, datasource_ids=datasource_ids)

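The runner is a thin, keyword-only entry point over RetrieveService. A hedged call sketch (service is assumed to be a wired RetrieveService, see retrieve_wiring.py below; the result fields mirror the debug logging in retrieve_service.py):

    results = retrieve(retrieve_service=service, text="monthly revenue by region", limit=5)
    for result in results:
        print(result.cosine_distance, result.embeddable_text)
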
databao_context_engine/retrieve_embeddings/{internal/retrieve_service.py → retrieve_service.py}
RENAMED

@@ -1,10 +1,9 @@
 import logging
 from collections.abc import Sequence
 
+from databao_context_engine.datasources.types import DatasourceId
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.project.runs import resolve_run_name_from_repo
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
-from databao_context_engine.storage.repositories.run_repository import RunRepository
 from databao_context_engine.storage.repositories.vector_search_repository import (
     VectorSearchRepository,
     VectorSearchResult,
@@ -17,52 +16,46 @@ class RetrieveService:
     def __init__(
         self,
         *,
-        run_repo: RunRepository,
         vector_search_repo: VectorSearchRepository,
         shard_resolver: EmbeddingShardResolver,
         provider: EmbeddingProvider,
     ):
-        self._run_repo = run_repo
         self._shard_resolver = shard_resolver
         self._provider = provider
         self._vector_search_repo = vector_search_repo
 
     def retrieve(
-        self, *,
+        self, *, text: str, limit: int | None = None, datasource_ids: list[DatasourceId] | None = None
     ) -> list[VectorSearchResult]:
         if limit is None:
             limit = 10
 
-        run = self._run_repo.get_by_run_name(project_id=project_id, run_name=run_name)
-        if run is None:
-            raise LookupError(f"Run '{run_name}' not found for project '{project_id}'.")
-
         table_name, dimension = self._shard_resolver.resolve(
             embedder=self._provider.embedder, model_id=self._provider.model_id
         )
 
         retrieve_vec: Sequence[float] = self._provider.embed(text)
 
-        logger.debug(f"Retrieving display texts
+        logger.debug(f"Retrieving display texts in table {table_name}")
 
         search_results = self._vector_search_repo.get_display_texts_by_similarity(
             table_name=table_name,
-            run_id=run.run_id,
             retrieve_vec=retrieve_vec,
             dimension=dimension,
             limit=limit,
+            datasource_ids=datasource_ids,
        )
 
-        logger.debug(f"Retrieved {len(search_results)} display texts
+        logger.debug(f"Retrieved {len(search_results)} display texts in table {table_name}")
 
         if logger.isEnabledFor(logging.DEBUG):
-
-
+            if search_results:
+                closest_result = min(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Best result: ({closest_result.cosine_distance}, {closest_result.embeddable_text})")
 
-
-
+                farthest_result = max(search_results, key=lambda result: result.cosine_distance)
+                logger.debug(f"Worst result: ({farthest_result.cosine_distance}, {farthest_result.embeddable_text})")
+            else:
+                logger.debug("No results found")
 
         return search_results
-
-    def resolve_run_name(self, *, project_id: str, run_name: str | None) -> str:
-        return resolve_run_name_from_repo(run_repository=self._run_repo, project_id=project_id, run_name=run_name)

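Retrieval is no longer scoped to a run; the caller can optionally narrow the search to specific datasources instead. A sketch of the new call shape (the service instance is assumed):

    results = retrieve_service.retrieve(
        text="orders per customer",
        limit=10,              # defaults to 10 when None
        datasource_ids=None,   # or a list[DatasourceId] to restrict the search
    )
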
databao_context_engine/retrieve_embeddings/retrieve_wiring.py

@@ -0,0 +1,46 @@
+from duckdb import DuckDBPyConnection
+
+from databao_context_engine.datasources.types import DatasourceId
+from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
+from databao_context_engine.llm.factory import create_ollama_embedding_provider, create_ollama_service
+from databao_context_engine.project.layout import ProjectLayout
+from databao_context_engine.retrieve_embeddings.retrieve_runner import retrieve
+from databao_context_engine.retrieve_embeddings.retrieve_service import RetrieveService
+from databao_context_engine.services.factories import create_shard_resolver
+from databao_context_engine.storage.connection import open_duckdb_connection
+from databao_context_engine.storage.repositories.factories import create_vector_search_repository
+from databao_context_engine.storage.repositories.vector_search_repository import VectorSearchResult
+from databao_context_engine.system.properties import get_db_path
+
+
+def retrieve_embeddings(
+    project_layout: ProjectLayout,
+    retrieve_text: str,
+    limit: int | None,
+    datasource_ids: list[DatasourceId] | None,
+) -> list[VectorSearchResult]:
+    with open_duckdb_connection(get_db_path(project_layout.project_dir)) as conn:
+        ollama_service = create_ollama_service()
+        embedding_provider = create_ollama_embedding_provider(ollama_service)
+        retrieve_service = _create_retrieve_service(conn, embedding_provider=embedding_provider)
+        return retrieve(
+            retrieve_service=retrieve_service,
+            text=retrieve_text,
+            limit=limit,
+            datasource_ids=datasource_ids,
+        )
+
+
+def _create_retrieve_service(
+    conn: DuckDBPyConnection,
+    *,
+    embedding_provider: EmbeddingProvider,
+) -> RetrieveService:
+    vector_search_repo = create_vector_search_repository(conn)
+    shard_resolver = create_shard_resolver(conn)
+
+    return RetrieveService(
+        vector_search_repo=vector_search_repo,
+        shard_resolver=shard_resolver,
+        provider=embedding_provider,
+    )

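An end-to-end sketch of the new wiring, assuming an initialized project and a local Ollama with the embedding model available (project path and query text are hypothetical):

    from pathlib import Path

    from databao_context_engine.project.layout import ensure_project_dir
    from databao_context_engine.retrieve_embeddings.retrieve_wiring import retrieve_embeddings

    layout = ensure_project_dir(Path("./my_project"))
    results = retrieve_embeddings(layout, "customer churn tables", limit=5, datasource_ids=None)
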
databao_context_engine/serialization/__init__.py
File without changes

databao_context_engine/{serialisation → serialization}/yaml.py
RENAMED

@@ -7,17 +7,17 @@ from yaml import Node, SafeDumper
 def default_representer(dumper: SafeDumper, data: object) -> Node:
     if isinstance(data, Mapping):
         return dumper.represent_dict(data)
-
-    # Doesn't
+    if hasattr(data, "__dict__"):
+        # Doesn't serialize "private" attributes (that starts with an _)
         data_public_attributes = {key: value for key, value in data.__dict__.items() if not key.startswith("_")}
         if data_public_attributes:
             return dumper.represent_dict(data_public_attributes)
-
-
-    return dumper.represent_str(str(data))
-    else:
+
+        # If there is no public attributes, we default to the string representation
         return dumper.represent_str(str(data))
 
+    return dumper.represent_str(str(data))
+
 
 # Registers our default representer only once, when that file is imported
 yaml.add_multi_representer(object, default_representer, Dumper=SafeDumper)

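Because the representer is registered on SafeDumper at import time, yaml.safe_dump can serialize arbitrary objects once this module has been imported: mappings stay mappings, objects dump their public attributes, and anything else falls back to str(). A small sketch (the Column class is hypothetical):

    import yaml

    import databao_context_engine.serialization.yaml  # noqa: F401  (registers the representer)


    class Column:
        def __init__(self, name: str, dtype: str) -> None:
            self.name = name
            self.dtype = dtype
            self._internal = "skipped"  # leading underscore: excluded from the dump


    print(yaml.safe_dump(Column("id", "BIGINT")))  # -> "dtype: BIGINT\nname: id\n"
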
databao_context_engine/services/chunk_embedding_service.py

@@ -5,7 +5,7 @@ from typing import cast
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
 from databao_context_engine.pluginlib.build_plugin import EmbeddableChunk
-from databao_context_engine.
+from databao_context_engine.serialization.yaml import to_yaml_string
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.models import ChunkEmbedding
 from databao_context_engine.services.persistence_service import PersistenceService
@@ -14,9 +14,22 @@ logger = logging.getLogger(__name__)
 
 
 class ChunkEmbeddingMode(Enum):
+    """Mode controlling how chunks are embedded."""
+
     EMBEDDABLE_TEXT_ONLY = "EMBEDDABLE_TEXT_ONLY"
+    """
+    The embedding is generated only from the string defined by the plugin as embeddable for a chunk.
+    """
+
     GENERATED_DESCRIPTION_ONLY = "GENERATED_DESCRIPTION_ONLY"
+    """
+    The embedding is generated only from a description of the chunk generated by a LLM.
+    """
+
     EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION = "EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION"
+    """
+    The embedding is generated from both the embeddable string of the chunk and the description of the chunk generated by a LLM.
+    """
 
     def should_generate_description(self) -> bool:
         return self in (
@@ -44,26 +57,24 @@ class ChunkEmbeddingService:
         if self._chunk_embedding_mode.should_generate_description() and description_provider is None:
             raise ValueError("A DescriptionProvider must be provided when generating descriptions")
 
-    def embed_chunks(self, *,
-        """
-        Turn plugin chunks into persisted chunks and embeddings
+    def embed_chunks(self, *, chunks: list[EmbeddableChunk], result: str, full_type: str, datasource_id: str) -> None:
+        """Turn plugin chunks into persisted chunks and embeddings.
 
         Flow:
-        1) Embed each chunk into an embedded vector
-        2) Get or create embedding table for the appropriate model and embedding dimensions
-        3) Persist chunks and embeddings vectors in a single transaction
+        1) Embed each chunk into an embedded vector.
+        2) Get or create embedding table for the appropriate model and embedding dimensions.
+        3) Persist chunks and embeddings vectors in a single transaction.
         """
-
         if not chunks:
             return
 
         logger.debug(
-            f"Embedding {len(chunks)} chunks for datasource
+            f"Embedding {len(chunks)} chunks for datasource {datasource_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
         )
 
         enriched_embeddings: list[ChunkEmbedding] = []
         for chunk in chunks:
-            chunk_display_text = to_yaml_string(chunk.content)
+            chunk_display_text = chunk.content if isinstance(chunk.content, str) else to_yaml_string(chunk.content)
 
             generated_description = ""
             match self._chunk_embedding_mode:
@@ -98,7 +109,8 @@ class ChunkEmbeddingService:
             )
 
         self._persistence_service.write_chunks_and_embeddings(
-            datasource_run_id=datasource_run_id,
             chunk_embeddings=enriched_embeddings,
             table_name=table_name,
+            full_type=full_type,
+            datasource_id=datasource_id,
         )

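The three modes differ only in which text is fed to the embedder, and should_generate_description() gates whether ChunkEmbeddingService demands a DescriptionProvider. A quick sketch (the mode membership follows from the constructor check shown above):

    from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode

    mode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION
    assert mode.should_generate_description()  # a DescriptionProvider is required

    mode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY
    assert not mode.should_generate_description()  # description_provider may be None
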
databao_context_engine/services/factories.py

@@ -1,20 +1,15 @@
-from
+from duckdb import DuckDBPyConnection
 
-from databao_context_engine.build_sources.internal.build_service import BuildService
 from databao_context_engine.llm.descriptions.provider import DescriptionProvider
 from databao_context_engine.llm.embeddings.provider import EmbeddingProvider
-from databao_context_engine.retrieve_embeddings.internal.retrieve_service import RetrieveService
 from databao_context_engine.services.chunk_embedding_service import ChunkEmbeddingMode, ChunkEmbeddingService
 from databao_context_engine.services.embedding_shard_resolver import EmbeddingShardResolver
 from databao_context_engine.services.persistence_service import PersistenceService
 from databao_context_engine.services.table_name_policy import TableNamePolicy
 from databao_context_engine.storage.repositories.factories import (
     create_chunk_repository,
-    create_datasource_run_repository,
     create_embedding_repository,
     create_registry_repository,
-    create_run_repository,
-    create_vector_search_repository,
 )
 
 
@@ -46,43 +41,3 @@ def create_chunk_embedding_service(
         description_provider=description_provider,
         chunk_embedding_mode=chunk_embedding_mode,
     )
-
-
-def create_build_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-    description_provider: DescriptionProvider | None,
-    chunk_embedding_mode: ChunkEmbeddingMode,
-) -> BuildService:
-    run_repo = create_run_repository(conn)
-    datasource_run_repo = create_datasource_run_repository(conn)
-    chunk_embedding_service = create_chunk_embedding_service(
-        conn,
-        embedding_provider=embedding_provider,
-        description_provider=description_provider,
-        chunk_embedding_mode=chunk_embedding_mode,
-    )
-
-    return BuildService(
-        run_repo=run_repo,
-        datasource_run_repo=datasource_run_repo,
-        chunk_embedding_service=chunk_embedding_service,
-    )
-
-
-def create_retrieve_service(
-    conn: DuckDBPyConnection,
-    *,
-    embedding_provider: EmbeddingProvider,
-) -> RetrieveService:
-    run_repo = create_run_repository(conn)
-    vector_search_repo = create_vector_search_repository(conn)
-    shard_resolver = create_shard_resolver(conn)
-
-    return RetrieveService(
-        run_repo=run_repo,
-        vector_search_repo=vector_search_repo,
-        shard_resolver=shard_resolver,
-        provider=embedding_provider,
-    )

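Feature wiring moved out of this module (into build_wiring.py and retrieve_wiring.py); only create_chunk_embedding_service remains. A hedged call sketch (conn and embedding_provider are assumed to exist):

    service = create_chunk_embedding_service(
        conn,  # an open DuckDBPyConnection
        embedding_provider=embedding_provider,
        description_provider=None,  # allowed: the mode below generates no descriptions
        chunk_embedding_mode=ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY,
    )
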
databao_context_engine/services/persistence_service.py

@@ -24,11 +24,13 @@ class PersistenceService:
         self._dim = dim
 
     def write_chunks_and_embeddings(
-        self, *,
+        self, *, chunk_embeddings: list[ChunkEmbedding], table_name: str, full_type: str, datasource_id: str
     ):
-        """
-
-
+        """Atomically persist chunks and their vectors.
+
+        Raises:
+            ValueError: If chunk_embeddings is an empty list.
+
         """
         if not chunk_embeddings:
             raise ValueError("chunk_embeddings must be a non-empty list")
@@ -36,21 +38,19 @@ class PersistenceService:
         with transaction(self._conn):
             for chunk_embedding in chunk_embeddings:
                 chunk_dto = self.create_chunk(
-
+                    full_type=full_type,
+                    datasource_id=datasource_id,
                     embeddable_text=chunk_embedding.chunk.embeddable_text,
                     display_text=chunk_embedding.display_text,
-                    generated_description=chunk_embedding.generated_description,
                 )
                 self.create_embedding(table_name=table_name, chunk_id=chunk_dto.chunk_id, vec=chunk_embedding.vec)
 
-    def create_chunk(
-        self, *, datasource_run_id: int, embeddable_text: str, display_text: str, generated_description: str
-    ) -> ChunkDTO:
+    def create_chunk(self, *, full_type: str, datasource_id: str, embeddable_text: str, display_text: str) -> ChunkDTO:
         return self._chunk_repo.create(
-
+            full_type=full_type,
+            datasource_id=datasource_id,
             embeddable_text=embeddable_text,
             display_text=display_text,
-            generated_description=generated_description,
         )
 
     def create_embedding(self, *, table_name: str, chunk_id: int, vec: Sequence[float]):

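Chunks are now tagged with full_type and datasource_id at write time, replacing the old datasource_run_id. A sketch of the new call (the full_type and datasource_id values are hypothetical; the table name is the one seeded by the V01 migration below):

    persistence_service.write_chunks_and_embeddings(
        chunk_embeddings=enriched_embeddings,
        table_name="embedding_ollama__nomic_embed_text_v1_5__768",
        full_type="databases/postgresql",
        datasource_id="my_postgres",
    )
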
databao_context_engine/storage/connection.py

@@ -1,24 +1,28 @@
 import logging
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Iterator
 
 import duckdb
-
-from databao_context_engine.system.properties import get_db_path
+from duckdb import DuckDBPyConnection
 
 logger = logging.getLogger(__name__)
 
 
 @contextmanager
-def open_duckdb_connection(db_path: str | Path
-    """
-
-
+def open_duckdb_connection(db_path: str | Path) -> Iterator[DuckDBPyConnection]:
+    """Open a DuckDB connection with vector search enabled and close on exist.
+
+    It also loads the vss extension and enables HNSW experimental persistence on the DuckDB.
 
     Usage:
         with open_duckdb_connection() as conn:
+
+    Yields:
+        The opened DuckDB connection.
+
     """
-    path = str(db_path
+    path = str(db_path)
     conn = duckdb.connect(path)
     logger.debug(f"Connected to DuckDB database at {path}")
 

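db_path is now required (the implicit get_db_path default is gone) and the context manager is typed as an Iterator over the connection. A usage sketch (the database path is hypothetical):

    from pathlib import Path

    from databao_context_engine.storage.connection import open_duckdb_connection

    with open_duckdb_connection(Path("my_project/context.duckdb")) as conn:
        print(conn.execute("SELECT 1").fetchone())
    # the connection is closed when the with-block exits
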
databao_context_engine/storage/exceptions/exceptions.py

@@ -1,6 +1,6 @@
 class RepositoryError(Exception):
-    """Base exception for repository errors"""
+    """Base exception for repository errors."""
 
 
 class IntegrityError(RepositoryError):
-    """Raised when a DB constraint is violated"""
+    """Raised when a DB constraint is violated."""

databao_context_engine/storage/migrate.py

@@ -8,12 +8,10 @@ from typing import LiteralString
 
 import duckdb
 
-from databao_context_engine.system.properties import get_db_path
-
 logger = logging.getLogger(__name__)
 
 
-def migrate(db_path: str | Path
+def migrate(db_path: str | Path, migration_files: list[Path] | None = None) -> None:
     if migration_files is None:
         migration_files = [
             migration
@@ -21,7 +19,7 @@ def migrate(db_path: str | Path | None = None, migration_files: list[Path] | Non
             if isinstance(migration, Path) and ".sql" == migration.suffix
         ]
 
-    db = Path(db_path
+    db = Path(db_path).expanduser().resolve()
     db.parent.mkdir(parents=True, exist_ok=True)
     logger.debug("Running migrations on database: %s", db)
 
@@ -71,7 +69,7 @@ class _Migration:
 def _create_migration(file: Path) -> _Migration:
     query_bytes = file.read_bytes()
     query = query_bytes.decode("utf-8")
-    checksum = hashlib.md5(query_bytes).hexdigest()
+    checksum = hashlib.md5(query_bytes, usedforsecurity=False).hexdigest()
     version = _extract_version_from_name(file.name)
     return _Migration(name=file.name, version=version, checksum=checksum, query=query)
 

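migrate() now requires an explicit db_path, and the MD5 call is flagged usedforsecurity=False (a Python 3.9+ keyword) so checksumming migration files keeps working on FIPS-restricted builds. A minimal call sketch (the path is hypothetical):

    from databao_context_engine.storage.migrate import migrate

    migrate("~/.databao/context.duckdb")  # expanded and resolved inside migrate()
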
databao_context_engine/storage/migrations/V01__init.sql

@@ -2,36 +2,15 @@ INSTALL vss;
 LOAD vss;
 SET hnsw_enable_experimental_persistence = true;
 
-CREATE SEQUENCE IF NOT EXISTS run_id_seq START 1;
-CREATE SEQUENCE IF NOT EXISTS datasource_run_id_seq START 1;
 CREATE SEQUENCE IF NOT EXISTS chunk_id_seq START 1;
 
-CREATE TABLE IF NOT EXISTS run (
-    run_id BIGINT PRIMARY KEY DEFAULT nextval('run_id_seq'),
-    project_id TEXT NOT NULL,
-    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    ended_at TIMESTAMP,
-    nemory_version TEXT,
-    run_name TEXT NOT NULL,
-);
-
-CREATE TABLE IF NOT EXISTS datasource_run (
-    datasource_run_id BIGINT PRIMARY KEY DEFAULT nextval('datasource_run_id_seq'),
-    run_id BIGINT NOT NULL REFERENCES run(run_id),
-    plugin TEXT NOT NULL,
-    source_id TEXT NOT NULL,
-    storage_directory TEXT NOT NULL,
-    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    full_type TEXT NOT NULL,
-);
-
 CREATE TABLE IF NOT EXISTS chunk (
-    chunk_id
-
-
-
-
-
+    chunk_id BIGINT PRIMARY KEY DEFAULT nextval('chunk_id_seq'),
+    full_type TEXT NOT NULL,
+    datasource_id TEXT NOT NULL,
+    embeddable_text TEXT NOT NULL,
+    display_text TEXT,
+    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
 
 CREATE TABLE IF NOT EXISTS embedding_model_registry (
@@ -57,7 +36,3 @@ OR IGNORE INTO
     embedding_model_registry(embedder, model_id, dim, table_name)
 VALUES
     ('ollama', 'nomic-embed-text:v1.5', 768, 'embedding_ollama__nomic_embed_text_v1_5__768');
-
-CREATE INDEX IF NOT EXISTS idx_datasource_run_run ON datasource_run(run_id);
-CREATE INDEX IF NOT EXISTS idx_datasource_run_plugin_run ON datasource_run(plugin, run_id);
-CREATE INDEX IF NOT EXISTS idx_chunk_datasource_run ON chunk(datasource_run_id);

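The run and datasource_run tables (and their indexes) are gone; chunks carry full_type and datasource_id directly. A hedged sketch of querying the new shape (database path and datasource id are hypothetical):

    import duckdb

    conn = duckdb.connect("context.duckdb")
    rows = conn.execute(
        "SELECT chunk_id, full_type, display_text FROM chunk WHERE datasource_id = ?",
        ["my_postgres"],
    ).fetchall()
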
databao_context_engine/storage/models.py

@@ -4,34 +4,13 @@ from datetime import datetime
 from typing import Optional
 
 
-@dataclass(frozen=True)
-class RunDTO:
-    run_id: int
-    run_name: str
-    project_id: str
-    started_at: datetime
-    ended_at: Optional[datetime]
-    nemory_version: str
-
-
-@dataclass(frozen=True)
-class DatasourceRunDTO:
-    datasource_run_id: int
-    run_id: int
-    plugin: str
-    full_type: str
-    source_id: str
-    storage_directory: str
-    created_at: datetime
-
-
 @dataclass(frozen=True)
 class ChunkDTO:
     chunk_id: int
-
+    full_type: str
+    datasource_id: str
     embeddable_text: str
     display_text: Optional[str]
-    generated_description: str
     created_at: datetime
 
 

databao_context_engine/storage/repositories/chunk_repository.py

@@ -14,22 +14,22 @@ class ChunkRepository:
     def create(
         self,
         *,
-
+        full_type: str,
+        datasource_id: str,
         embeddable_text: str,
         display_text: Optional[str],
-        generated_description: str,
     ) -> ChunkDTO:
         try:
             row = self._conn.execute(
                 """
                 INSERT INTO
-                    chunk(
+                    chunk(full_type, datasource_id, embeddable_text, display_text)
                 VALUES
                     (?, ?, ?, ?)
                 RETURNING
                     *
                 """,
-                [
+                [full_type, datasource_id, embeddable_text, display_text],
             ).fetchone()
             if row is None:
                 raise RuntimeError("chunk creation returned no object")
@@ -55,22 +55,26 @@ class ChunkRepository:
         self,
         chunk_id: int,
         *,
+        full_type: Optional[str] = None,
+        datasource_id: Optional[str] = None,
         embeddable_text: Optional[str] = None,
         display_text: Optional[str] = None,
-        generated_description: Optional[str] = None,
     ) -> Optional[ChunkDTO]:
         sets: list[Any] = []
         params: list[Any] = []
 
+        if full_type is not None:
+            sets.append("full_type = ?")
+            params.append(full_type)
+        if datasource_id is not None:
+            sets.append("datasource_id = ?")
+            params.append(datasource_id)
         if embeddable_text is not None:
             sets.append("embeddable_text = ?")
             params.append(embeddable_text)
         if display_text is not None:
             sets.append("display_text = ?")
             params.append(display_text)
-        if generated_description is not None:
-            sets.append("generated_description = ?")
-            params.append(generated_description)
 
         if not sets:
             return self.get(chunk_id)
@@ -119,12 +123,12 @@ class ChunkRepository:
 
     @staticmethod
     def _row_to_dto(row: Tuple) -> ChunkDTO:
-        chunk_id,
+        chunk_id, full_type, datasource_id, embeddable_text, display_text, created_at = row
         return ChunkDTO(
             chunk_id=int(chunk_id),
-
-
+            full_type=full_type,
+            datasource_id=datasource_id,
+            embeddable_text=embeddable_text,
             display_text=display_text,
-            generated_description=str(generated_description),
             created_at=created_at,
         )

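update() assembles its SET clause from only the keyword arguments that are not None, so a partial update touches just those columns, and an all-None call falls through to get(). A sketch (the repository instance and chunk id are hypothetical):

    dto = chunk_repo.update(42, datasource_id="renamed_datasource")  # other columns untouched
    if dto is None:
        print("no chunk with id 42")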