PyPI - haystack-velesdb - Versions diffs - 1.14.1__tar.gz - Mend

haystack-velesdb 1.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

haystack_velesdb-1.14.1/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 VelesDB Team
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

haystack_velesdb-1.14.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,161 @@
+Metadata-Version: 2.4
+Name: haystack-velesdb
+Version: 1.14.1
+Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
+Author-email: VelesDB Team <contact@wiscale.fr>
+License: MIT
+Project-URL: Homepage, https://github.com/cyberlife-coder/VelesDB
+Project-URL: Documentation, https://velesdb.com/docs/integrations/haystack
+Project-URL: Repository, https://github.com/cyberlife-coder/VelesDB
+Keywords: haystack,velesdb,vector-database,embeddings,rag,local-first,semantic-search
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: haystack-ai>=2.0.0
+Requires-Dist: velesdb>=1.13.2
+Requires-Dist: velesdb-common>=1.13.2
+Provides-Extra: dev
+Requires-Dist: pytest<9.0,>=7.0; extra == "dev"
+Dynamic: license-file
+# haystack-velesdb
+A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
+the local-first, microsecond-latency vector database.
+This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
+connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
+## Installation
+```bash
+pip install haystack-velesdb
+```
+For development:
+```bash
+pip install -e "integrations/haystack[dev]"
+```
+## Quick start
+```python
+from haystack_velesdb import VelesDBDocumentStore
+from haystack.dataclasses import Document
+store = VelesDBDocumentStore(
+    path="./my_docs",
+    collection_name="knowledge_base",
+    embedding_dim=768,
+    metric="cosine",
+)
+# Write pre-embedded documents
+documents = [
+    Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
+    Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
+]
+store.write_documents(documents)
+# Retrieve by vector
+results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
+for doc in results:
+    print(doc.content, doc.score)
+```
+## Full RAG pipeline
+See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
+and semantic search example using `SentenceTransformersDocumentEmbedder`.
+```python
+from haystack import Pipeline
+from haystack.components.converters import PyPDFToDocument
+from haystack.components.embedders import (
+    SentenceTransformersDocumentEmbedder,
+    SentenceTransformersTextEmbedder,
+)
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_velesdb import VelesDBDocumentStore
+store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
+# Indexing pipeline
+indexer = Pipeline()
+indexer.add_component("converter", PyPDFToDocument())
+indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
+indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
+indexer.add_component("writer", DocumentWriter(document_store=store))
+indexer.connect("converter", "splitter")
+indexer.connect("splitter", "embedder")
+indexer.connect("embedder", "writer")
+indexer.run({"converter": {"sources": ["paper.pdf"]}})
+# Query: embed the question, then search the store directly.
+# (InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore, so it
+# cannot be wired to a VelesDBDocumentStore.)
+text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
+text_embedder.warm_up()
+query_embedding = text_embedder.run(text="What is VelesDB?")["embedding"]
+documents = store.embedding_retrieval(query_embedding=query_embedding, top_k=5)
+print(documents)
+```
+## API reference
+### `VelesDBDocumentStore`
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
+| `collection_name` | `"haystack_documents"` | VelesDB collection name |
+| `embedding_dim` | `768` | Embedding vector dimension |
+| `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
+### Methods
+| Method | Description |
+|--------|-------------|
+| `write_documents(documents, policy)` | Upsert documents; returns count written |
+| `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
+| `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
+| `count_documents()` | Total document count |
+| `delete_documents(document_ids)` | Delete by Haystack string IDs |
+| `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
+**Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
+and always overwrite on collision.  `FAIL` is fully enforced: a pre-scan is
+performed before writing and `DuplicateDocumentError` is raised if any document
+already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
+**Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
+integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots).  For a
+1 M-document collection the birthday-bound collision probability is roughly
+5 × 10⁻⁸ (n² / (2 · 2⁶³)), which is negligible for typical RAG workloads.  A
+`ValueError` is raised at write time if a collision is detected within a write
+batch or, with `DuplicatePolicy.FAIL`, between a new document and an existing one.
+**Note on `scale_score`:** When `True` (default), cosine similarity scores
+are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
+in downstream re-ranking.
+## Running tests
+```bash
+cd integrations/haystack
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+Tests use lightweight fake VelesDB objects — no running server required.

haystack_velesdb-1.14.1/README.md ADDED Viewed

@@ -0,0 +1,132 @@
+# haystack-velesdb
+A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
+the local-first, microsecond-latency vector database.
+This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
+connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
+## Installation
+```bash
+pip install haystack-velesdb
+```
+For development:
+```bash
+pip install -e "integrations/haystack[dev]"
+```
+## Quick start
+```python
+from haystack_velesdb import VelesDBDocumentStore
+from haystack.dataclasses import Document
+store = VelesDBDocumentStore(
+    path="./my_docs",
+    collection_name="knowledge_base",
+    embedding_dim=768,
+    metric="cosine",
+)
+# Write pre-embedded documents
+documents = [
+    Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
+    Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
+]
+store.write_documents(documents)
+# Retrieve by vector
+results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
+for doc in results:
+    print(doc.content, doc.score)
+```
+## Full RAG pipeline
+See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
+and semantic search example using `SentenceTransformersDocumentEmbedder`.
+```python
+from haystack import Pipeline
+from haystack.components.converters import PyPDFToDocument
+from haystack.components.embedders import (
+    SentenceTransformersDocumentEmbedder,
+    SentenceTransformersTextEmbedder,
+)
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_velesdb import VelesDBDocumentStore
+store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
+# Indexing pipeline
+indexer = Pipeline()
+indexer.add_component("converter", PyPDFToDocument())
+indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
+indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
+indexer.add_component("writer", DocumentWriter(document_store=store))
+indexer.connect("converter", "splitter")
+indexer.connect("splitter", "embedder")
+indexer.connect("embedder", "writer")
+indexer.run({"converter": {"sources": ["paper.pdf"]}})
+# Query: embed the question, then search the store directly.
+# (InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore, so it
+# cannot be wired to a VelesDBDocumentStore.)
+text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
+text_embedder.warm_up()
+query_embedding = text_embedder.run(text="What is VelesDB?")["embedding"]
+documents = store.embedding_retrieval(query_embedding=query_embedding, top_k=5)
+print(documents)
+```
+## API reference
+### `VelesDBDocumentStore`
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
+| `collection_name` | `"haystack_documents"` | VelesDB collection name |
+| `embedding_dim` | `768` | Embedding vector dimension |
+| `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
+### Methods
+| Method | Description |
+|--------|-------------|
+| `write_documents(documents, policy)` | Upsert documents; returns count written |
+| `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
+| `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
+| `count_documents()` | Total document count |
+| `delete_documents(document_ids)` | Delete by Haystack string IDs |
+| `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
+**Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
+and always overwrite on collision.  `FAIL` is fully enforced: a pre-scan is
+performed before writing and `DuplicateDocumentError` is raised if any document
+already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
+**Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
+integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots).  For a
+1 M-document collection the birthday-bound collision probability is roughly
+5 × 10⁻⁸ (n² / (2 · 2⁶³)), which is negligible for typical RAG workloads.  A
+`ValueError` is raised at write time if a collision is detected within a write
+batch or, with `DuplicatePolicy.FAIL`, between a new document and an existing one.
+**Note on `scale_score`:** When `True` (default), cosine similarity scores
+are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
+in downstream re-ranking.
+## Running tests
+```bash
+cd integrations/haystack
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+Tests use lightweight fake VelesDB objects — no running server required.

haystack_velesdb-1.14.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,44 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "haystack-velesdb"
+version = "1.14.1"
+description = "Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database."
+readme = "README.md"
+license = {text = "MIT"}
+authors = [
+    {name = "VelesDB Team", email = "contact@wiscale.fr"}
+]
+keywords = ["haystack", "velesdb", "vector-database", "embeddings", "rag", "local-first", "semantic-search"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+requires-python = ">=3.9"
+dependencies = [
+    "haystack-ai>=2.0.0",
+    "velesdb>=1.13.2",
+    "velesdb-common>=1.13.2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0,<9.0",
+]
+[project.urls]
+Homepage = "https://github.com/cyberlife-coder/VelesDB"
+Documentation = "https://velesdb.com/docs/integrations/haystack"
+Repository = "https://github.com/cyberlife-coder/VelesDB"
+[tool.setuptools.packages.find]
+where = ["src"]

haystack_velesdb-1.14.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

haystack_velesdb-1.14.1/src/haystack_velesdb/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Haystack 2.x DocumentStore integration for VelesDB."""
+from haystack_velesdb.document_store import VelesDBDocumentStore
+__all__ = ["VelesDBDocumentStore"]
+__version__ = "1.0.0"

haystack_velesdb-1.14.1/src/haystack_velesdb/document_store.py ADDED Viewed

@@ -0,0 +1,350 @@
+"""Haystack 2.x DocumentStore backed by VelesDB.
+Implements the Haystack ``DocumentStore`` protocol so VelesDB can be used
+as the vector backend in any Haystack 2.x indexing or retrieval pipeline.
+"""
+from __future__ import annotations
+import hashlib
+import logging
+from typing import Any, Dict, List, Optional
+from haystack import default_from_dict, default_to_dict
+from haystack.dataclasses import Document
+from haystack.document_stores.errors import DuplicateDocumentError
+from haystack.document_stores.types import DuplicatePolicy
+import velesdb
+from velesdb_common.security import (
+    validate_collection_name,
+    validate_metric,
+    validate_path,
+)
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+__all__ = ["VelesDBDocumentStore"]
+# Constructor defaults for VelesDBDocumentStore.
+_DEFAULT_COLLECTION = "haystack_documents"
+_DEFAULT_DIMENSION = 768
+_DEFAULT_METRIC = "cosine"
+# Default cap on the number of documents filter_documents() returns.
+_DEFAULT_SCROLL_LIMIT = 10_000
+# Keeps only the low 63 bits, so hashed integer IDs are never negative.
+_INT63_MASK = (1 << 63) - 1
+# Reserved keys stored by this integration in the VelesDB payload.
+_RESERVED_PAYLOAD_KEYS = frozenset({"_doc_id", "content"})
+def _str_id_to_int(doc_id: str) -> int:
+    """Map a Haystack string document ID to a stable positive 63-bit integer.
+    Uses the first 8 bytes of SHA-256, masked to 63 bits (~9.2 × 10¹⁸ slots).
+    By the birthday bound, n² / (2 · 2⁶³), the probability of at least one
+    collision in a 1 M-document collection is roughly 5 × 10⁻⁸ —
+    negligible for typical RAG workloads but not zero.  If two distinct string
+    IDs produce the same integer ID, :meth:`write_documents` raises
+    :class:`ValueError` when it detects the clash: always within a single
+    write batch, and against already-stored points when the duplicate
+    pre-check runs (``DuplicatePolicy.FAIL``).
+    """
+    return int.from_bytes(hashlib.sha256(doc_id.encode()).digest()[:8], "big") & _INT63_MASK
+def _doc_to_point(doc: Document) -> dict:
+    """Convert a Haystack Document to a VelesDB point dict.
+    Reserved payload keys (``_doc_id``, ``content``) are always written from
+    the document's canonical fields, not from ``doc.meta``.  Any meta entry
+    that shares a reserved name is silently dropped from the payload to
+    prevent round-trip corruption.
+    """
+    payload: dict = {}
+    # Merge meta first; reserved keys are excluded so they cannot
+    # clobber the canonical doc identity written below.
+    if doc.meta:
+        for k, v in doc.meta.items():
+            if k not in _RESERVED_PAYLOAD_KEYS:
+                payload[k] = v
+    payload["_doc_id"] = doc.id
+    if doc.content is not None:
+        payload["content"] = doc.content
+    point: dict = {"id": _str_id_to_int(doc.id), "payload": payload}
+    if doc.embedding is not None:
+        point["vector"] = list(doc.embedding)
+    return point
+def _result_to_doc(
+    result: dict, *, scale_score: bool = False, metric: str = "cosine"
+) -> Document:
+    """Convert a VelesDB search or scroll result to a Haystack Document.
+    Requires ``_doc_id`` to be present in the payload. Points written by
+    :meth:`VelesDBDocumentStore.write_documents` always carry that key, so
+    a missing ``_doc_id`` means the underlying VelesDB collection was
+    populated by a different code path (raw ``col.upsert``, migration
+    scripts, mixed tooling). Falling back to the stringified integer ID
+    would silently corrupt :meth:`delete_documents`: the integer-as-string
+    re-hashes through SHA-256 to a *different* integer, so the delete
+    would no-op without raising. We fail fast instead.
+    Raises:
+        ValueError: When ``_doc_id`` is missing from the payload.
+    """
+    payload = result.get("payload", {})
+    doc_id = payload.get("_doc_id")
+    if doc_id is None:
+        raise ValueError(
+            f"VelesDB point id={result.get('id')} has no '_doc_id' field in "
+            "its payload. VelesDBDocumentStore requires every point in the "
+            "underlying collection to be written via write_documents(); "
+            "points populated by raw col.upsert() or external migration "
+            "scripts cannot be round-tripped because the stringified "
+            "integer ID would re-hash to a different integer and break "
+            "delete_documents()."
+        )
+    content = payload.get("content")
+    meta = {k: v for k, v in payload.items() if k not in _RESERVED_PAYLOAD_KEYS}
+    raw_score: Optional[float] = result.get("score")
+    if scale_score and raw_score is not None and metric == "cosine":
+        # Normalise cosine similarity from [-1, 1] to [0, 1].
+        # Only meaningful for cosine; l2 and dot scores have different ranges.
+        score: Optional[float] = (raw_score + 1.0) / 2.0
+    else:
+        score = raw_score
+    return Document(id=doc_id, content=content, meta=meta, score=score)
+def _build_int_id_map(documents: List[Document]) -> Dict[int, str]:
+    """Map every document's integer ID back to its string ID, raising on
+    in-batch SHA-256 collisions.
+    Two distinct string IDs that hash to the same 63-bit integer would
+    silently overwrite each other on upsert. This helper is the first
+    line of defence: it detects collisions inside a single
+    ``write_documents`` batch before any state hits the collection.
+    """
+    int_id_map: Dict[int, str] = {}
+    for doc in documents:
+        iid = _str_id_to_int(doc.id)
+        if iid in int_id_map and int_id_map[iid] != doc.id:
+            raise ValueError(
+                f"SHA-256 collision in write batch: '{int_id_map[iid]}' and "
+                f"'{doc.id}' map to the same integer ID {iid}. "
+                "Rename one of the documents."
+            )
+        int_id_map[iid] = doc.id
+    return int_id_map
+def _enforce_fail_policy(col: Any, int_id_map: Dict[int, str]) -> None:
+    """For ``DuplicatePolicy.FAIL``, raise if any incoming integer ID
+    already exists in the collection, or if a stored point points to a
+    different string ID (cross-store SHA-256 collision).
+    Uses point-by-point ``col.get(int_ids)`` — O(batch_size) — instead of
+    a full scroll, so collections larger than ``scroll_limit`` are still
+    correctly enforced.
+    """
+    existing_points: List[Any] = col.get(list(int_id_map.keys()))
+    conflicts: List[str] = []
+    for point in existing_points:
+        if point is None:
+            continue
+        iid = point["id"]
+        existing_str = point.get("payload", {}).get("_doc_id", str(iid))
+        str_id = int_id_map[iid]
+        if existing_str != str_id:
+            raise ValueError(
+                f"SHA-256 collision on write: incoming document '{str_id}' "
+                f"maps to the same integer ID {iid} as existing document "
+                f"'{existing_str}'. Rename one of the documents."
+            )
+        conflicts.append(str_id)
+    if conflicts:
+        raise DuplicateDocumentError(
+            f"Documents already exist (policy=FAIL): {conflicts}"
+        )
+def _documents_to_points(documents: List[Document]) -> List[dict]:
+    """Convert each document to its VelesDB point dict, logging documents
+    that lack an embedding so the caller still gets feedback when the
+    underlying SDK accepts vector-less points.
+    """
+    points: List[dict] = []
+    for doc in documents:
+        if doc.embedding is None:
+            logger.warning(
+                "Document '%s' has no embedding; stored without vector.", doc.id
+            )
+        points.append(_doc_to_point(doc))
+    return points
+class VelesDBDocumentStore:
+    """Haystack 2.x DocumentStore backed by a local VelesDB collection.
+    Stores documents (with optional embeddings) in VelesDB and exposes the
+    standard Haystack retrieval interface so this store works as a drop-in
+    backend for ``EmbeddingRetriever`` and similar pipeline components.
+    Args:
+        path: Directory path where VelesDB persists data.
+        collection_name: Name of the VelesDB collection to use.
+        embedding_dim: Dimensionality of the embedding vectors.
+        metric: Distance metric: ``"cosine"``, ``"euclidean"``, or ``"dot"``.
+        scroll_limit: Maximum documents returned by :meth:`filter_documents`.
+            Increase this value when your collection exceeds 10 000 documents.
+    """
+    def __init__(  # pylint: disable=too-many-arguments,too-many-positional-arguments
+        self,
+        path: str = "./velesdb_haystack",
+        collection_name: str = _DEFAULT_COLLECTION,
+        embedding_dim: int = _DEFAULT_DIMENSION,
+        metric: str = _DEFAULT_METRIC,
+        scroll_limit: int = _DEFAULT_SCROLL_LIMIT,
+    ) -> None:
+        self._path = validate_path(path)
+        self._collection_name = validate_collection_name(collection_name)
+        self._embedding_dim = embedding_dim
+        self._metric = validate_metric(metric)
+        self._scroll_limit = scroll_limit
+        self._db: Optional[Any] = None
+        self._collection: Optional[Any] = None
+    # ------------------------------------------------------------------
+    # Internal connection management
+    # ------------------------------------------------------------------
+    def _get_collection(self) -> Any:
+        """Return the VelesDB collection, opening or creating it as needed."""
+        if self._db is None:
+            self._db = velesdb.Database(self._path)
+        if self._collection is None:
+            col: Optional[Any] = None
+            try:
+                col = self._db.get_collection(self._collection_name)
+            except KeyError:
+                pass
+            if col is None:
+                col = self._db.create_collection(
+                    self._collection_name,
+                    dimension=self._embedding_dim,
+                    metric=self._metric,
+                )
+            self._collection = col
+        return self._collection
+    # ------------------------------------------------------------------
+    # DocumentStore protocol
+    # ------------------------------------------------------------------
+    def count_documents(self) -> int:
+        """Return the total number of documents in the store."""
+        result = self._get_collection().count()
+        return result if isinstance(result, int) else 0
+    def filter_documents(
+        self,
+        filters: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
+        """Return documents matching *filters*, or all documents when *None*.
+        Passes *filters* directly to VelesDB's scroll operation. The real
+        SDK returns ``Iterator[List[Dict]]`` and has no ``limit`` kwarg, so
+        we drive the iterator ourselves and stop once ``self._scroll_limit``
+        documents have been collected. Increase ``scroll_limit`` on the
+        constructor for collections larger than the default 10 000.
+        """
+        col = self._get_collection()
+        documents: List[Document] = []
+        for batch in col.scroll(filter=filters):
+            for raw in batch:
+                if len(documents) >= self._scroll_limit:
+                    return documents
+                documents.append(_result_to_doc(raw))
+        return documents
+    def write_documents(
+        self,
+        documents: List[Document],
+        policy: DuplicatePolicy = DuplicatePolicy.NONE,
+    ) -> int:
+        """Write *documents* to VelesDB and return the number written.
+        VelesDB upsert semantics apply for policies other than ``FAIL``:
+        an existing point with the same integer ID is overwritten.
+        When *policy* is ``DuplicatePolicy.FAIL`` this method scans the
+        collection before writing and raises :class:`DuplicateDocumentError`
+        if any incoming document already exists.  For large collections
+        prefer ``OVERWRITE`` or ``NONE`` to avoid the pre-scan cost.
+        Raises:
+            DuplicateDocumentError: When *policy* is ``FAIL`` and at least
+                one document already exists in the store.
+            ValueError: When a SHA-256 hash collision is detected — two
+                distinct string IDs that map to the same integer ID.
+        """
+        if not documents:
+            return 0
+        int_id_map = _build_int_id_map(documents)
+        col = self._get_collection()
+        if policy == DuplicatePolicy.FAIL:
+            _enforce_fail_policy(col, int_id_map)
+        points = _documents_to_points(documents)
+        result = col.upsert(points)
+        return result if isinstance(result, int) else len(points)
+    def delete_documents(
+        self,
+        document_ids: Optional[List[str]] = None,
+    ) -> None:
+        """Delete documents identified by their Haystack string IDs."""
+        if not document_ids:
+            return
+        int_ids = [_str_id_to_int(did) for did in document_ids]
+        self._get_collection().delete(int_ids)
+    def embedding_retrieval(
+        self,
+        query_embedding: List[float],
+        *,
+        top_k: int = 10,
+        filters: Optional[Dict[str, Any]] = None,
+        scale_score: bool = True,
+    ) -> List[Document]:
+        """Return the *top_k* documents most similar to *query_embedding*.
+        Args:
+            query_embedding: Dense query vector.
+            top_k: Maximum number of documents to return.
+            filters: Optional VelesDB filter dict to restrict the search space.
+            scale_score: When ``True`` and ``metric="cosine"``, scores are
+                normalised from ``[-1, 1]`` to ``[0, 1]``. Ignored for other
+                metrics, where raw scores are returned unchanged.
+        """
+        results: List[dict] = self._get_collection().search(
+            vector=query_embedding,
+            top_k=top_k,
+            filter=filters,
+        )
+        return [_result_to_doc(r, scale_score=scale_score, metric=self._metric) for r in results]
+    # ------------------------------------------------------------------
+    # Haystack pipeline serialisation
+    # ------------------------------------------------------------------
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialise the store configuration for Haystack pipeline YAML."""
+        return default_to_dict(
+            self,
+            path=self._path,
+            collection_name=self._collection_name,
+            embedding_dim=self._embedding_dim,
+            metric=self._metric,
+            scroll_limit=self._scroll_limit,
+        )
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "VelesDBDocumentStore":
+        """Restore a store instance from a Haystack pipeline config dict."""
+        return default_from_dict(cls, data)

haystack_velesdb-1.14.1/src/haystack_velesdb/py.typed ADDED Viewed

File without changes

haystack_velesdb-1.14.1/src/haystack_velesdb.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,161 @@
+Metadata-Version: 2.4
+Name: haystack-velesdb
+Version: 1.14.1
+Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
+Author-email: VelesDB Team <contact@wiscale.fr>
+License: MIT
+Project-URL: Homepage, https://github.com/cyberlife-coder/VelesDB
+Project-URL: Documentation, https://velesdb.com/docs/integrations/haystack
+Project-URL: Repository, https://github.com/cyberlife-coder/VelesDB
+Keywords: haystack,velesdb,vector-database,embeddings,rag,local-first,semantic-search
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: haystack-ai>=2.0.0
+Requires-Dist: velesdb>=1.13.2
+Requires-Dist: velesdb-common>=1.13.2
+Provides-Extra: dev
+Requires-Dist: pytest<9.0,>=7.0; extra == "dev"
+Dynamic: license-file
+# haystack-velesdb
+A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
+the local-first, microsecond-latency vector database.
+This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
+connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
+## Installation
+```bash
+pip install haystack-velesdb
+```
+For development:
+```bash
+pip install -e "integrations/haystack[dev]"
+```
+## Quick start
+```python
+from haystack_velesdb import VelesDBDocumentStore
+from haystack.dataclasses import Document
+store = VelesDBDocumentStore(
+    path="./my_docs",
+    collection_name="knowledge_base",
+    embedding_dim=768,
+    metric="cosine",
+)
+# Write pre-embedded documents
+documents = [
+    Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
+    Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
+]
+store.write_documents(documents)
+# Retrieve by vector
+results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
+for doc in results:
+    print(doc.content, doc.score)
+```
+## Full RAG pipeline
+See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
+and semantic search example using `SentenceTransformersDocumentEmbedder`.
+```python
+from haystack import Pipeline
+from haystack.components.converters import PyPDFToDocument
+from haystack.components.embedders import (
+    SentenceTransformersDocumentEmbedder,
+    SentenceTransformersTextEmbedder,
+)
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_velesdb import VelesDBDocumentStore
+store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
+# Indexing pipeline
+indexer = Pipeline()
+indexer.add_component("converter", PyPDFToDocument())
+indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
+indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
+indexer.add_component("writer", DocumentWriter(document_store=store))
+indexer.connect("converter", "splitter")
+indexer.connect("splitter", "embedder")
+indexer.connect("embedder", "writer")
+indexer.run({"converter": {"sources": ["paper.pdf"]}})
+# Query: embed the question, then search the store directly.
+# (InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore, so it
+# cannot be wired to a VelesDBDocumentStore.)
+text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
+text_embedder.warm_up()
+query_embedding = text_embedder.run(text="What is VelesDB?")["embedding"]
+documents = store.embedding_retrieval(query_embedding=query_embedding, top_k=5)
+print(documents)
+```
+## API reference
+### `VelesDBDocumentStore`
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
+| `collection_name` | `"haystack_documents"` | VelesDB collection name |
+| `embedding_dim` | `768` | Embedding vector dimension |
+| `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
+### Methods
+| Method | Description |
+|--------|-------------|
+| `write_documents(documents, policy)` | Upsert documents; returns count written |
+| `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
+| `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
+| `count_documents()` | Total document count |
+| `delete_documents(document_ids)` | Delete by Haystack string IDs |
+| `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
+**Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
+and always overwrite on collision.  `FAIL` is fully enforced: a pre-scan is
+performed before writing and `DuplicateDocumentError` is raised if any document
+already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
+**Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
+integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots).  For a
+1 M-document collection the birthday-bound collision probability is roughly
+5 × 10⁻⁸ (n² / (2 · 2⁶³)), which is negligible for typical RAG workloads.  A
+`ValueError` is raised at write time if a collision is detected within a write
+batch or, with `DuplicatePolicy.FAIL`, between a new document and an existing one.
+**Note on `scale_score`:** When `True` (default), cosine similarity scores
+are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
+in downstream re-ranking.
+## Running tests
+```bash
+cd integrations/haystack
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+Tests use lightweight fake VelesDB objects — no running server required.

haystack_velesdb-1.14.1/src/haystack_velesdb.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,12 @@
+LICENSE
+README.md
+pyproject.toml
+src/haystack_velesdb/__init__.py
+src/haystack_velesdb/document_store.py
+src/haystack_velesdb/py.typed
+src/haystack_velesdb.egg-info/PKG-INFO
+src/haystack_velesdb.egg-info/SOURCES.txt
+src/haystack_velesdb.egg-info/dependency_links.txt
+src/haystack_velesdb.egg-info/requires.txt
+src/haystack_velesdb.egg-info/top_level.txt
+tests/test_document_store.py

haystack_velesdb-1.14.1/src/haystack_velesdb.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

haystack_velesdb-1.14.1/src/haystack_velesdb.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,6 @@
+haystack-ai>=2.0.0
+velesdb>=1.13.2
+velesdb-common>=1.13.2
+[dev]
+pytest<9.0,>=7.0

haystack_velesdb-1.14.1/src/haystack_velesdb.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ haystack_velesdb

haystack_velesdb-1.14.1/tests/test_document_store.py ADDED Viewed

@@ -0,0 +1,400 @@
+"""Unit tests for VelesDBDocumentStore.
+All external dependencies (haystack, velesdb) are replaced with lightweight
+stubs so no server or framework install is required to run the suite.
+"""
+from __future__ import annotations
+import importlib.util
+import sys
+import types
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+# ---------------------------------------------------------------------------
+# Haystack 2.x stubs — mirror the public API surface used by document_store.py
+# ---------------------------------------------------------------------------
+@dataclass
+class Document:  # minimal stand-in registered as haystack.dataclasses.Document by _load_module
+    id: str = ""  # Haystack string identifier
+    content: Optional[str] = None
+    embedding: Optional[List[float]] = None
+    meta: Dict[str, Any] = field(default_factory=dict)  # default_factory gives each instance its own dict
+    score: Optional[float] = None  # asserted on embedding_retrieval results in the tests below
+class DuplicatePolicy(Enum):  # stand-in registered under haystack.document_stores.types by _load_module
+    NONE = "none"
+    SKIP = "skip"
+    OVERWRITE = "overwrite"
+    FAIL = "fail"  # the only policy the tests in this file pass explicitly
+class DuplicateDocumentError(Exception):  # stand-in registered under haystack.document_stores.errors by _load_module
+    pass
+# ---------------------------------------------------------------------------
+# Fake VelesDB objects — deterministic, no I/O
+# ---------------------------------------------------------------------------
+class _FakeCollection:  # in-memory stand-in for a VelesDB collection, backed by a plain dict
+    def __init__(self) -> None:
+        self._points: dict = {}  # int_id -> point dict
+    def upsert(self, points: list) -> int:  # insert-or-replace keyed on p["id"]
+        for p in points:
+            self._points[p["id"]] = p
+        return len(points)  # number of points passed in, not the number newly created
+    def get(self, int_ids: list) -> list:  # one entry per requested id, None where the id is unknown
+        return [
+            {"id": iid, "payload": self._points[iid].get("payload", {})}
+            if iid in self._points else None
+            for iid in int_ids
+        ]
+    # `filter=` mirrors the public velesdb SDK kwarg name on Collection.search /
+    # Collection.scroll; renaming it would break the kwargs contract under test.
+    def search(  # pylint: disable=redefined-builtin
+        self, vector: list, top_k: int = 10, filter: Any = None
+    ) -> list:
+        del vector, filter  # the fake ignores these
+        return [
+            {"id": p["id"], "score": 0.9, "payload": p.get("payload", {})}  # every hit gets the same fixed score
+            for p in list(self._points.values())[:top_k]  # first top_k stored points; no similarity ranking
+        ]
+    def scroll(  # pylint: disable=redefined-builtin
+        self,
+        *,
+        batch_size: int = 100,
+        filter: Any = None,
+        as_dataframe: bool = False,
+        backend: str = "pandas",
+    ) -> Any:
+        """Match the real velesdb SDK signature: kwargs-only, returns
+        Iterator[List[Dict]]. The real SDK has no ``limit`` kwarg — callers
+        drive the iterator and stop themselves.
+        """
+        del filter, as_dataframe, backend  # the fake ignores these
+        all_points = [
+            {"id": p["id"], "score": None, "payload": p.get("payload", {})}
+            for p in self._points.values()
+        ]
+        for offset in range(0, len(all_points), batch_size):  # an empty collection yields no batches
+            yield all_points[offset : offset + batch_size]  # the last batch may be shorter than batch_size
+    def delete(self, int_ids: list) -> None:
+        for iid in int_ids:
+            self._points.pop(iid, None)  # unknown ids are silently ignored
+    def count(self) -> int:
+        return len(self._points)
+class _FakeDatabase:  # in-memory stand-in installed as velesdb.Database by _load_module
+    def __init__(self, path: str) -> None:  # path is accepted but never used
+        self._collections: dict = {}  # collection name -> _FakeCollection
+    def get_collection(self, name: str) -> _FakeCollection:
+        if name not in self._collections:
+            raise KeyError(name)  # this fake raises for unknown names instead of returning None
+        return self._collections[name]
+    def create_collection(
+        self, name: str, dimension: int, metric: str
+    ) -> _FakeCollection:
+        col = _FakeCollection()  # dimension and metric are ignored by the fake
+        self._collections[name] = col  # replaces any existing collection with the same name
+        return col
+# ---------------------------------------------------------------------------
+# Module loader — inject stubs, load document_store from source
+# ---------------------------------------------------------------------------
+def _load_module() -> types.ModuleType:  # installs stub modules in sys.modules, then executes document_store.py from source
+    root = Path(__file__).resolve().parents[1] / "src" / "haystack_velesdb"  # parents[1] is the directory above tests/
+    haystack_pkg = types.ModuleType("haystack")
+    haystack_pkg.default_to_dict = lambda obj, **kw: {  # type: ignore[attr-defined]
+        "type": type(obj).__name__,
+        "init_parameters": kw,
+    }
+    haystack_pkg.default_from_dict = lambda cls, d: cls(  # type: ignore[attr-defined]
+        **d.get("init_parameters", {})
+    )
+    sys.modules["haystack"] = haystack_pkg  # overwrites any haystack module already present in this process
+    dc_mod = types.ModuleType("haystack.dataclasses")
+    dc_mod.Document = Document  # type: ignore[attr-defined]
+    sys.modules["haystack.dataclasses"] = dc_mod
+    ds_pkg = types.ModuleType("haystack.document_stores")
+    sys.modules["haystack.document_stores"] = ds_pkg
+    types_mod = types.ModuleType("haystack.document_stores.types")
+    types_mod.DuplicatePolicy = DuplicatePolicy  # type: ignore[attr-defined]
+    sys.modules["haystack.document_stores.types"] = types_mod
+    errors_mod = types.ModuleType("haystack.document_stores.errors")
+    errors_mod.DuplicateDocumentError = DuplicateDocumentError  # type: ignore[attr-defined]
+    sys.modules["haystack.document_stores.errors"] = errors_mod
+    sys.modules["velesdb"] = types.SimpleNamespace(Database=_FakeDatabase)  # type: ignore
+    # Stub velesdb_common.security with no-op validators (real package has its own tests).
+    def _passthrough(value: Any, *args: Any, **kwargs: Any) -> Any:  # returns its first argument unchanged
+        return value
+    vc_mod = types.ModuleType("velesdb_common")
+    sys.modules["velesdb_common"] = vc_mod
+    vc_sec = types.ModuleType("velesdb_common.security")
+    vc_sec.validate_path = _passthrough  # type: ignore[attr-defined]
+    vc_sec.validate_collection_name = _passthrough  # type: ignore[attr-defined]
+    vc_sec.validate_metric = _passthrough  # type: ignore[attr-defined]
+    vc_sec.SecurityError = ValueError  # type: ignore[attr-defined]
+    sys.modules["velesdb_common.security"] = vc_sec
+    pkg = types.ModuleType("haystack_velesdb")
+    pkg.__path__ = [str(root)]  # type: ignore[attr-defined]
+    sys.modules["haystack_velesdb"] = pkg  # bare stub package: the real __init__.py is not executed here
+    spec = importlib.util.spec_from_file_location(
+        "haystack_velesdb.document_store", root / "document_store.py"
+    )
+    assert spec and spec.loader
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules["haystack_velesdb.document_store"] = mod  # registered before the module body is executed
+    spec.loader.exec_module(mod)  # type: ignore[union-attr]
+    return mod
+_MOD = _load_module()  # runs once at import time; every test below shares this module object
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+def test_write_and_count() -> None:  # write_documents reports the number written and count_documents agrees
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_write")
+    docs = [
+        Document(id="a", content="alpha", embedding=[0.1, 0.2, 0.3]),
+        Document(id="b", content="beta", embedding=[0.4, 0.5, 0.6]),
+    ]
+    assert store.write_documents(docs) == 2  # called without a policy argument
+    assert store.count_documents() == 2
+def test_write_empty_returns_zero() -> None:  # an empty document list must be accepted and report 0 written
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_empty")
+    assert store.write_documents([]) == 0
+def test_embedding_retrieval_returns_documents() -> None:  # search hits come back as Documents with id and content restored
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_retrieval")
+    store.write_documents([Document(id="x", content="hello", embedding=[0.1, 0.2, 0.3])])
+    results = store.embedding_retrieval([0.1, 0.2, 0.3], top_k=5)  # the fake search ignores the query vector
+    assert len(results) >= 1
+    assert results[0].id == "x"
+    assert results[0].content == "hello"
+def test_scale_score_normalises_cosine() -> None:  # no metric is passed, so this relies on the store defaulting to cosine
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_score")
+    store.write_documents([Document(id="y", content="world", embedding=[1.0, 0.0])])
+    scaled = store.embedding_retrieval([1.0, 0.0], scale_score=True)
+    raw = store.embedding_retrieval([1.0, 0.0], scale_score=False)
+    assert scaled[0].score == (0.9 + 1.0) / 2.0  # exact float equality: assumes the store evaluates (score + 1) / 2 in this form — TODO confirm
+    assert raw[0].score == 0.9  # 0.9 is the fixed score returned by the fake search
+def test_filter_documents_returns_all_when_none() -> None:  # calling filter_documents with no arguments returns every stored document
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_filter")
+    store.write_documents([
+        Document(id="p", content="foo", embedding=[0.1, 0.2]),
+        Document(id="q", content="bar", embedding=[0.7, 0.8]),
+    ])
+    assert len(store.filter_documents()) == 2
+def test_filter_documents_passes_filter_to_scroll() -> None:  # smoke test: a non-None filters dict must not raise
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_filter_arg")
+    store.write_documents([
+        Document(id="fa", content="alpha", embedding=[0.1]),
+    ])
+    # The fake scroll discards its filter argument, so this only proves that a
+    # non-None filter does not raise; it cannot verify how the filter is forwarded.
+    results = store.filter_documents(filters={"field": "value"})
+    assert len(results) == 1  # the single stored document is returned because the fake applies no filtering
+def test_scale_score_not_applied_for_non_cosine_metric() -> None:  # only the euclidean metric is exercised here
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_score_nc", metric="euclidean")
+    store.write_documents([Document(id="z", content="raw", embedding=[1.0])])
+    scaled = store.embedding_retrieval([1.0], scale_score=True)
+    # For euclidean metric scale_score should be a no-op — raw score returned.
+    assert scaled[0].score == 0.9  # 0.9 is the fixed score returned by the fake search
+def test_scroll_limit_is_respected() -> None:  # scroll_limit must cap the number of documents filter_documents returns
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_limit", scroll_limit=1)
+    store.write_documents([
+        Document(id="r", content="one", embedding=[0.1]),
+        Document(id="s", content="two", embedding=[0.2]),
+    ])
+    # The fake scroll has no limit of its own, so with scroll_limit=1 the store itself must stop at 1 result.
+    assert len(store.filter_documents()) == 1
+def test_delete_documents() -> None:  # deleting by Haystack string id removes only the targeted document
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_delete")
+    store.write_documents([
+        Document(id="del1", content="remove me", embedding=[0.1, 0.2]),
+        Document(id="keep1", content="keep me", embedding=[0.3, 0.4]),
+    ])
+    assert store.count_documents() == 2
+    store.delete_documents(["del1"])
+    assert store.count_documents() == 1
+    remaining = store.filter_documents()
+    assert remaining[0].id == "keep1"  # the surviving document keeps its original string id
+def test_document_metadata_round_trips() -> None:  # user meta written with a document is returned by filter_documents
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_meta")
+    store.write_documents([
+        Document(id="m1", content="meta test", embedding=[0.5], meta={"source": "wiki"})
+    ])
+    docs = store.filter_documents()
+    assert docs[0].meta.get("source") == "wiki"
+def test_reserved_meta_keys_do_not_corrupt_payload() -> None:
+    """doc.meta containing reserved keys must not overwrite canonical fields."""
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_reserved")
+    # A user accidentally sets meta keys that clash with our reserved names.
+    store.write_documents([
+        Document(
+            id="safe",
+            content="real content",
+            embedding=[0.1],
+            meta={"_doc_id": "evil_id", "content": "evil content"},  # both keys collide with names the store reserves
+        )
+    ])
+    docs = store.filter_documents()
+    assert docs[0].id == "safe", "_doc_id must come from doc.id, not meta"
+    assert docs[0].content == "real content", "content must come from doc.content, not meta"
+    # Reserved keys should not leak back into meta on retrieval.
+    assert "_doc_id" not in docs[0].meta
+    assert "content" not in docs[0].meta
+def test_get_collection_catches_key_error_and_creates_collection() -> None:
+    """_get_collection catches KeyError from get_collection and falls back to create_collection."""
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_key_error_path")
+    # The fake raises KeyError for unknown collections; _get_collection should
+    # catch it and call create_collection instead of letting the error propagate.
+    assert store.count_documents() == 0  # a freshly created collection is empty
+    assert store._collection is not None  # reads a private attribute to confirm a collection was attached
+def test_write_documents_fail_policy_raises_on_duplicate() -> None:
+    """DuplicatePolicy.FAIL raises DuplicateDocumentError when a document already exists."""
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_fail_dup")
+    doc = Document(id="dup1", content="original", embedding=[0.1, 0.2])
+    store.write_documents([doc])  # first write, without a policy argument, seeds the collection
+    import pytest  # imported locally; the module's top-level imports do not include pytest
+    with pytest.raises(DuplicateDocumentError):  # the stub exception class that _load_module registered
+        store.write_documents([doc], policy=DuplicatePolicy.FAIL)
+def test_write_documents_fail_policy_succeeds_for_new_docs() -> None:
+    """DuplicatePolicy.FAIL succeeds when none of the documents already exist."""
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_fail_new")
+    doc = Document(id="new_only", content="fresh", embedding=[0.5])
+    result = store.write_documents([doc], policy=DuplicatePolicy.FAIL)  # collection is empty, so nothing can collide
+    assert result == 1
+    assert store.count_documents() == 1
+def test_serialisation_round_trip() -> None:  # to_dict / from_dict must preserve the non-default init parameters
+    store = _MOD.VelesDBDocumentStore(
+        path="/tmp/hs_serial",
+        collection_name="serial",
+        embedding_dim=384,
+        metric="euclidean",
+        scroll_limit=5_000,
+    )
+    d = store.to_dict()  # presumably built via the stubbed default_to_dict — verify against document_store.py
+    assert d["init_parameters"]["embedding_dim"] == 384
+    assert d["init_parameters"]["metric"] == "euclidean"
+    assert d["init_parameters"]["scroll_limit"] == 5_000
+    restored = _MOD.VelesDBDocumentStore.from_dict(d)
+    assert restored._embedding_dim == 384  # the restored store is checked through its private attributes
+    assert restored._metric == "euclidean"
+    assert restored._scroll_limit == 5_000
+def test_filter_documents_drives_scroll_iterator_across_batches() -> None:
+    """Regression: filter_documents must drive the Iterator returned by
+    Collection.scroll() (the real SDK returns Iterator[List[Dict]], it does
+    not return a flat list nor accept a 'limit' kwarg). With batch_size=100
+    in the fake, a 2-document collection yields a single 2-element batch,
+    and the helper must collect both.
+    """
+    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_iter_drive")
+    store.write_documents(
+        [
+            Document(id="i1", content="one", embedding=[0.1]),
+            Document(id="i2", content="two", embedding=[0.2]),
+        ]
+    )
+    docs = store.filter_documents()
+    assert {d.id for d in docs} == {"i1", "i2"}  # set comparison: result order is deliberately not asserted
+def test_result_to_doc_raises_on_missing_doc_id() -> None:
+    """Regression: a VelesDB point with no `_doc_id` payload key must raise
+    rather than silently fall back to str(int_id). The previous fallback
+    corrupted delete_documents() because str(int_id) re-hashes via SHA-256
+    to a different integer, so the delete would no-op without raising.
+    """
+    import pytest  # imported locally; the module's top-level imports do not include pytest
+    raw = {"id": 12345, "score": 0.9, "payload": {"content": "orphan"}}  # payload deliberately lacks "_doc_id"
+    with pytest.raises(ValueError, match="no '_doc_id'"):  # match is applied as a regex search on the error message
+        _MOD._result_to_doc(raw)  # calls the private module-level helper directly, without a store
+def test_get_collection_returns_none_and_creates_collection() -> None:
+    """_get_collection handles SDK returning None (the production SDK behavior)."""
+    class _FakeDatabaseReturnsNone:  # variant of _FakeDatabase whose get_collection never raises
+        def __init__(self, path: str) -> None:
+            self._collections: dict = {}
+        def get_collection(self, name: str) -> Optional[_FakeCollection]:
+            return None  # Real VelesDB SDK returns None for unknown collections.
+        def create_collection(
+            self, name: str, dimension: int, metric: str
+        ) -> _FakeCollection:
+            col = _FakeCollection()
+            self._collections[name] = col
+            return col
+    original_velesdb = _MOD.velesdb  # saved so the finally block can restore the shared module
+    try:
+        _MOD.velesdb = types.SimpleNamespace(Database=_FakeDatabaseReturnsNone)  # type: ignore
+        store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_none_path")
+        assert store.count_documents() == 0
+        assert store._collection is not None  # reads a private attribute to confirm a collection was attached
+    finally:
+        _MOD.velesdb = original_velesdb  # restored even when an assertion fails, so other tests see the default fake