haystack-velesdb 1.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 VelesDB Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.4
2
+ Name: haystack-velesdb
3
+ Version: 1.14.1
4
+ Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
5
+ Author-email: VelesDB Team <contact@wiscale.fr>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cyberlife-coder/VelesDB
8
+ Project-URL: Documentation, https://velesdb.com/docs/integrations/haystack
9
+ Project-URL: Repository, https://github.com/cyberlife-coder/VelesDB
10
+ Keywords: haystack,velesdb,vector-database,embeddings,rag,local-first,semantic-search
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: haystack-ai>=2.0.0
24
+ Requires-Dist: velesdb>=1.13.2
25
+ Requires-Dist: velesdb-common>=1.13.2
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest<9.0,>=7.0; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # haystack-velesdb
31
+
32
+ A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
33
+ the local-first, microsecond-latency vector database.
34
+
35
+ This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
36
+ connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install haystack-velesdb
42
+ ```
43
+
44
+ For development:
45
+
46
+ ```bash
47
+ pip install -e "integrations/haystack[dev]"
48
+ ```
49
+
50
+ ## Quick start
51
+
52
+ ```python
53
+ from haystack_velesdb import VelesDBDocumentStore
54
+ from haystack.dataclasses import Document
55
+
56
+ store = VelesDBDocumentStore(
57
+ path="./my_docs",
58
+ collection_name="knowledge_base",
59
+ embedding_dim=768,
60
+ metric="cosine",
61
+ )
62
+
63
+ # Write pre-embedded documents
64
+ documents = [
65
+ Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
66
+ Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
67
+ ]
68
+ store.write_documents(documents)
69
+
70
+ # Retrieve by vector
71
+ results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
72
+ for doc in results:
73
+ print(doc.content, doc.score)
74
+ ```
75
+
76
+ ## Full RAG pipeline
77
+
78
+ See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
79
+ and semantic search example using `SentenceTransformersDocumentEmbedder`.
80
+
81
+ ```python
82
+ from haystack import Pipeline
83
+ from haystack.components.converters import PyPDFToDocument
84
+ from haystack.components.embedders import (
85
+ SentenceTransformersDocumentEmbedder,
86
+ SentenceTransformersTextEmbedder,
87
+ )
88
+ from haystack.components.preprocessors import DocumentSplitter
89
+ from haystack.components.writers import DocumentWriter
90
+ from haystack_velesdb import VelesDBDocumentStore
91
+
92
+ store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
93
+
94
+ # Indexing pipeline
95
+ indexer = Pipeline()
96
+ indexer.add_component("converter", PyPDFToDocument())
97
+ indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
98
+ indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
99
+ indexer.add_component("writer", DocumentWriter(document_store=store))
100
+ indexer.connect("converter", "splitter")
101
+ indexer.connect("splitter", "embedder")
102
+ indexer.connect("embedder", "writer")
103
+ indexer.run({"converter": {"sources": ["paper.pdf"]}})
104
+
105
+ # Query: embed the question, then search the store directly.
106
+ # (InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore.)
107
+
108
+ text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
109
+ text_embedder.warm_up()
110
+ query_embedding = text_embedder.run(text="What is VelesDB?")["embedding"]
111
+ results = store.embedding_retrieval(query_embedding=query_embedding, top_k=5)
112
+ for doc in results:
113
+ print(doc.content, doc.score)
114
+ ```
115
+
116
+ ## API reference
117
+
118
+ ### `VelesDBDocumentStore`
119
+
120
+ | Parameter | Default | Description |
121
+ |-----------|---------|-------------|
122
+ | `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
123
+ | `collection_name` | `"haystack_documents"` | VelesDB collection name |
124
+ | `embedding_dim` | `768` | Embedding vector dimension |
125
+ | `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
126
+
127
+ ### Methods
128
+
129
+ | Method | Description |
130
+ |--------|-------------|
131
+ | `write_documents(documents, policy)` | Upsert documents; returns count written |
132
+ | `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
133
+ | `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
134
+ | `count_documents()` | Total document count |
135
+ | `delete_documents(document_ids)` | Delete by Haystack string IDs |
136
+ | `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
137
+
138
+ **Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
139
+ and always overwrite on collision. `FAIL` is fully enforced: a pre-scan is
140
+ performed before writing and `DuplicateDocumentError` is raised if any document
141
+ already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
142
+
143
+ **Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
144
+ integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots). For a
145
+ 1 M-document collection the collision probability is roughly 5 × 10⁻⁸, which
146
+ is negligible for typical RAG workloads. A `ValueError` is raised at write time
147
+ if a collision is detected within the batch or, under `FAIL`, against a stored document.
148
+
149
+ **Note on `scale_score`:** When `True` (default), cosine similarity scores
150
+ are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
151
+ in downstream re-ranking.
152
+
153
+ ## Running tests
154
+
155
+ ```bash
156
+ cd integrations/haystack
157
+ pip install -e ".[dev]"
158
+ pytest tests/ -v
159
+ ```
160
+
161
+ Tests use lightweight fake VelesDB objects — no running server required.
@@ -0,0 +1,132 @@
1
+ # haystack-velesdb
2
+
3
+ A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
4
+ the local-first, microsecond-latency vector database.
5
+
6
+ This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
7
+ connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install haystack-velesdb
13
+ ```
14
+
15
+ For development:
16
+
17
+ ```bash
18
+ pip install -e "integrations/haystack[dev]"
19
+ ```
20
+
21
+ ## Quick start
22
+
23
+ ```python
24
+ from haystack_velesdb import VelesDBDocumentStore
25
+ from haystack.dataclasses import Document
26
+
27
+ store = VelesDBDocumentStore(
28
+ path="./my_docs",
29
+ collection_name="knowledge_base",
30
+ embedding_dim=768,
31
+ metric="cosine",
32
+ )
33
+
34
+ # Write pre-embedded documents
35
+ documents = [
36
+ Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
37
+ Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
38
+ ]
39
+ store.write_documents(documents)
40
+
41
+ # Retrieve by vector
42
+ results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
43
+ for doc in results:
44
+ print(doc.content, doc.score)
45
+ ```
46
+
47
+ ## Full RAG pipeline
48
+
49
+ See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
50
+ and semantic search example using `SentenceTransformersDocumentEmbedder`.
51
+
52
+ ```python
53
+ from haystack import Pipeline
54
+ from haystack.components.converters import PyPDFToDocument
55
+ from haystack.components.embedders import (
56
+ SentenceTransformersDocumentEmbedder,
57
+ SentenceTransformersTextEmbedder,
58
+ )
59
+ from haystack.components.preprocessors import DocumentSplitter
60
+ from haystack.components.writers import DocumentWriter
61
+ from haystack_velesdb import VelesDBDocumentStore
62
+
63
+ store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
64
+
65
+ # Indexing pipeline
66
+ indexer = Pipeline()
67
+ indexer.add_component("converter", PyPDFToDocument())
68
+ indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
69
+ indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
70
+ indexer.add_component("writer", DocumentWriter(document_store=store))
71
+ indexer.connect("converter", "splitter")
72
+ indexer.connect("splitter", "embedder")
73
+ indexer.connect("embedder", "writer")
74
+ indexer.run({"converter": {"sources": ["paper.pdf"]}})
75
+
76
+ # Query: embed the question, then search the store directly.
77
+ # (InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore.)
78
+
79
+ text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
80
+ text_embedder.warm_up()
81
+ query_embedding = text_embedder.run(text="What is VelesDB?")["embedding"]
82
+ results = store.embedding_retrieval(query_embedding=query_embedding, top_k=5)
83
+ for doc in results:
84
+ print(doc.content, doc.score)
85
+ ```
86
+
87
+ ## API reference
88
+
89
+ ### `VelesDBDocumentStore`
90
+
91
+ | Parameter | Default | Description |
92
+ |-----------|---------|-------------|
93
+ | `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
94
+ | `collection_name` | `"haystack_documents"` | VelesDB collection name |
95
+ | `embedding_dim` | `768` | Embedding vector dimension |
96
+ | `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
97
+
98
+ ### Methods
99
+
100
+ | Method | Description |
101
+ |--------|-------------|
102
+ | `write_documents(documents, policy)` | Upsert documents; returns count written |
103
+ | `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
104
+ | `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
105
+ | `count_documents()` | Total document count |
106
+ | `delete_documents(document_ids)` | Delete by Haystack string IDs |
107
+ | `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
108
+
109
+ **Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
110
+ and always overwrite on collision. `FAIL` is fully enforced: a pre-scan is
111
+ performed before writing and `DuplicateDocumentError` is raised if any document
112
+ already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
113
+
114
+ **Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
115
+ integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots). For a
116
+ 1 M-document collection the collision probability is roughly 5 × 10⁻⁸, which
117
+ is negligible for typical RAG workloads. A `ValueError` is raised at write time
118
+ if a collision is detected within the batch or, under `FAIL`, against a stored document.
119
+
120
+ **Note on `scale_score`:** When `True` (default), cosine similarity scores
121
+ are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
122
+ in downstream re-ranking.
123
+
124
+ ## Running tests
125
+
126
+ ```bash
127
+ cd integrations/haystack
128
+ pip install -e ".[dev]"
129
+ pytest tests/ -v
130
+ ```
131
+
132
+ Tests use lightweight fake VelesDB objects — no running server required.
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "haystack-velesdb"
7
+ version = "1.14.1"
8
+ description = "Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database."
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ authors = [
12
+ {name = "VelesDB Team", email = "contact@wiscale.fr"}
13
+ ]
14
+ keywords = ["haystack", "velesdb", "vector-database", "embeddings", "rag", "local-first", "semantic-search"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ ]
26
+ requires-python = ">=3.9"
27
+ dependencies = [
28
+ "haystack-ai>=2.0.0",
29
+ "velesdb>=1.13.2",
30
+ "velesdb-common>=1.13.2",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=7.0,<9.0",
36
+ ]
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/cyberlife-coder/VelesDB"
40
+ Documentation = "https://velesdb.com/docs/integrations/haystack"
41
+ Repository = "https://github.com/cyberlife-coder/VelesDB"
42
+
43
+ [tool.setuptools.packages.find]
44
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
+ """Haystack 2.x DocumentStore integration for VelesDB."""
2
+
3
+ from haystack_velesdb.document_store import VelesDBDocumentStore
4
+
5
+ __all__ = ["VelesDBDocumentStore"]
6
# Keep in sync with ``version`` in pyproject.toml (the published distribution
# version); the previous value "1.0.0" had drifted from the 1.14.1 release.
__version__ = "1.14.1"
@@ -0,0 +1,350 @@
1
+ """Haystack 2.x DocumentStore backed by VelesDB.
2
+
3
+ Implements the Haystack ``DocumentStore`` protocol so VelesDB can be used
4
+ as the vector backend in any Haystack 2.x indexing or retrieval pipeline.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import logging
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from haystack import default_from_dict, default_to_dict
13
+ from haystack.dataclasses import Document
14
+ from haystack.document_stores.errors import DuplicateDocumentError
15
+ from haystack.document_stores.types import DuplicatePolicy
16
+
17
+ import velesdb
18
+ from velesdb_common.security import (
19
+ validate_collection_name,
20
+ validate_metric,
21
+ validate_path,
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ __all__ = ["VelesDBDocumentStore"]
27
+
28
+ _DEFAULT_COLLECTION = "haystack_documents"
29
+ _DEFAULT_DIMENSION = 768
30
+ _DEFAULT_METRIC = "cosine"
31
+ _DEFAULT_SCROLL_LIMIT = 10_000
32
+ _INT63_MASK = (1 << 63) - 1
33
+ # Reserved keys stored by this integration in the VelesDB payload.
34
+ _RESERVED_PAYLOAD_KEYS = frozenset({"_doc_id", "content"})
35
+
36
+
37
def _str_id_to_int(doc_id: str) -> int:
    """Hash a Haystack string ID down to a stable, non-negative 63-bit integer.

    The result is the big-endian integer value of the first 8 bytes of the
    SHA-256 digest of the (UTF-8 encoded) ID, with the top bit cleared by
    ``_INT63_MASK``. That leaves 2**63 (~9.2 × 10¹⁸) possible values.

    By the birthday bound, the chance of at least one collision among
    1 M distinct IDs is about n² / (2 · 2⁶³) ≈ 5 × 10⁻⁸ — small, but not
    zero. This function does not detect collisions itself: in this module
    they are caught inside a write batch by ``_build_int_id_map`` and
    against stored points by ``_enforce_fail_policy`` (``FAIL`` policy),
    both of which raise :class:`ValueError`.
    """
    digest = hashlib.sha256(doc_id.encode()).digest()
    leading_bytes = digest[:8]
    as_unsigned = int.from_bytes(leading_bytes, "big")
    # Clear the sign bit so the ID always fits a positive signed 64-bit int.
    return as_unsigned & _INT63_MASK
47
+
48
+
49
def _doc_to_point(doc: Document) -> dict:
    """Build the VelesDB point dict for a single Haystack Document.

    The payload is assembled from ``doc.meta`` minus any key listed in
    ``_RESERVED_PAYLOAD_KEYS``; the reserved keys are then filled from the
    document's own fields (``_doc_id`` from ``doc.id``, ``content`` from
    ``doc.content`` when it is not ``None``). A meta entry that reuses a
    reserved name is therefore dropped without warning, so it can never
    shadow the document's identity or content on the way back out.

    The returned dict always has ``"id"`` and ``"payload"``; ``"vector"``
    is only present when the document carries an embedding.
    """
    user_meta = doc.meta or {}
    payload: dict = {
        key: value
        for key, value in user_meta.items()
        if key not in _RESERVED_PAYLOAD_KEYS
    }
    # Canonical fields are written after the meta so they always win.
    payload["_doc_id"] = doc.id
    if doc.content is not None:
        payload["content"] = doc.content

    point: dict = {"id": _str_id_to_int(doc.id), "payload": payload}
    if doc.embedding is not None:
        point["vector"] = list(doc.embedding)
    return point
71
+
72
+
73
def _result_to_doc(
    result: dict, *, scale_score: bool = False, metric: str = "cosine"
) -> Document:
    """Convert a VelesDB search or scroll result to a Haystack Document.

    The document ID is read from the ``_doc_id`` payload field, which
    ``_doc_to_point`` writes for every document stored through this
    integration. A result without it (including a result whose payload is
    missing or ``None``) was populated by some other code path. Falling
    back to the stringified integer ID would be unsafe: hashing that string
    again yields a *different* integer, so a later ``delete_documents``
    call would target the wrong point. The function therefore fails fast.

    Args:
        result: Raw result dict; ``payload``, ``score`` and ``id`` are read.
        scale_score: When ``True`` and *metric* is ``"cosine"``, map the
            score from ``[-1, 1]`` to ``[0, 1]``.
        metric: Metric the score was computed with.

    Returns:
        A Document with ``id``, ``content``, ``meta`` (payload minus the
        reserved keys) and ``score`` set. The embedding is not restored.

    Raises:
        ValueError: When ``_doc_id`` is missing from the payload.
    """
    # ``or {}`` also covers a payload key that is present but None, so the
    # caller gets the explanatory ValueError below instead of an
    # AttributeError on ``None.get``.
    payload = result.get("payload") or {}
    doc_id = payload.get("_doc_id")
    if doc_id is None:
        raise ValueError(
            f"VelesDB point id={result.get('id')} has no '_doc_id' field in "
            "its payload. VelesDBDocumentStore requires every point in the "
            "underlying collection to be written via write_documents(); "
            "points populated by raw col.upsert() or external migration "
            "scripts cannot be round-tripped because the stringified "
            "integer ID would re-hash to a different integer and break "
            "delete_documents()."
        )

    meta = {k: v for k, v in payload.items() if k not in _RESERVED_PAYLOAD_KEYS}

    raw_score: Optional[float] = result.get("score")
    score: Optional[float] = raw_score
    if scale_score and raw_score is not None and metric == "cosine":
        # Normalise cosine similarity from [-1, 1] to [0, 1]. Euclidean and
        # dot-product scores have different ranges and are left untouched.
        score = (raw_score + 1.0) / 2.0

    return Document(
        id=doc_id, content=payload.get("content"), meta=meta, score=score
    )
112
+
113
+
114
+ def _build_int_id_map(documents: List[Document]) -> Dict[int, str]:
115
+ """Map every document's integer ID back to its string ID, raising on
116
+ in-batch SHA-256 collisions.
117
+
118
+ Two distinct string IDs that hash to the same 63-bit integer would
119
+ silently overwrite each other on upsert. This helper is the first
120
+ line of defence: it detects collisions inside a single
121
+ ``write_documents`` batch before any state hits the collection.
122
+ """
123
+ int_id_map: Dict[int, str] = {}
124
+ for doc in documents:
125
+ iid = _str_id_to_int(doc.id)
126
+ if iid in int_id_map and int_id_map[iid] != doc.id:
127
+ raise ValueError(
128
+ f"SHA-256 collision in write batch: '{int_id_map[iid]}' and "
129
+ f"'{doc.id}' map to the same integer ID {iid}. "
130
+ "Rename one of the documents."
131
+ )
132
+ int_id_map[iid] = doc.id
133
+ return int_id_map
134
+
135
+
136
+ def _enforce_fail_policy(col: Any, int_id_map: Dict[int, str]) -> None:
137
+ """For ``DuplicatePolicy.FAIL``, raise if any incoming integer ID
138
+ already exists in the collection, or if a stored point points to a
139
+ different string ID (cross-store SHA-256 collision).
140
+
141
+ Uses point-by-point ``col.get(int_ids)`` — O(batch_size) — instead of
142
+ a full scroll, so collections larger than ``scroll_limit`` are still
143
+ correctly enforced.
144
+ """
145
+ existing_points: List[Any] = col.get(list(int_id_map.keys()))
146
+ conflicts: List[str] = []
147
+ for point in existing_points:
148
+ if point is None:
149
+ continue
150
+ iid = point["id"]
151
+ existing_str = point.get("payload", {}).get("_doc_id", str(iid))
152
+ str_id = int_id_map[iid]
153
+ if existing_str != str_id:
154
+ raise ValueError(
155
+ f"SHA-256 collision on write: incoming document '{str_id}' "
156
+ f"maps to the same integer ID {iid} as existing document "
157
+ f"'{existing_str}'. Rename one of the documents."
158
+ )
159
+ conflicts.append(str_id)
160
+ if conflicts:
161
+ raise DuplicateDocumentError(
162
+ f"Documents already exist (policy=FAIL): {conflicts}"
163
+ )
164
+
165
+
166
+ def _documents_to_points(documents: List[Document]) -> List[dict]:
167
+ """Convert each document to its VelesDB point dict, logging documents
168
+ that lack an embedding so the caller still gets feedback when the
169
+ underlying SDK accepts vector-less points.
170
+ """
171
+ points: List[dict] = []
172
+ for doc in documents:
173
+ if doc.embedding is None:
174
+ logger.warning(
175
+ "Document '%s' has no embedding; stored without vector.", doc.id
176
+ )
177
+ points.append(_doc_to_point(doc))
178
+ return points
179
+
180
+
181
class VelesDBDocumentStore:
    """Haystack 2.x DocumentStore backed by a local VelesDB collection.

    Documents (with optional embeddings) are stored as VelesDB points keyed
    by a 63-bit hash of the Haystack string ID. The database and collection
    are opened lazily on first use, so constructing the store performs no
    I/O beyond argument validation.

    Args:
        path: Directory path where VelesDB persists data.
        collection_name: Name of the VelesDB collection to use.
        embedding_dim: Dimensionality of the embedding vectors; used when
            the collection has to be created.
        metric: Distance metric: ``"cosine"``, ``"euclidean"``, or ``"dot"``.
        scroll_limit: Maximum documents returned by :meth:`filter_documents`.
            Increase this value when your collection exceeds 10 000 documents.
    """

    def __init__(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        self,
        path: str = "./velesdb_haystack",
        collection_name: str = _DEFAULT_COLLECTION,
        embedding_dim: int = _DEFAULT_DIMENSION,
        metric: str = _DEFAULT_METRIC,
        scroll_limit: int = _DEFAULT_SCROLL_LIMIT,
    ) -> None:
        self._path = validate_path(path)
        self._collection_name = validate_collection_name(collection_name)
        self._embedding_dim = embedding_dim
        self._metric = validate_metric(metric)
        self._scroll_limit = scroll_limit
        # Opened on demand by _get_collection().
        self._db: Optional[Any] = None
        self._collection: Optional[Any] = None

    # ------------------------------------------------------------------
    # Internal connection management
    # ------------------------------------------------------------------

    def _get_collection(self) -> Any:
        """Return the VelesDB collection, opening or creating it as needed.

        The database handle and the collection are cached on the instance.
        A collection that ``get_collection`` reports as missing (``None``
        or ``KeyError``) is created with the configured dimension/metric.
        """
        if self._db is None:
            self._db = velesdb.Database(self._path)
        if self._collection is None:
            col: Optional[Any] = None
            try:
                col = self._db.get_collection(self._collection_name)
            except KeyError:
                # Treated the same as a ``None`` return: create it below.
                pass
            if col is None:
                col = self._db.create_collection(
                    self._collection_name,
                    dimension=self._embedding_dim,
                    metric=self._metric,
                )
            self._collection = col
        return self._collection

    @staticmethod
    def _without_existing(
        col: Any, documents: List[Document], int_id_map: Dict[int, str]
    ) -> List[Document]:
        """Return *documents* minus those that ``DuplicatePolicy.SKIP``
        must not write.

        A document is dropped when a point with its integer ID is already
        stored under the same string ID, or when an earlier document in the
        batch has the same ID (the first occurrence is kept).

        Raises:
            ValueError: When a stored point shares an integer ID with an
                incoming document but carries a different ``_doc_id``;
                skipping in that case would silently lose the new document.
        """
        taken = set()
        for point in col.get(list(int_id_map)):
            if point is None:
                continue
            iid = point["id"]
            stored_id = (point.get("payload") or {}).get("_doc_id", str(iid))
            incoming_id = int_id_map[iid]
            if stored_id != incoming_id:
                raise ValueError(
                    f"SHA-256 collision on write: incoming document '{incoming_id}' "
                    f"maps to the same integer ID {iid} as existing document "
                    f"'{stored_id}'. Rename one of the documents."
                )
            taken.add(iid)

        kept: List[Document] = []
        for doc in documents:
            iid = _str_id_to_int(doc.id)
            if iid in taken:
                continue
            taken.add(iid)
            kept.append(doc)
        return kept

    # ------------------------------------------------------------------
    # DocumentStore protocol
    # ------------------------------------------------------------------

    def count_documents(self) -> int:
        """Return the total number of documents in the store.

        Returns ``0`` when the collection's ``count()`` does not yield an
        ``int``.
        """
        result = self._get_collection().count()
        return result if isinstance(result, int) else 0

    def filter_documents(
        self,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Return documents matching *filters*, or all documents when *None*.

        *filters* is passed unchanged to the collection's ``scroll``
        operation, which is consumed as an iterable of batches. Iteration
        stops as soon as ``scroll_limit`` documents have been collected, so
        larger collections are truncated; raise ``scroll_limit`` on the
        constructor to retrieve more. Returned documents carry unscaled
        scores (if any) and no embedding.

        Raises:
            ValueError: When a scrolled point has no ``_doc_id`` payload.
        """
        col = self._get_collection()
        documents: List[Document] = []
        for batch in col.scroll(filter=filters):
            for raw in batch:
                if len(documents) >= self._scroll_limit:
                    return documents
                documents.append(_result_to_doc(raw))
        return documents

    def write_documents(
        self,
        documents: List[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """Write *documents* to VelesDB and return the number written.

        Policy handling:

        * ``NONE`` / ``OVERWRITE``: plain upsert; an existing point with
          the same integer ID is overwritten.
        * ``SKIP``: documents that are already stored (or repeated earlier
          in the batch) are left out of the upsert, so stored documents
          are never overwritten.
        * ``FAIL``: the batch's IDs are looked up first and
          :class:`DuplicateDocumentError` is raised if any already exists.

        ``SKIP`` and ``FAIL`` cost one extra ID lookup per call; prefer
        ``OVERWRITE`` or ``NONE`` for bulk loads that do not need it.

        Returns:
            The integer returned by the collection's ``upsert`` when it
            returns one, otherwise the number of points submitted. ``0``
            when there is nothing to write.

        Raises:
            DuplicateDocumentError: When *policy* is ``FAIL`` and at least
                one document already exists in the store.
            ValueError: When a SHA-256 hash collision is detected — two
                distinct string IDs that map to the same integer ID —
                inside the batch, or against stored points under ``FAIL``
                or ``SKIP``.
        """
        if not documents:
            return 0
        int_id_map = _build_int_id_map(documents)
        col = self._get_collection()
        if policy == DuplicatePolicy.FAIL:
            _enforce_fail_policy(col, int_id_map)
        elif policy == DuplicatePolicy.SKIP:
            documents = self._without_existing(col, documents, int_id_map)
            if not documents:
                return 0
        points = _documents_to_points(documents)
        result = col.upsert(points)
        return result if isinstance(result, int) else len(points)

    def delete_documents(
        self,
        document_ids: Optional[List[str]] = None,
    ) -> None:
        """Delete documents identified by their Haystack string IDs.

        Does nothing when *document_ids* is ``None`` or empty.
        """
        if not document_ids:
            return
        int_ids = [_str_id_to_int(did) for did in document_ids]
        self._get_collection().delete(int_ids)

    def embedding_retrieval(
        self,
        query_embedding: List[float],
        *,
        top_k: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        scale_score: bool = True,
    ) -> List[Document]:
        """Return the *top_k* documents most similar to *query_embedding*.

        Args:
            query_embedding: Dense query vector.
            top_k: Maximum number of documents to return.
            filters: Optional VelesDB filter dict to restrict the search space.
            scale_score: When ``True`` and ``metric="cosine"``, scores are
                normalised from ``[-1, 1]`` to ``[0, 1]``. Ignored for other
                metrics, where raw scores are returned unchanged.

        Raises:
            ValueError: When a returned point has no ``_doc_id`` payload.
        """
        results: List[dict] = self._get_collection().search(
            vector=query_embedding,
            top_k=top_k,
            filter=filters,
        )
        return [
            _result_to_doc(r, scale_score=scale_score, metric=self._metric)
            for r in results
        ]

    # ------------------------------------------------------------------
    # Haystack pipeline serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the store configuration for Haystack pipeline YAML.

        Only the constructor arguments are emitted (as stored after
        validation), never the open database or collection handle.
        """
        return default_to_dict(
            self,
            path=self._path,
            collection_name=self._collection_name,
            embedding_dim=self._embedding_dim,
            metric=self._metric,
            scroll_limit=self._scroll_limit,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "VelesDBDocumentStore":
        """Restore a store instance from a Haystack pipeline config dict."""
        return default_from_dict(cls, data)
File without changes
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.4
2
+ Name: haystack-velesdb
3
+ Version: 1.14.1
4
+ Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
5
+ Author-email: VelesDB Team <contact@wiscale.fr>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cyberlife-coder/VelesDB
8
+ Project-URL: Documentation, https://velesdb.com/docs/integrations/haystack
9
+ Project-URL: Repository, https://github.com/cyberlife-coder/VelesDB
10
+ Keywords: haystack,velesdb,vector-database,embeddings,rag,local-first,semantic-search
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: haystack-ai>=2.0.0
24
+ Requires-Dist: velesdb>=1.13.2
25
+ Requires-Dist: velesdb-common>=1.13.2
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest<9.0,>=7.0; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # haystack-velesdb
31
+
32
+ A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
33
+ the local-first, microsecond-latency vector database.
34
+
35
+ This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
36
+ connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install haystack-velesdb
42
+ ```
43
+
44
+ For development:
45
+
46
+ ```bash
47
+ pip install -e "integrations/haystack[dev]"
48
+ ```
49
+
50
+ ## Quick start
51
+
52
+ ```python
53
+ from haystack_velesdb import VelesDBDocumentStore
54
+ from haystack.dataclasses import Document
55
+
56
+ store = VelesDBDocumentStore(
57
+ path="./my_docs",
58
+ collection_name="knowledge_base",
59
+ embedding_dim=768,
60
+ metric="cosine",
61
+ )
62
+
63
+ # Write pre-embedded documents
64
+ documents = [
65
+ Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
66
+ Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
67
+ ]
68
+ store.write_documents(documents)
69
+
70
+ # Retrieve by vector
71
+ results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
72
+ for doc in results:
73
+ print(doc.content, doc.score)
74
+ ```
75
+
76
+ ## Full RAG pipeline
77
+
78
+ See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
79
+ and semantic search example using `SentenceTransformersDocumentEmbedder`.
80
+
81
+ ```python
82
+ from haystack import Pipeline
83
+ from haystack.components.converters import PyPDFToDocument
84
+ from haystack.components.embedders import (
85
+ SentenceTransformersDocumentEmbedder,
86
+ SentenceTransformersTextEmbedder,
87
+ )
88
+ from haystack.components.preprocessors import DocumentSplitter
89
+ from haystack.components.writers import DocumentWriter
90
+ from haystack_velesdb import VelesDBDocumentStore
91
+
92
+ store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
93
+
94
+ # Indexing pipeline
95
+ indexer = Pipeline()
96
+ indexer.add_component("converter", PyPDFToDocument())
97
+ indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
98
+ indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
99
+ indexer.add_component("writer", DocumentWriter(document_store=store))
100
+ indexer.connect("converter", "splitter")
101
+ indexer.connect("splitter", "embedder")
102
+ indexer.connect("embedder", "writer")
103
+ indexer.run({"converter": {"sources": ["paper.pdf"]}})
104
+
105
+ # Query: embed the question, then search the store directly.
106
+ # InMemoryEmbeddingRetriever only accepts an InMemoryDocumentStore, so call
107
+ # VelesDBDocumentStore.embedding_retrieval() with the query vector instead.
108
+ text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
109
+ text_embedder.warm_up()
110
+ query = text_embedder.run(text="What is VelesDB?")
111
+ documents = store.embedding_retrieval(query_embedding=query["embedding"], top_k=5)
112
+ for doc in documents:
113
+ print(doc.content, doc.score)
114
+ ```
115
+
116
+ ## API reference
117
+
118
+ ### `VelesDBDocumentStore`
119
+
120
+ | Parameter | Default | Description |
121
+ |-----------|---------|-------------|
122
+ | `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
123
+ | `collection_name` | `"haystack_documents"` | VelesDB collection name |
124
+ | `embedding_dim` | `768` | Embedding vector dimension |
125
+ | `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
126
+
127
+ ### Methods
128
+
129
+ | Method | Description |
130
+ |--------|-------------|
131
+ | `write_documents(documents, policy)` | Upsert documents; returns count written |
132
+ | `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
133
+ | `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
134
+ | `count_documents()` | Total document count |
135
+ | `delete_documents(document_ids)` | Delete by Haystack string IDs |
136
+ | `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
137
+
138
+ **Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
139
+ and always overwrite on collision. `FAIL` is fully enforced: a pre-scan is
140
+ performed before writing and `DuplicateDocumentError` is raised if any document
141
+ already exists (prefer `OVERWRITE` or `NONE` for bulk loads to skip the scan cost).
142
+
143
+ **Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
144
+ integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots). For a
145
+ 1 M-document collection the collision probability is roughly 5 × 10⁻⁸, which
146
+ is negligible for typical RAG workloads. A `ValueError` is raised at write time
147
+ if a collision is detected between a new document and an existing one.
148
+
149
+ **Note on `scale_score`:** When `True` (default), cosine similarity scores
150
+ are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
151
+ in downstream re-ranking. For other metrics the raw score is returned unchanged.
152
+
153
+ ## Running tests
154
+
155
+ ```bash
156
+ cd integrations/haystack
157
+ pip install -e ".[dev]"
158
+ pytest tests/ -v
159
+ ```
160
+
161
+ Tests use lightweight fake VelesDB objects — no running server required.
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/haystack_velesdb/__init__.py
5
+ src/haystack_velesdb/document_store.py
6
+ src/haystack_velesdb/py.typed
7
+ src/haystack_velesdb.egg-info/PKG-INFO
8
+ src/haystack_velesdb.egg-info/SOURCES.txt
9
+ src/haystack_velesdb.egg-info/dependency_links.txt
10
+ src/haystack_velesdb.egg-info/requires.txt
11
+ src/haystack_velesdb.egg-info/top_level.txt
12
+ tests/test_document_store.py
@@ -0,0 +1,6 @@
1
+ haystack-ai>=2.0.0
2
+ velesdb>=1.13.2
3
+ velesdb-common>=1.13.2
4
+
5
+ [dev]
6
+ pytest<9.0,>=7.0
@@ -0,0 +1 @@
1
+ haystack_velesdb
@@ -0,0 +1,400 @@
1
+ """Unit tests for VelesDBDocumentStore.
2
+
3
+ All external dependencies (haystack, velesdb) are replaced with lightweight
4
+ stubs so no server or framework install is required to run the suite.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import importlib.util
9
+ import sys
10
+ import types
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Haystack 2.x stubs — mirror the public API surface used by document_store.py
18
+ # ---------------------------------------------------------------------------
19
+
20
+
21
@dataclass
class Document:
    """Lightweight stand-in for the Haystack ``Document`` dataclass.

    Only the fields used by these tests are declared; all have defaults so
    a bare ``Document()`` is valid.
    """

    # Haystack string identifier.
    id: str = ""
    # Text body of the document, if any.
    content: Optional[str] = None
    # Dense vector, if the document has been embedded.
    embedding: Optional[List[float]] = None
    # Free-form metadata; a fresh dict per instance.
    meta: Dict[str, Any] = field(default_factory=dict)
    # Similarity score attached by retrieval, if any.
    score: Optional[float] = None
28
+
29
+
30
class DuplicatePolicy(Enum):
    """Stub of Haystack's duplicate-handling policy enum for write calls."""

    NONE = "none"
    SKIP = "skip"
    OVERWRITE = "overwrite"
    FAIL = "fail"
35
+
36
+
37
class DuplicateDocumentError(Exception):
    """Stub of the Haystack error raised when a document already exists."""
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Fake VelesDB objects — deterministic, no I/O
43
+ # ---------------------------------------------------------------------------
44
+
45
+
46
+ class _FakeCollection:
47
+ def __init__(self) -> None:
48
+ self._points: dict = {} # int_id -> point dict
49
+
50
+ def upsert(self, points: list) -> int:
51
+ for p in points:
52
+ self._points[p["id"]] = p
53
+ return len(points)
54
+
55
+ def get(self, int_ids: list) -> list:
56
+ return [
57
+ {"id": iid, "payload": self._points[iid].get("payload", {})}
58
+ if iid in self._points else None
59
+ for iid in int_ids
60
+ ]
61
+
62
+ # `filter=` mirrors the public velesdb SDK kwarg name on Collection.search /
63
+ # Collection.scroll; renaming it would break the kwargs contract under test.
64
+ def search( # pylint: disable=redefined-builtin
65
+ self, vector: list, top_k: int = 10, filter: Any = None
66
+ ) -> list:
67
+ del vector, filter # the fake ignores these
68
+ return [
69
+ {"id": p["id"], "score": 0.9, "payload": p.get("payload", {})}
70
+ for p in list(self._points.values())[:top_k]
71
+ ]
72
+
73
+ def scroll( # pylint: disable=redefined-builtin
74
+ self,
75
+ *,
76
+ batch_size: int = 100,
77
+ filter: Any = None,
78
+ as_dataframe: bool = False,
79
+ backend: str = "pandas",
80
+ ) -> Any:
81
+ """Match the real velesdb SDK signature: kwargs-only, returns
82
+ Iterator[List[Dict]]. The real SDK has no ``limit`` kwarg — callers
83
+ drive the iterator and stop themselves.
84
+ """
85
+ del filter, as_dataframe, backend # the fake ignores these
86
+ all_points = [
87
+ {"id": p["id"], "score": None, "payload": p.get("payload", {})}
88
+ for p in self._points.values()
89
+ ]
90
+ for offset in range(0, len(all_points), batch_size):
91
+ yield all_points[offset : offset + batch_size]
92
+
93
+ def delete(self, int_ids: list) -> None:
94
+ for iid in int_ids:
95
+ self._points.pop(iid, None)
96
+
97
+ def count(self) -> int:
98
+ return len(self._points)
99
+
100
+
101
+ class _FakeDatabase:
102
+ def __init__(self, path: str) -> None:
103
+ self._collections: dict = {}
104
+
105
+ def get_collection(self, name: str) -> _FakeCollection:
106
+ if name not in self._collections:
107
+ raise KeyError(name)
108
+ return self._collections[name]
109
+
110
+ def create_collection(
111
+ self, name: str, dimension: int, metric: str
112
+ ) -> _FakeCollection:
113
+ col = _FakeCollection()
114
+ self._collections[name] = col
115
+ return col
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # Module loader — inject stubs, load document_store from source
120
+ # ---------------------------------------------------------------------------
121
+
122
+
123
def _load_module() -> types.ModuleType:
    """Execute ``document_store.py`` from source against stubbed dependencies.

    Stand-in modules for ``haystack``, ``velesdb`` and ``velesdb_common`` are
    published in ``sys.modules`` first, so the imports performed by the
    module under test resolve to the stubs defined in this file.

    Returns:
        The executed ``haystack_velesdb.document_store`` module.
    """
    source_dir = Path(__file__).resolve().parents[1] / "src" / "haystack_velesdb"

    def _register(name: str, **attributes: Any) -> types.ModuleType:
        # Build an empty module, attach the given attributes, publish it.
        stub = types.ModuleType(name)
        for attr_name, attr_value in attributes.items():
            setattr(stub, attr_name, attr_value)
        sys.modules[name] = stub
        return stub

    def _fake_to_dict(obj: Any, **kw: Any) -> Dict[str, Any]:
        return {"type": type(obj).__name__, "init_parameters": kw}

    def _fake_from_dict(cls: Any, d: Dict[str, Any]) -> Any:
        return cls(**d.get("init_parameters", {}))

    # Stub velesdb_common.security with no-op validators (real package has its own tests).
    def _passthrough(value: Any, *args: Any, **kwargs: Any) -> Any:
        return value

    # Haystack surface used by document_store.py.
    _register(
        "haystack",
        default_to_dict=_fake_to_dict,
        default_from_dict=_fake_from_dict,
    )
    _register("haystack.dataclasses", Document=Document)
    _register("haystack.document_stores")
    _register("haystack.document_stores.types", DuplicatePolicy=DuplicatePolicy)
    _register(
        "haystack.document_stores.errors",
        DuplicateDocumentError=DuplicateDocumentError,
    )

    # VelesDB SDK and its shared validation helpers.
    sys.modules["velesdb"] = types.SimpleNamespace(Database=_FakeDatabase)  # type: ignore
    _register("velesdb_common")
    _register(
        "velesdb_common.security",
        validate_path=_passthrough,
        validate_collection_name=_passthrough,
        validate_metric=_passthrough,
        SecurityError=ValueError,
    )

    # Parent package with a __path__ pointing at the source directory.
    _register("haystack_velesdb", __path__=[str(source_dir)])

    module_name = "haystack_velesdb.document_store"
    spec = importlib.util.spec_from_file_location(
        module_name, source_dir / "document_store.py"
    )
    assert spec and spec.loader
    loaded = importlib.util.module_from_spec(spec)
    # Publish before executing, as the original loader does.
    sys.modules[module_name] = loaded
    spec.loader.exec_module(loaded)  # type: ignore[union-attr]
    return loaded
176
+
177
+
178
+ _MOD = _load_module()
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Tests
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
def test_write_and_count() -> None:
    """Writing two documents reports two written and two stored."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_write")
    alpha = Document(id="a", content="alpha", embedding=[0.1, 0.2, 0.3])
    beta = Document(id="b", content="beta", embedding=[0.4, 0.5, 0.6])
    written = store.write_documents([alpha, beta])
    assert written == 2
    assert store.count_documents() == 2
194
+
195
+
196
def test_write_empty_returns_zero() -> None:
    """An empty batch reports a written count of zero."""
    empty_store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_empty")
    written = empty_store.write_documents([])
    assert written == 0
199
+
200
+
201
def test_embedding_retrieval_returns_documents() -> None:
    """A stored document comes back from retrieval with its ID and content."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_retrieval")
    stored = Document(id="x", content="hello", embedding=[0.1, 0.2, 0.3])
    store.write_documents([stored])
    hits = store.embedding_retrieval([0.1, 0.2, 0.3], top_k=5)
    assert len(hits) >= 1
    top_hit = hits[0]
    assert top_hit.id == "x"
    assert top_hit.content == "hello"
208
+
209
+
210
def test_scale_score_normalises_cosine() -> None:
    """With the default metric, scale_score maps the fake's 0.9 to (0.9 + 1) / 2."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_score")
    store.write_documents([Document(id="y", content="world", embedding=[1.0, 0.0])])
    scaled_hits = store.embedding_retrieval([1.0, 0.0], scale_score=True)
    raw_hits = store.embedding_retrieval([1.0, 0.0], scale_score=False)
    # The fake collection always scores hits at 0.9.
    expected_scaled = (0.9 + 1.0) / 2.0
    assert scaled_hits[0].score == expected_scaled
    assert raw_hits[0].score == 0.9
217
+
218
+
219
def test_filter_documents_returns_all_when_none() -> None:
    """Calling filter_documents() without filters returns every stored document."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_filter")
    batch = [
        Document(id="p", content="foo", embedding=[0.1, 0.2]),
        Document(id="q", content="bar", embedding=[0.7, 0.8]),
    ]
    store.write_documents(batch)
    everything = store.filter_documents()
    assert len(everything) == 2
226
+
227
+
228
def test_filter_documents_passes_filter_to_scroll() -> None:
    """A non-None filter dict is accepted by filter_documents without error."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_filter_arg")
    only_doc = Document(id="fa", content="alpha", embedding=[0.1])
    store.write_documents([only_doc])
    # The fake scroll discards its filter argument, so this only checks that
    # supplying one does not raise and the stored document is still returned.
    matches = store.filter_documents(filters={"field": "value"})
    assert len(matches) == 1
237
+
238
+
239
def test_scale_score_not_applied_for_non_cosine_metric() -> None:
    """With metric="euclidean", scale_score=True leaves the raw score untouched."""
    store = _MOD.VelesDBDocumentStore(
        path="/tmp/hs", collection_name="t_score_nc", metric="euclidean"
    )
    store.write_documents([Document(id="z", content="raw", embedding=[1.0])])
    hits = store.embedding_retrieval([1.0], scale_score=True)
    # 0.9 is the fixed score produced by the fake collection's search().
    assert hits[0].score == 0.9
245
+
246
+
247
def test_scroll_limit_is_respected() -> None:
    """filter_documents returns at most scroll_limit documents."""
    store = _MOD.VelesDBDocumentStore(
        path="/tmp/hs", collection_name="t_limit", scroll_limit=1
    )
    first = Document(id="r", content="one", embedding=[0.1])
    second = Document(id="s", content="two", embedding=[0.2])
    store.write_documents([first, second])
    # The fake scroll has no limit of its own and would yield both points, so
    # a single result means the store applied scroll_limit=1 itself.
    limited = store.filter_documents()
    assert len(limited) == 1
255
+
256
+
257
def test_delete_documents() -> None:
    """Deleting one ID removes that document and leaves the other in place."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_delete")
    doomed = Document(id="del1", content="remove me", embedding=[0.1, 0.2])
    survivor = Document(id="keep1", content="keep me", embedding=[0.3, 0.4])
    store.write_documents([doomed, survivor])
    assert store.count_documents() == 2

    store.delete_documents(["del1"])

    assert store.count_documents() == 1
    left_over = store.filter_documents()
    assert left_over[0].id == "keep1"
268
+
269
+
270
def test_document_metadata_round_trips() -> None:
    """A meta key written with a document is present when it is read back."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_meta")
    tagged = Document(
        id="m1", content="meta test", embedding=[0.5], meta={"source": "wiki"}
    )
    store.write_documents([tagged])
    read_back = store.filter_documents()
    assert read_back[0].meta.get("source") == "wiki"
277
+
278
+
279
def test_reserved_meta_keys_do_not_corrupt_payload() -> None:
    """doc.meta containing reserved keys must not overwrite canonical fields."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_reserved")
    # Metadata that deliberately clashes with the reserved names.
    clashing_meta = {"_doc_id": "evil_id", "content": "evil content"}
    hostile = Document(
        id="safe",
        content="real content",
        embedding=[0.1],
        meta=clashing_meta,
    )
    store.write_documents([hostile])

    read_back = store.filter_documents()[0]
    assert read_back.id == "safe", "_doc_id must come from doc.id, not meta"
    assert read_back.content == "real content", "content must come from doc.content, not meta"
    # Reserved keys should not leak back into meta on retrieval.
    assert "_doc_id" not in read_back.meta
    assert "content" not in read_back.meta
297
+
298
+
299
def test_get_collection_catches_key_error_and_creates_collection() -> None:
    """_get_collection catches KeyError from get_collection and falls back to create_collection."""
    store = _MOD.VelesDBDocumentStore(
        path="/tmp/hs", collection_name="t_key_error_path"
    )
    # _FakeDatabase raises KeyError for a name it has never seen. A successful
    # count therefore shows the error was handled and a collection was created.
    document_count = store.count_documents()
    assert document_count == 0
    assert store._collection is not None
306
+
307
+
308
def test_write_documents_fail_policy_raises_on_duplicate() -> None:
    """DuplicatePolicy.FAIL raises DuplicateDocumentError when a document already exists."""
    import pytest

    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_fail_dup")
    original = Document(id="dup1", content="original", embedding=[0.1, 0.2])
    # First write uses the default policy and succeeds.
    store.write_documents([original])

    # Second write of the same document under FAIL must be rejected.
    with pytest.raises(DuplicateDocumentError):
        store.write_documents([original], policy=DuplicatePolicy.FAIL)
317
+
318
+
319
def test_write_documents_fail_policy_succeeds_for_new_docs() -> None:
    """DuplicatePolicy.FAIL succeeds when none of the documents already exist."""
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_fail_new")
    fresh = Document(id="new_only", content="fresh", embedding=[0.5])
    written = store.write_documents([fresh], policy=DuplicatePolicy.FAIL)
    assert written == 1
    assert store.count_documents() == 1
326
+
327
+
328
def test_serialisation_round_trip() -> None:
    """to_dict() captures the init parameters and from_dict() restores them."""
    original = _MOD.VelesDBDocumentStore(
        path="/tmp/hs_serial",
        collection_name="serial",
        embedding_dim=384,
        metric="euclidean",
        scroll_limit=5_000,
    )

    serialised = original.to_dict()
    params = serialised["init_parameters"]
    assert params["embedding_dim"] == 384
    assert params["metric"] == "euclidean"
    assert params["scroll_limit"] == 5_000

    clone = _MOD.VelesDBDocumentStore.from_dict(serialised)
    assert clone._embedding_dim == 384
    assert clone._metric == "euclidean"
    assert clone._scroll_limit == 5_000
344
+
345
+
346
def test_filter_documents_drives_scroll_iterator_across_batches() -> None:
    """Regression: filter_documents must consume the iterator that
    Collection.scroll() returns. The real SDK yields Iterator[List[Dict]];
    it neither returns a flat list nor accepts a 'limit' kwarg. At the
    fake's default batch_size of 100, two documents arrive as one
    two-element batch, and both must end up in the result.
    """
    store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_iter_drive")
    first = Document(id="i1", content="one", embedding=[0.1])
    second = Document(id="i2", content="two", embedding=[0.2])
    store.write_documents([first, second])

    returned_ids = {doc.id for doc in store.filter_documents()}
    assert returned_ids == {"i1", "i2"}
362
+
363
+
364
def test_result_to_doc_raises_on_missing_doc_id() -> None:
    """Regression: converting a point whose payload lacks `_doc_id` must raise
    instead of quietly substituting str(int_id). That substitution broke
    delete_documents(): str(int_id) is re-hashed via SHA-256 into a different
    integer, so the delete silently matched nothing.
    """
    import pytest

    orphan_point = {"id": 12345, "score": 0.9, "payload": {"content": "orphan"}}
    with pytest.raises(ValueError, match="no '_doc_id'"):
        _MOD._result_to_doc(orphan_point)
375
+
376
+
377
def test_get_collection_returns_none_and_creates_collection() -> None:
    """_get_collection handles SDK returning None (the production SDK behavior)."""

    class _FakeDatabaseReturnsNone:
        """Database double whose lookup returns ``None`` instead of raising."""

        def __init__(self, path: str) -> None:
            self._collections: dict = {}

        def get_collection(self, name: str) -> Optional[_FakeCollection]:
            return None  # Real VelesDB SDK returns None for unknown collections.

        def create_collection(
            self, name: str, dimension: int, metric: str
        ) -> _FakeCollection:
            created = _FakeCollection()
            self._collections[name] = created
            return created

    # Swap the module-level velesdb reference for the duration of the test
    # and always put the original back, even if an assertion fails.
    saved_velesdb = _MOD.velesdb
    _MOD.velesdb = types.SimpleNamespace(Database=_FakeDatabaseReturnsNone)  # type: ignore
    try:
        store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name="t_none_path")
        assert store.count_documents() == 0
        assert store._collection is not None
    finally:
        _MOD.velesdb = saved_velesdb