haystack-velesdb 1.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_velesdb/__init__.py +6 -0
- haystack_velesdb/document_store.py +350 -0
- haystack_velesdb/py.typed +0 -0
- haystack_velesdb-1.14.1.dist-info/METADATA +161 -0
- haystack_velesdb-1.14.1.dist-info/RECORD +8 -0
- haystack_velesdb-1.14.1.dist-info/WHEEL +5 -0
- haystack_velesdb-1.14.1.dist-info/licenses/LICENSE +21 -0
- haystack_velesdb-1.14.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""Haystack 2.x DocumentStore backed by VelesDB.
|
|
2
|
+
|
|
3
|
+
Implements the Haystack ``DocumentStore`` protocol so VelesDB can be used
|
|
4
|
+
as the vector backend in any Haystack 2.x indexing or retrieval pipeline.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from haystack import default_from_dict, default_to_dict
|
|
13
|
+
from haystack.dataclasses import Document
|
|
14
|
+
from haystack.document_stores.errors import DuplicateDocumentError
|
|
15
|
+
from haystack.document_stores.types import DuplicatePolicy
|
|
16
|
+
|
|
17
|
+
import velesdb
|
|
18
|
+
from velesdb_common.security import (
|
|
19
|
+
validate_collection_name,
|
|
20
|
+
validate_metric,
|
|
21
|
+
validate_path,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
__all__ = ["VelesDBDocumentStore"]
|
|
27
|
+
|
|
28
|
+
# Collection used when the caller does not name one.
_DEFAULT_COLLECTION = "haystack_documents"
# Embedding dimensionality used when the caller does not specify one.
_DEFAULT_DIMENSION = 768
# Distance metric used when the caller does not specify one.
_DEFAULT_METRIC = "cosine"
# Default cap on the number of documents returned by a scroll.
_DEFAULT_SCROLL_LIMIT = 10_000
# Low 63 bits set: keeps hashed document IDs non-negative and within 63 bits.
_INT63_MASK = 0x7FFF_FFFF_FFFF_FFFF
# Payload keys owned by this integration; user meta may not use them.
_RESERVED_PAYLOAD_KEYS = frozenset(("_doc_id", "content"))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _str_id_to_int(doc_id: str) -> int:
    """Derive a stable, non-negative 63-bit integer from a string document ID.

    The ID is encoded, hashed with SHA-256, and the first 8 digest bytes are
    read as a big-endian integer whose top bit is cleared by ``_INT63_MASK``,
    leaving 2**63 (~9.2 x 10**18) possible values.

    By the birthday bound (n**2 / 2N) the chance of *any* collision among
    1 M distinct IDs is roughly 5 x 10**-8: negligible for typical RAG
    workloads but not zero. Callers that must not lose data on a collision
    have to compare the stored ``_doc_id`` payload field with the incoming
    string ID; this function alone cannot detect one.

    Args:
        doc_id: Haystack document ID.

    Returns:
        An integer in ``[0, 2**63 - 1]`` that is identical across runs for
        the same *doc_id*.
    """
    digest = hashlib.sha256(doc_id.encode()).digest()
    leading_bytes = int.from_bytes(digest[:8], "big")
    return leading_bytes & _INT63_MASK
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _doc_to_point(doc: Document) -> dict:
    """Build the VelesDB point dict for a Haystack Document.

    The payload starts from ``doc.meta`` with every reserved key
    (``_doc_id``, ``content``) filtered out, then receives the canonical
    ``_doc_id`` and, when present, ``content`` taken from the document
    itself. A meta entry named like a reserved key is therefore dropped
    silently so it cannot corrupt the round trip.

    Args:
        doc: Document to convert.

    Returns:
        A dict with ``id`` (hashed integer ID) and ``payload``, plus
        ``vector`` when the document carries an embedding.
    """
    user_meta = doc.meta or {}
    payload: dict = {
        key: value
        for key, value in user_meta.items()
        if key not in _RESERVED_PAYLOAD_KEYS
    }
    # Canonical identity fields are written last, from the document itself.
    payload["_doc_id"] = doc.id
    if doc.content is not None:
        payload["content"] = doc.content

    point: dict = {"id": _str_id_to_int(doc.id), "payload": payload}
    if doc.embedding is not None:
        point["vector"] = list(doc.embedding)
    return point
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _result_to_doc(
    result: dict, *, scale_score: bool = False, metric: str = "cosine"
) -> Document:
    """Convert a VelesDB search or scroll result to a Haystack Document.

    Requires ``_doc_id`` to be present in the payload. Points written by
    :meth:`VelesDBDocumentStore.write_documents` always carry that key, so
    a missing ``_doc_id`` means the underlying VelesDB collection was
    populated by a different code path (raw ``col.upsert``, migration
    scripts, mixed tooling). Falling back to the stringified integer ID
    would silently corrupt :meth:`delete_documents`: the integer-as-string
    re-hashes through SHA-256 to a *different* integer, so the delete
    would no-op without raising. We fail fast instead.

    Args:
        result: Raw result dict; ``payload``, ``id`` and ``score`` are read.
        scale_score: When ``True`` and *metric* is ``"cosine"``, map the
            score from ``[-1, 1]`` to ``[0, 1]``.
        metric: Metric the score was computed with.

    Returns:
        A Document with ``id``, ``content``, ``meta`` (payload minus the
        reserved keys) and ``score`` populated.

    Raises:
        ValueError: When ``_doc_id`` is missing from the payload, including
            when the result has no payload or a ``None`` payload.
    """
    # ``or {}`` also covers a payload key that is present but None, which
    # would otherwise surface as AttributeError instead of the ValueError
    # documented above.
    payload = result.get("payload") or {}
    doc_id = payload.get("_doc_id")
    if doc_id is None:
        raise ValueError(
            f"VelesDB point id={result.get('id')} has no '_doc_id' field in "
            "its payload. VelesDBDocumentStore requires every point in the "
            "underlying collection to be written via write_documents(); "
            "points populated by raw col.upsert() or external migration "
            "scripts cannot be round-tripped because the stringified "
            "integer ID would re-hash to a different integer and break "
            "delete_documents()."
        )
    content = payload.get("content")
    meta = {k: v for k, v in payload.items() if k not in _RESERVED_PAYLOAD_KEYS}
    raw_score: Optional[float] = result.get("score")
    if scale_score and raw_score is not None and metric == "cosine":
        # Normalise cosine similarity from [-1, 1] to [0, 1].
        # Only meaningful for cosine; l2 and dot scores have different ranges.
        score: Optional[float] = (raw_score + 1.0) / 2.0
    else:
        score = raw_score
    return Document(id=doc_id, content=content, meta=meta, score=score)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_int_id_map(documents: List[Document]) -> Dict[int, str]:
    """Index a write batch by hashed integer ID, rejecting in-batch collisions.

    If two *different* string IDs in the same batch hash to the same 63-bit
    integer, one would overwrite the other on upsert. That case is detected
    here, before anything is written. The same string ID appearing more
    than once is not a collision and is accepted.

    Args:
        documents: Documents about to be written.

    Returns:
        Mapping from hashed integer ID to the string ID that produced it.

    Raises:
        ValueError: When two distinct string IDs share an integer ID.
    """
    seen: Dict[int, str] = {}
    for document in documents:
        hashed = _str_id_to_int(document.id)
        # setdefault registers first-time IDs and returns the earlier owner
        # for repeated ones, without ever replacing it.
        owner = seen.setdefault(hashed, document.id)
        if owner != document.id:
            raise ValueError(
                f"SHA-256 collision in write batch: '{owner}' and "
                f"'{document.id}' map to the same integer ID {hashed}. "
                "Rename one of the documents."
            )
    return seen
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _enforce_fail_policy(col: Any, int_id_map: Dict[int, str]) -> None:
    """Enforce ``DuplicatePolicy.FAIL`` for one write batch.

    Looks up the incoming integer IDs with a single ``col.get(int_ids)``
    call, so the work is proportional to the batch rather than to the
    collection, and then:

    * raises :class:`ValueError` as soon as a stored point under one of
      those IDs belongs to a *different* string ID (SHA-256 collision
      between an incoming and a stored document);
    * otherwise collects every incoming ID that is already stored and
      raises :class:`DuplicateDocumentError` listing them.

    A stored point without a ``_doc_id`` (no payload, ``None`` payload, or
    a payload lacking the key) is identified by its stringified integer ID.

    Args:
        col: VelesDB collection; only ``col.get(list_of_int_ids)`` is used.
            Entries of the returned list may be ``None`` for missing IDs.
        int_id_map: Hashed integer ID -> incoming string ID.

    Raises:
        ValueError: On a collision with a stored document.
        DuplicateDocumentError: When at least one incoming document is
            already stored.
    """
    existing_points: List[Any] = col.get(list(int_id_map))
    conflicts: List[str] = []
    for point in existing_points:
        if point is None:
            continue
        iid = point["id"]
        # ``or {}`` guards against a payload key that exists but is None;
        # chaining .get() on it would raise AttributeError and mask the
        # collision / duplicate diagnostics below.
        stored_payload = point.get("payload") or {}
        existing_str = stored_payload.get("_doc_id", str(iid))
        str_id = int_id_map[iid]
        if existing_str != str_id:
            raise ValueError(
                f"SHA-256 collision on write: incoming document '{str_id}' "
                f"maps to the same integer ID {iid} as existing document "
                f"'{existing_str}'. Rename one of the documents."
            )
        conflicts.append(str_id)
    if conflicts:
        raise DuplicateDocumentError(
            f"Documents already exist (policy=FAIL): {conflicts}"
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _documents_to_points(documents: List[Document]) -> List[dict]:
    """Turn a batch of documents into VelesDB point dicts.

    A warning is logged for each document that has no embedding, so the
    caller gets feedback even though the point is still produced (without
    a ``vector`` entry).

    Args:
        documents: Documents to convert, in write order.

    Returns:
        One point dict per document, in the same order.
    """
    converted: List[dict] = []
    for document in documents:
        lacks_vector = document.embedding is None
        if lacks_vector:
            logger.warning(
                "Document '%s' has no embedding; stored without vector.", document.id
            )
        converted.append(_doc_to_point(document))
    return converted
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class VelesDBDocumentStore:
    """Haystack 2.x DocumentStore backed by a local VelesDB collection.

    Stores documents (with optional embeddings) in VelesDB and implements
    the DocumentStore methods (``count_documents``, ``filter_documents``,
    ``write_documents``, ``delete_documents``) plus ``embedding_retrieval``
    for vector search. The database and collection are opened lazily on
    first use, so constructing the store performs no I/O beyond argument
    validation.

    Args:
        path: Directory path where VelesDB persists data.
        collection_name: Name of the VelesDB collection to use.
        embedding_dim: Dimensionality of the embedding vectors.
        metric: Distance metric: ``"cosine"``, ``"euclidean"``, or ``"dot"``.
        scroll_limit: Maximum documents returned by :meth:`filter_documents`.
            Increase this value when your collection exceeds 10 000 documents.
    """

    def __init__(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        self,
        path: str = "./velesdb_haystack",
        collection_name: str = _DEFAULT_COLLECTION,
        embedding_dim: int = _DEFAULT_DIMENSION,
        metric: str = _DEFAULT_METRIC,
        scroll_limit: int = _DEFAULT_SCROLL_LIMIT,
    ) -> None:
        # The validators' return values are what gets stored (and later
        # serialised by to_dict), not the raw arguments.
        self._path = validate_path(path)
        self._collection_name = validate_collection_name(collection_name)
        self._embedding_dim = embedding_dim
        self._metric = validate_metric(metric)
        self._scroll_limit = scroll_limit
        # Both handles are created on first use by _get_collection().
        self._db: Optional[Any] = None
        self._collection: Optional[Any] = None

    # ------------------------------------------------------------------
    # Internal connection management
    # ------------------------------------------------------------------

    def _get_collection(self) -> Any:
        """Return the VelesDB collection, opening or creating it as needed.

        The database handle and the collection are cached on the instance.
        A collection that does not exist yet (``get_collection`` raises
        ``KeyError`` or returns ``None``) is created with the configured
        dimension and metric.
        """
        if self._db is None:
            self._db = velesdb.Database(self._path)
        if self._collection is None:
            col: Optional[Any] = None
            try:
                col = self._db.get_collection(self._collection_name)
            except KeyError:
                # Missing collection: fall through to creation below.
                pass
            if col is None:
                col = self._db.create_collection(
                    self._collection_name,
                    dimension=self._embedding_dim,
                    metric=self._metric,
                )
            self._collection = col
        return self._collection

    @staticmethod
    def _skip_existing(
        col: Any, documents: List[Document], int_id_map: Dict[int, str]
    ) -> List[Document]:
        """Return the documents of a batch that are not stored yet.

        Used for ``DuplicatePolicy.SKIP``. Looks up the incoming integer IDs
        with one ``col.get`` call and drops every document whose ID is
        already present.

        Raises:
            ValueError: When a stored point under an incoming integer ID
                belongs to a different string ID; skipping would silently
                discard a distinct document.
        """
        stored_ids = set()
        for point in col.get(list(int_id_map)):
            if point is None:
                continue
            iid = point["id"]
            stored_str = (point.get("payload") or {}).get("_doc_id", str(iid))
            incoming_str = int_id_map[iid]
            if stored_str != incoming_str:
                raise ValueError(
                    f"SHA-256 collision on write: incoming document "
                    f"'{incoming_str}' maps to the same integer ID {iid} as "
                    f"existing document '{stored_str}'. Rename one of the "
                    "documents."
                )
            stored_ids.add(iid)
        return [doc for doc in documents if _str_id_to_int(doc.id) not in stored_ids]

    # ------------------------------------------------------------------
    # DocumentStore protocol
    # ------------------------------------------------------------------

    def count_documents(self) -> int:
        """Return the total number of documents in the store.

        Falls back to ``0`` when the collection's ``count()`` does not
        return an ``int``.
        """
        result = self._get_collection().count()
        return result if isinstance(result, int) else 0

    def filter_documents(
        self,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Return documents matching *filters*, or all documents when *None*.

        Passes *filters* directly to VelesDB's scroll operation. The real
        SDK returns ``Iterator[List[Dict]]`` and has no ``limit`` kwarg, so
        we drive the iterator ourselves and stop once ``self._scroll_limit``
        documents have been collected. Increase ``scroll_limit`` on the
        constructor for collections larger than the default 10 000.

        Raises:
            ValueError: When a scrolled point has no ``_doc_id`` in its
                payload (it was not written through this store).
        """
        col = self._get_collection()
        documents: List[Document] = []
        for batch in col.scroll(filter=filters):
            for raw in batch:
                if len(documents) >= self._scroll_limit:
                    return documents
                documents.append(_result_to_doc(raw))
        return documents

    def write_documents(
        self,
        documents: List[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """Write *documents* to VelesDB and return the number written.

        Behaviour per *policy*:

        * ``NONE`` / ``OVERWRITE``: VelesDB upsert semantics; an existing
          point with the same integer ID is overwritten.
        * ``SKIP``: documents whose ID is already stored are left untouched
          and are not counted in the return value.
        * ``FAIL``: raises :class:`DuplicateDocumentError` if any incoming
          document already exists; nothing is written in that case.

        ``SKIP`` and ``FAIL`` look up the incoming IDs before writing (one
        ``col.get`` call for the batch, not a scan of the collection).
        ``NONE`` and ``OVERWRITE`` skip that lookup.

        Args:
            documents: Documents to write; an empty list is a no-op.
            policy: How to treat documents that already exist.

        Returns:
            The integer returned by the collection's ``upsert`` when it
            returns one, otherwise the number of points submitted.

        Raises:
            DuplicateDocumentError: When *policy* is ``FAIL`` and at least
                one document already exists in the store.
            ValueError: When a SHA-256 hash collision is detected: two
                distinct string IDs that map to the same integer ID, either
                inside the batch or (for ``FAIL`` / ``SKIP``) against a
                stored document.
        """
        if not documents:
            return 0
        int_id_map = _build_int_id_map(documents)
        col = self._get_collection()
        if policy == DuplicatePolicy.FAIL:
            _enforce_fail_policy(col, int_id_map)
        elif policy == DuplicatePolicy.SKIP:
            # Upsert would overwrite, so already-stored documents must be
            # removed from the batch to honour SKIP.
            documents = self._skip_existing(col, documents, int_id_map)
            if not documents:
                return 0
        points = _documents_to_points(documents)
        result = col.upsert(points)
        return result if isinstance(result, int) else len(points)

    def delete_documents(
        self,
        document_ids: Optional[List[str]] = None,
    ) -> None:
        """Delete documents identified by their Haystack string IDs.

        ``None`` or an empty list is a no-op. IDs are hashed to the same
        integer IDs used at write time before being passed to the
        collection's ``delete``.
        """
        if not document_ids:
            return
        int_ids = [_str_id_to_int(did) for did in document_ids]
        self._get_collection().delete(int_ids)

    def embedding_retrieval(
        self,
        query_embedding: List[float],
        *,
        top_k: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        scale_score: bool = True,
    ) -> List[Document]:
        """Return the *top_k* documents most similar to *query_embedding*.

        Args:
            query_embedding: Dense query vector.
            top_k: Maximum number of documents to return.
            filters: Optional VelesDB filter dict to restrict the search space.
            scale_score: When ``True`` and ``metric="cosine"``, scores are
                normalised from ``[-1, 1]`` to ``[0, 1]``. Ignored for other
                metrics, where raw scores are returned unchanged.

        Returns:
            Documents in the order returned by the collection's ``search``,
            each carrying its (optionally scaled) score.
        """
        results: List[dict] = self._get_collection().search(
            vector=query_embedding,
            top_k=top_k,
            filter=filters,
        )
        return [
            _result_to_doc(r, scale_score=scale_score, metric=self._metric)
            for r in results
        ]

    # ------------------------------------------------------------------
    # Haystack pipeline serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the store configuration for Haystack pipeline YAML.

        Only constructor arguments are serialised; open database handles
        are not part of the output.
        """
        return default_to_dict(
            self,
            path=self._path,
            collection_name=self._collection_name,
            embedding_dim=self._embedding_dim,
            metric=self._metric,
            scroll_limit=self._scroll_limit,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "VelesDBDocumentStore":
        """Restore a store instance from a Haystack pipeline config dict."""
        return default_from_dict(cls, data)
|
|
File without changes
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: haystack-velesdb
|
|
3
|
+
Version: 1.14.1
|
|
4
|
+
Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
|
|
5
|
+
Author-email: VelesDB Team <contact@wiscale.fr>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/cyberlife-coder/VelesDB
|
|
8
|
+
Project-URL: Documentation, https://velesdb.com/docs/integrations/haystack
|
|
9
|
+
Project-URL: Repository, https://github.com/cyberlife-coder/VelesDB
|
|
10
|
+
Keywords: haystack,velesdb,vector-database,embeddings,rag,local-first,semantic-search
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: haystack-ai>=2.0.0
|
|
24
|
+
Requires-Dist: velesdb>=1.13.2
|
|
25
|
+
Requires-Dist: velesdb-common>=1.13.2
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest<9.0,>=7.0; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# haystack-velesdb
|
|
31
|
+
|
|
32
|
+
A Haystack 2.x `DocumentStore` backed by [VelesDB](https://github.com/cyberlife-coder/VelesDB) —
|
|
33
|
+
the local-first, microsecond-latency vector database.
|
|
34
|
+
|
|
35
|
+
This integration joins the existing [LangChain](../langchain/) and [LlamaIndex](../llamaindex/)
|
|
36
|
+
connectors, completing the trio of major Python RAG frameworks supported by VelesDB.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install haystack-velesdb
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
For development:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install -e "integrations/haystack[dev]"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Quick start
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from haystack_velesdb import VelesDBDocumentStore
|
|
54
|
+
from haystack.dataclasses import Document
|
|
55
|
+
|
|
56
|
+
store = VelesDBDocumentStore(
|
|
57
|
+
path="./my_docs",
|
|
58
|
+
collection_name="knowledge_base",
|
|
59
|
+
embedding_dim=768,
|
|
60
|
+
metric="cosine",
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Write pre-embedded documents
|
|
64
|
+
documents = [
|
|
65
|
+
Document(id="doc1", content="VelesDB is fast.", embedding=[0.1, 0.2, ...]),
|
|
66
|
+
Document(id="doc2", content="Local-first AI memory.", embedding=[0.3, 0.4, ...]),
|
|
67
|
+
]
|
|
68
|
+
store.write_documents(documents)
|
|
69
|
+
|
|
70
|
+
# Retrieve by vector
|
|
71
|
+
results = store.embedding_retrieval(query_embedding=[0.1, 0.2, ...], top_k=5)
|
|
72
|
+
for doc in results:
|
|
73
|
+
print(doc.content, doc.score)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Full RAG pipeline
|
|
77
|
+
|
|
78
|
+
See [`examples/rag_pipeline.py`](examples/rag_pipeline.py) for a complete PDF ingestion
|
|
79
|
+
and semantic search example using `SentenceTransformersDocumentEmbedder`.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from haystack import Pipeline
|
|
83
|
+
from haystack.components.converters import PyPDFToDocument
|
|
84
|
+
from haystack.components.embedders import (
|
|
85
|
+
SentenceTransformersDocumentEmbedder,
|
|
86
|
+
SentenceTransformersTextEmbedder,
|
|
87
|
+
)
|
|
88
|
+
from haystack.components.preprocessors import DocumentSplitter
|
|
89
|
+
from haystack.components.writers import DocumentWriter
|
|
90
|
+
from haystack_velesdb import VelesDBDocumentStore
|
|
91
|
+
|
|
92
|
+
store = VelesDBDocumentStore(path="./rag_store", embedding_dim=384)
|
|
93
|
+
|
|
94
|
+
# Indexing pipeline
|
|
95
|
+
indexer = Pipeline()
|
|
96
|
+
indexer.add_component("converter", PyPDFToDocument())
|
|
97
|
+
indexer.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
|
|
98
|
+
indexer.add_component("embedder", SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2"))
|
|
99
|
+
indexer.add_component("writer", DocumentWriter(document_store=store))
|
|
100
|
+
indexer.connect("converter", "splitter")
|
|
101
|
+
indexer.connect("splitter", "embedder")
|
|
102
|
+
indexer.connect("embedder", "writer")
|
|
103
|
+
indexer.run({"converter": {"sources": ["paper.pdf"]}})
|
|
104
|
+
|
|
105
|
+
# Query pipeline
|
|
106
|
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
|
107
|
+
|
|
108
|
+
querier = Pipeline()
|
|
109
|
+
querier.add_component("embedder", SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2"))
|
|
110
|
+
querier.add_component("retriever", InMemoryEmbeddingRetriever(document_store=store))
|
|
111
|
+
querier.connect("embedder.embedding", "retriever.query_embedding")
|
|
112
|
+
result = querier.run({"embedder": {"text": "What is VelesDB?"}})
|
|
113
|
+
print(result["retriever"]["documents"])
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## API reference
|
|
117
|
+
|
|
118
|
+
### `VelesDBDocumentStore`
|
|
119
|
+
|
|
120
|
+
| Parameter | Default | Description |
|
|
121
|
+
|-----------|---------|-------------|
|
|
122
|
+
| `path` | `"./velesdb_haystack"` | Directory where VelesDB persists data |
|
|
123
|
+
| `collection_name` | `"haystack_documents"` | VelesDB collection name |
|
|
124
|
+
| `embedding_dim` | `768` | Embedding vector dimension |
|
|
125
|
+
| `metric` | `"cosine"` | Distance metric: `"cosine"`, `"euclidean"`, or `"dot"` |
| `scroll_limit` | `10000` | Maximum number of documents returned by `filter_documents` |
|
|
126
|
+
|
|
127
|
+
### Methods
|
|
128
|
+
|
|
129
|
+
| Method | Description |
|
|
130
|
+
|--------|-------------|
|
|
131
|
+
| `write_documents(documents, policy)` | Upsert documents; returns count written |
|
|
132
|
+
| `filter_documents(filters)` | Scroll documents matching a VelesDB filter dict |
|
|
133
|
+
| `embedding_retrieval(query_embedding, top_k, filters, scale_score)` | Vector similarity search |
|
|
134
|
+
| `count_documents()` | Total document count |
|
|
135
|
+
| `delete_documents(document_ids)` | Delete by Haystack string IDs |
|
|
136
|
+
| `to_dict()` / `from_dict()` | Haystack pipeline serialisation |
|
|
137
|
+
|
|
138
|
+
**Note on `DuplicatePolicy`:** `NONE` and `OVERWRITE` use VelesDB upsert semantics
and always overwrite an existing document that has the same ID. `FAIL` is fully
enforced: the incoming IDs are looked up in the collection before writing and
`DuplicateDocumentError` is raised if any document already exists (prefer
`OVERWRITE` or `NONE` for bulk loads to skip the lookup cost).
|
|
142
|
+
|
|
143
|
+
**Note on document IDs and SHA-256:** Haystack string IDs are mapped to 63-bit
integers using the first 8 bytes of SHA-256 (~9.2 × 10¹⁸ slots). For a
1 M-document collection the probability of any collision (birthday bound,
n² / 2N) is roughly 5 × 10⁻⁸, which is negligible for typical RAG workloads.
A `ValueError` is raised at write time if a collision is detected between two
documents in the same write batch, or, when the `FAIL` policy checks for
existing documents, between a new document and a stored one.
|
|
148
|
+
|
|
149
|
+
**Note on `scale_score`:** When `True` (default), cosine similarity scores
|
|
150
|
+
are normalised from `[-1, 1]` to `[0, 1]` so they behave like probabilities
|
|
151
|
+
in downstream re-ranking.
|
|
152
|
+
|
|
153
|
+
## Running tests
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
cd integrations/haystack
|
|
157
|
+
pip install -e ".[dev]"
|
|
158
|
+
pytest tests/ -v
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Tests use lightweight fake VelesDB objects — no running server required.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
haystack_velesdb/__init__.py,sha256=cLohq27QUAJIgaJI6lbULnleSLBsF5bpvdcYSzgHFAU,182
|
|
2
|
+
haystack_velesdb/document_store.py,sha256=osxoJYH7pQIz6ZPGT1xz3eXaaVLB9mRT5fetHJFqLhg,14283
|
|
3
|
+
haystack_velesdb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
haystack_velesdb-1.14.1.dist-info/licenses/LICENSE,sha256=YpLk4nyLjvWm2Ls3Tel-oI1iTgFlNpxJN-6kb5Ck1zY,1069
|
|
5
|
+
haystack_velesdb-1.14.1.dist-info/METADATA,sha256=by3EY-zqOCoagq-K-j_NQh1UFVIZ3E-HOwi6HDAAFsw,6024
|
|
6
|
+
haystack_velesdb-1.14.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
haystack_velesdb-1.14.1.dist-info/top_level.txt,sha256=t5iFDRUDDh5niR6Dc_3gGe-4HnUHITvo6F1hIALXUns,17
|
|
8
|
+
haystack_velesdb-1.14.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 VelesDB Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
haystack_velesdb
|