haystack-velesdb 2.0.0__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {haystack_velesdb-2.0.0/src/haystack_velesdb.egg-info → haystack_velesdb-3.0.0}/PKG-INFO +1 -1
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/pyproject.toml +1 -1
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb/__init__.py +1 -1
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb/document_store.py +107 -23
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0/src/haystack_velesdb.egg-info}/PKG-INFO +1 -1
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/tests/test_document_store.py +284 -1
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/LICENSE +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/README.md +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/setup.cfg +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb/py.typed +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/SOURCES.txt +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/dependency_links.txt +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/requires.txt +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/top_level.txt +0 -0
- {haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/tests/test_real_haystack_integration.py +0 -0
|
@@ -5,7 +5,6 @@ as the vector backend in any Haystack 2.x indexing or retrieval pipeline.
|
|
|
5
5
|
"""
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
import hashlib
|
|
9
8
|
import logging
|
|
10
9
|
from typing import Any, Dict, List, Optional
|
|
11
10
|
|
|
@@ -15,9 +14,12 @@ from haystack.document_stores.errors import DuplicateDocumentError
|
|
|
15
14
|
from haystack.document_stores.types import DuplicatePolicy
|
|
16
15
|
|
|
17
16
|
import velesdb
|
|
17
|
+
from velesdb_common.fusion import build_fusion_strategy
|
|
18
|
+
from velesdb_common.ids import stable_hash_id
|
|
18
19
|
from velesdb_common.security import (
|
|
19
20
|
validate_collection_name,
|
|
20
21
|
validate_metric,
|
|
22
|
+
validate_named_sparse_vector,
|
|
21
23
|
validate_path,
|
|
22
24
|
)
|
|
23
25
|
|
|
@@ -29,7 +31,6 @@ _DEFAULT_COLLECTION = "haystack_documents"
|
|
|
29
31
|
_DEFAULT_DIMENSION = 768
|
|
30
32
|
_DEFAULT_METRIC = "cosine"
|
|
31
33
|
_DEFAULT_SCROLL_LIMIT = 10_000
|
|
32
|
-
_INT63_MASK = (1 << 63) - 1
|
|
33
34
|
# Reserved keys stored by this integration in the VelesDB payload.
|
|
34
35
|
_RESERVED_PAYLOAD_KEYS = frozenset({"_doc_id", "content"})
|
|
35
36
|
|
|
@@ -193,25 +194,17 @@ def _translate_haystack_filter(
|
|
|
193
194
|
return {"condition": _translate_condition(filters)}
|
|
194
195
|
|
|
195
196
|
|
|
196
|
-
def _str_id_to_int(doc_id: str) -> int:
|
|
197
|
-
"""Map a Haystack string document ID to a stable positive 63-bit integer.
|
|
198
|
-
|
|
199
|
-
Uses the first 8 bytes of SHA-256, masked to 63 bits (~9.2 × 10¹⁸ slots).
|
|
200
|
-
Collision probability for a 1 M-document collection is roughly 5 × 10⁻¹⁴ —
|
|
201
|
-
negligible for typical RAG workloads but not zero. If two distinct string
|
|
202
|
-
IDs produce the same integer ID, :meth:`write_documents` raises
|
|
203
|
-
:class:`ValueError` rather than silently overwriting the existing document.
|
|
204
|
-
"""
|
|
205
|
-
return int.from_bytes(hashlib.sha256(doc_id.encode()).digest()[:8], "big") & _INT63_MASK
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def _doc_to_point(doc: Document) -> dict:
|
|
197
|
+
def _doc_to_point(doc: Document, sparse_vector: Optional[dict] = None) -> dict:
|
|
209
198
|
"""Convert a Haystack Document to a VelesDB point dict.
|
|
210
199
|
|
|
211
200
|
Reserved payload keys (``_doc_id``, ``content``) are always written from
|
|
212
201
|
the document's canonical fields, not from ``doc.meta``. Any meta entry
|
|
213
202
|
that shares a reserved name is silently dropped from the payload to
|
|
214
203
|
prevent round-trip corruption.
|
|
204
|
+
|
|
205
|
+
When *sparse_vector* is given (a flat ``dict[int, float]`` or a named
|
|
206
|
+
``dict[str, dict[int, float]]`` mapping) it is attached so the upsert
|
|
207
|
+
creates the matching sparse index for hybrid retrieval.
|
|
215
208
|
"""
|
|
216
209
|
payload: dict = {}
|
|
217
210
|
# Merge meta first; reserved keys are excluded so they cannot
|
|
@@ -223,9 +216,11 @@ def _doc_to_point(doc: Document) -> dict:
|
|
|
223
216
|
payload["_doc_id"] = doc.id
|
|
224
217
|
if doc.content is not None:
|
|
225
218
|
payload["content"] = doc.content
|
|
226
|
-
point: dict = {"id": _str_id_to_int(doc.id), "payload": payload}
|
|
219
|
+
point: dict = {"id": stable_hash_id(doc.id), "payload": payload}
|
|
227
220
|
if doc.embedding is not None:
|
|
228
221
|
point["vector"] = list(doc.embedding)
|
|
222
|
+
if sparse_vector is not None:
|
|
223
|
+
point["sparse_vector"] = sparse_vector
|
|
229
224
|
return point
|
|
230
225
|
|
|
231
226
|
|
|
@@ -281,7 +276,7 @@ def _build_int_id_map(documents: List[Document]) -> Dict[int, str]:
|
|
|
281
276
|
"""
|
|
282
277
|
int_id_map: Dict[int, str] = {}
|
|
283
278
|
for doc in documents:
|
|
284
|
-
iid = _str_id_to_int(doc.id)
|
|
279
|
+
iid = stable_hash_id(doc.id)
|
|
285
280
|
if iid in int_id_map and int_id_map[iid] != doc.id:
|
|
286
281
|
raise ValueError(
|
|
287
282
|
f"SHA-256 collision in write batch: '{int_id_map[iid]}' and "
|
|
@@ -342,18 +337,47 @@ def _filter_skip_policy(
|
|
|
342
337
|
return [doc for doc in documents if str_to_int[doc.id] not in existing_int_ids]
|
|
343
338
|
|
|
344
339
|
|
|
345
|
-
def _documents_to_points(documents: List[Document]) -> List[dict]:
|
|
340
|
+
def _build_sparse_by_id(
|
|
341
|
+
documents: List[Document],
|
|
342
|
+
sparse_vectors: Optional[List[dict]],
|
|
343
|
+
) -> Dict[str, dict]:
|
|
344
|
+
"""Map each document id to its validated sparse vector.
|
|
345
|
+
|
|
346
|
+
Keying by document id (rather than list position) keeps the sparse
|
|
347
|
+
vectors aligned with their documents even when ``DuplicatePolicy.SKIP``
|
|
348
|
+
drops a subset before upsert. Each entry is validated as a flat
|
|
349
|
+
``dict[int, float]`` or a named ``dict[str, dict[int, float]]`` mapping.
|
|
350
|
+
"""
|
|
351
|
+
if sparse_vectors is None:
|
|
352
|
+
return {}
|
|
353
|
+
sparse_by_id: Dict[str, dict] = {}
|
|
354
|
+
for idx, doc in enumerate(documents):
|
|
355
|
+
if idx >= len(sparse_vectors):
|
|
356
|
+
break
|
|
357
|
+
sparse_by_id[doc.id] = validate_named_sparse_vector(sparse_vectors[idx])
|
|
358
|
+
return sparse_by_id
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _documents_to_points(
|
|
362
|
+
documents: List[Document],
|
|
363
|
+
sparse_by_id: Optional[Dict[str, dict]] = None,
|
|
364
|
+
) -> List[dict]:
|
|
346
365
|
"""Convert each document to its VelesDB point dict, logging documents
|
|
347
366
|
that lack an embedding so the caller still gets feedback when the
|
|
348
367
|
underlying SDK accepts vector-less points.
|
|
368
|
+
|
|
369
|
+
*sparse_by_id* (when given) maps document ids to their sparse vector dict;
|
|
370
|
+
each is attached to its point so the upsert creates the corresponding
|
|
371
|
+
sparse index.
|
|
349
372
|
"""
|
|
373
|
+
sparse_by_id = sparse_by_id or {}
|
|
350
374
|
points: List[dict] = []
|
|
351
375
|
for doc in documents:
|
|
352
376
|
if doc.embedding is None:
|
|
353
377
|
logger.warning(
|
|
354
378
|
"Document '%s' has no embedding; stored without vector.", doc.id
|
|
355
379
|
)
|
|
356
|
-
points.append(_doc_to_point(doc))
|
|
380
|
+
points.append(_doc_to_point(doc, sparse_vector=sparse_by_id.get(doc.id)))
|
|
357
381
|
return points
|
|
358
382
|
|
|
359
383
|
|
|
@@ -460,6 +484,7 @@ class VelesDBDocumentStore:
|
|
|
460
484
|
self,
|
|
461
485
|
documents: List[Document],
|
|
462
486
|
policy: DuplicatePolicy = DuplicatePolicy.NONE,
|
|
487
|
+
sparse_vectors: Optional[List[dict]] = None,
|
|
463
488
|
) -> int:
|
|
464
489
|
"""Write *documents* to VelesDB and return the number written.
|
|
465
490
|
|
|
@@ -476,6 +501,16 @@ class VelesDBDocumentStore:
|
|
|
476
501
|
incoming document already exists. Prefer ``OVERWRITE`` or
|
|
477
502
|
``NONE`` for large batches to avoid the pre-scan cost.
|
|
478
503
|
|
|
504
|
+
Args:
|
|
505
|
+
documents: Documents to write.
|
|
506
|
+
policy: Duplicate-handling policy (see above).
|
|
507
|
+
sparse_vectors: Optional list aligned with *documents*; each entry
|
|
508
|
+
is a flat ``dict[int, float]`` or a named
|
|
509
|
+
``dict[str, dict[int, float]]`` mapping (e.g.
|
|
510
|
+
``{"bge_m3": {0: 1.5}}``). A named mapping creates the named
|
|
511
|
+
sparse index so it can later be queried with
|
|
512
|
+
``sparse_index_name="bge_m3"``.
|
|
513
|
+
|
|
479
514
|
Raises:
|
|
480
515
|
DuplicateDocumentError: When *policy* is ``FAIL`` and at least
|
|
481
516
|
one document already exists in the store.
|
|
@@ -484,6 +519,7 @@ class VelesDBDocumentStore:
|
|
|
484
519
|
"""
|
|
485
520
|
if not documents:
|
|
486
521
|
return 0
|
|
522
|
+
sparse_by_id = _build_sparse_by_id(documents, sparse_vectors)
|
|
487
523
|
int_id_map = _build_int_id_map(documents)
|
|
488
524
|
col = self._get_collection()
|
|
489
525
|
if policy == DuplicatePolicy.FAIL:
|
|
@@ -495,7 +531,7 @@ class VelesDBDocumentStore:
|
|
|
495
531
|
return 0
|
|
496
532
|
else:
|
|
497
533
|
survivors = documents
|
|
498
|
-
points = _documents_to_points(survivors)
|
|
534
|
+
points = _documents_to_points(survivors, sparse_by_id)
|
|
499
535
|
result = col.upsert(points)
|
|
500
536
|
return result if isinstance(result, int) else len(points)
|
|
501
537
|
|
|
@@ -506,7 +542,7 @@ class VelesDBDocumentStore:
|
|
|
506
542
|
"""Delete documents identified by their Haystack string IDs."""
|
|
507
543
|
if not document_ids:
|
|
508
544
|
return
|
|
509
|
-
int_ids = [_str_id_to_int(did) for did in document_ids]
|
|
545
|
+
int_ids = [stable_hash_id(did) for did in document_ids]
|
|
510
546
|
self._get_collection().delete(int_ids)
|
|
511
547
|
|
|
512
548
|
def embedding_retrieval(
|
|
@@ -516,6 +552,8 @@ class VelesDBDocumentStore:
|
|
|
516
552
|
top_k: int = 10,
|
|
517
553
|
filters: Optional[Dict[str, Any]] = None,
|
|
518
554
|
scale_score: bool = True,
|
|
555
|
+
fusion: Optional[str] = None,
|
|
556
|
+
fusion_params: Optional[dict] = None,
|
|
519
557
|
) -> List[Document]:
|
|
520
558
|
"""Return the *top_k* documents most similar to *query_embedding*.
|
|
521
559
|
|
|
@@ -527,13 +565,29 @@ class VelesDBDocumentStore:
|
|
|
527
565
|
forwarded; ``meta.<key>`` is stripped to ``<key>``.
|
|
528
566
|
scale_score: When ``True`` and ``metric="cosine"``, scores are
|
|
529
567
|
normalised from ``[-1, 1]`` to ``[0, 1]``. Ignored for other
|
|
530
|
-
metrics, where raw scores are returned unchanged.
|
|
568
|
+
metrics, where raw scores are returned unchanged. Score
|
|
569
|
+
scaling does not apply to fused (``fusion``) results, whose
|
|
570
|
+
scores come from the fusion strategy rather than the metric.
|
|
571
|
+
fusion: Optional fusion strategy name applied to the ranking —
|
|
572
|
+
one of ``"average"``, ``"maximum"``, ``"rrf"``,
|
|
573
|
+
``"weighted"``, ``"relative_score"`` / ``"rsf"``. When set,
|
|
574
|
+
the query is ranked through the chosen
|
|
575
|
+
:class:`velesdb.FusionStrategy`, which changes the result
|
|
576
|
+
ordering relative to the default dense ranking. ``filters``
|
|
577
|
+
are not supported together with ``fusion``.
|
|
578
|
+
fusion_params: Optional parameters for *fusion* (see
|
|
579
|
+
:func:`velesdb_common.fusion.build_fusion_strategy`).
|
|
531
580
|
|
|
532
581
|
Raises:
|
|
533
582
|
NotImplementedError: When *filters* uses an operator VelesDB
|
|
534
583
|
does not support.
|
|
535
|
-
ValueError: When *filters* is structurally malformed
|
|
584
|
+
ValueError: When *filters* is structurally malformed, or when
|
|
585
|
+
*filters* is combined with *fusion*.
|
|
536
586
|
"""
|
|
587
|
+
if fusion is not None:
|
|
588
|
+
return self._fusion_retrieval(
|
|
589
|
+
query_embedding, top_k, filters, fusion, fusion_params
|
|
590
|
+
)
|
|
537
591
|
veles_filter = _translate_haystack_filter(filters)
|
|
538
592
|
results: List[dict] = self._get_collection().search_request(
|
|
539
593
|
velesdb.SearchOptions(
|
|
@@ -544,6 +598,36 @@ class VelesDBDocumentStore:
|
|
|
544
598
|
)
|
|
545
599
|
return [_result_to_doc(r, scale_score=scale_score, metric=self._metric) for r in results]
|
|
546
600
|
|
|
601
|
+
def _fusion_retrieval(
|
|
602
|
+
self,
|
|
603
|
+
query_embedding: List[float],
|
|
604
|
+
top_k: int,
|
|
605
|
+
filters: Optional[Dict[str, Any]],
|
|
606
|
+
fusion: str,
|
|
607
|
+
fusion_params: Optional[dict],
|
|
608
|
+
) -> List[Document]:
|
|
609
|
+
"""Rank a single query through a :class:`velesdb.FusionStrategy`.
|
|
610
|
+
|
|
611
|
+
Delegates to ``Collection.multi_query_search`` with a one-element
|
|
612
|
+
query list so the chosen strategy decides the fused scores. The
|
|
613
|
+
shared :func:`velesdb_common.fusion.build_fusion_strategy` builder is
|
|
614
|
+
reused (same as the LangChain and LlamaIndex integrations).
|
|
615
|
+
"""
|
|
616
|
+
if filters is not None:
|
|
617
|
+
raise ValueError(
|
|
618
|
+
"fusion cannot be combined with filters; apply filters in a "
|
|
619
|
+
"separate dense embedding_retrieval call or omit fusion."
|
|
620
|
+
)
|
|
621
|
+
strategy = build_fusion_strategy(fusion, fusion_params)
|
|
622
|
+
results: List[dict] = self._get_collection().multi_query_search(
|
|
623
|
+
vectors=[query_embedding],
|
|
624
|
+
top_k=top_k,
|
|
625
|
+
fusion=strategy,
|
|
626
|
+
)
|
|
627
|
+
# Fused scores are strategy-derived, not metric similarities, so the
|
|
628
|
+
# cosine [-1, 1] -> [0, 1] rescaling is intentionally not applied.
|
|
629
|
+
return [_result_to_doc(r, metric=self._metric) for r in results]
|
|
630
|
+
|
|
547
631
|
# ------------------------------------------------------------------
|
|
548
632
|
# Haystack pipeline serialisation
|
|
549
633
|
# ------------------------------------------------------------------
|
|
@@ -43,6 +43,78 @@ class DuplicateDocumentError(Exception):
|
|
|
43
43
|
# ---------------------------------------------------------------------------
|
|
44
44
|
|
|
45
45
|
|
|
46
|
+
class _FakeFusionStrategy:
|
|
47
|
+
"""Minimal stand-in for velesdb.FusionStrategy.
|
|
48
|
+
|
|
49
|
+
Records the strategy name so a fake ``multi_query_search`` can vary its
|
|
50
|
+
result ordering by strategy (mirrors the real binding, where different
|
|
51
|
+
strategies produce different fused scores).
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, name: str, params: Optional[dict] = None) -> None:
|
|
55
|
+
self.name = name
|
|
56
|
+
self.params = params or {}
|
|
57
|
+
|
|
58
|
+
@staticmethod
|
|
59
|
+
def average() -> "_FakeFusionStrategy":
|
|
60
|
+
return _FakeFusionStrategy("average")
|
|
61
|
+
|
|
62
|
+
@staticmethod
|
|
63
|
+
def maximum() -> "_FakeFusionStrategy":
|
|
64
|
+
return _FakeFusionStrategy("maximum")
|
|
65
|
+
|
|
66
|
+
@staticmethod
|
|
67
|
+
def rrf(k: int = 60) -> "_FakeFusionStrategy":
|
|
68
|
+
return _FakeFusionStrategy("rrf", {"k": k})
|
|
69
|
+
|
|
70
|
+
@staticmethod
|
|
71
|
+
def weighted(
|
|
72
|
+
avg_weight: float = 0.6,
|
|
73
|
+
max_weight: float = 0.3,
|
|
74
|
+
hit_weight: float = 0.1,
|
|
75
|
+
) -> "_FakeFusionStrategy":
|
|
76
|
+
return _FakeFusionStrategy(
|
|
77
|
+
"weighted",
|
|
78
|
+
{
|
|
79
|
+
"avg_weight": avg_weight,
|
|
80
|
+
"max_weight": max_weight,
|
|
81
|
+
"hit_weight": hit_weight,
|
|
82
|
+
},
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
@staticmethod
|
|
86
|
+
def relative_score(
|
|
87
|
+
dense_weight: float, sparse_weight: float
|
|
88
|
+
) -> "_FakeFusionStrategy":
|
|
89
|
+
return _FakeFusionStrategy(
|
|
90
|
+
"relative_score",
|
|
91
|
+
{"dense_weight": dense_weight, "sparse_weight": sparse_weight},
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _build_fake_fusion(
|
|
96
|
+
fusion: str, fusion_params: Optional[dict] = None
|
|
97
|
+
) -> _FakeFusionStrategy:
|
|
98
|
+
"""Stand-in for velesdb_common.fusion.build_fusion_strategy.
|
|
99
|
+
|
|
100
|
+
Maps the strategy name to the matching FusionStrategy factory so the
|
|
101
|
+
document store's fusion routing can be exercised without the real
|
|
102
|
+
velesdb_common package.
|
|
103
|
+
"""
|
|
104
|
+
params = fusion_params or {}
|
|
105
|
+
if fusion in ("relative_score", "rsf"):
|
|
106
|
+
return _FakeFusionStrategy.relative_score(
|
|
107
|
+
params.get("dense_weight", 0.5), params.get("sparse_weight", 0.5)
|
|
108
|
+
)
|
|
109
|
+
if fusion == "weighted":
|
|
110
|
+
return _FakeFusionStrategy.weighted()
|
|
111
|
+
if fusion == "maximum":
|
|
112
|
+
return _FakeFusionStrategy.maximum()
|
|
113
|
+
if fusion == "average":
|
|
114
|
+
return _FakeFusionStrategy.average()
|
|
115
|
+
return _FakeFusionStrategy.rrf(params.get("k", 60))
|
|
116
|
+
|
|
117
|
+
|
|
46
118
|
class _FakeSearchOptions:
|
|
47
119
|
"""Minimal stand-in for velesdb.SearchOptions used by search_request."""
|
|
48
120
|
|
|
@@ -95,6 +167,33 @@ class _FakeCollection:
|
|
|
95
167
|
"""Canonical search entry point — delegate to the legacy `search`."""
|
|
96
168
|
return self.search(opts.vector, top_k=opts.top_k, filter=opts.filter)
|
|
97
169
|
|
|
170
|
+
def multi_query_search(
|
|
171
|
+
self,
|
|
172
|
+
vectors: list,
|
|
173
|
+
top_k: int = 10,
|
|
174
|
+
fusion: Any = None,
|
|
175
|
+
filter: Any = None, # pylint: disable=redefined-builtin
|
|
176
|
+
) -> list:
|
|
177
|
+
"""Fused multi-query search whose ordering depends on the strategy.
|
|
178
|
+
|
|
179
|
+
The real binding produces strategy-dependent fused scores. This fake
|
|
180
|
+
reproduces that observable behaviour: the points are sorted by a
|
|
181
|
+
per-strategy key so callers can assert that ``fusion='rsf'`` and
|
|
182
|
+
``fusion='weighted'`` yield different orderings.
|
|
183
|
+
"""
|
|
184
|
+
del vectors, filter # the fake ignores these
|
|
185
|
+
points = list(self._points.values())
|
|
186
|
+
name = getattr(fusion, "name", "rrf")
|
|
187
|
+
# Reverse the order for relative_score so the resulting ranking
|
|
188
|
+
# differs from the default (rrf) and from weighted.
|
|
189
|
+
reverse = name in ("relative_score", "rsf")
|
|
190
|
+
ordered = points[::-1] if reverse else points
|
|
191
|
+
results = [
|
|
192
|
+
{"id": p["id"], "score": 0.9, "payload": p.get("payload", {})}
|
|
193
|
+
for p in ordered[:top_k]
|
|
194
|
+
]
|
|
195
|
+
return results
|
|
196
|
+
|
|
98
197
|
def scroll( # pylint: disable=redefined-builtin
|
|
99
198
|
self,
|
|
100
199
|
*,
|
|
@@ -172,7 +271,9 @@ def _load_module() -> types.ModuleType:
|
|
|
172
271
|
sys.modules["haystack.document_stores.errors"] = errors_mod
|
|
173
272
|
|
|
174
273
|
sys.modules["velesdb"] = types.SimpleNamespace( # type: ignore
|
|
175
|
-
Database=_FakeDatabase,
|
|
274
|
+
Database=_FakeDatabase,
|
|
275
|
+
SearchOptions=_FakeSearchOptions,
|
|
276
|
+
FusionStrategy=_FakeFusionStrategy,
|
|
176
277
|
)
|
|
177
278
|
|
|
178
279
|
# Stub velesdb_common.security with no-op validators (real package has its own tests).
|
|
@@ -181,13 +282,29 @@ def _load_module() -> types.ModuleType:
|
|
|
181
282
|
|
|
182
283
|
vc_mod = types.ModuleType("velesdb_common")
|
|
183
284
|
sys.modules["velesdb_common"] = vc_mod
|
|
285
|
+
|
|
286
|
+
# Load the REAL velesdb_common.ids (pure stdlib) so the store exercises the
|
|
287
|
+
# canonical stable_hash_id rather than a forked copy (single-source-of-truth
|
|
288
|
+
# + license hygiene — see docs/planning/CORE_PARITY_REMEDIATION.md T3).
|
|
289
|
+
_ids_path = Path(__file__).resolve().parents[2] / "common" / "src" / "velesdb_common" / "ids.py"
|
|
290
|
+
_ids_spec = importlib.util.spec_from_file_location("velesdb_common.ids", _ids_path)
|
|
291
|
+
assert _ids_spec and _ids_spec.loader
|
|
292
|
+
vc_ids = importlib.util.module_from_spec(_ids_spec)
|
|
293
|
+
sys.modules["velesdb_common.ids"] = vc_ids
|
|
294
|
+
_ids_spec.loader.exec_module(vc_ids)
|
|
295
|
+
|
|
184
296
|
vc_sec = types.ModuleType("velesdb_common.security")
|
|
185
297
|
vc_sec.validate_path = _passthrough # type: ignore[attr-defined]
|
|
186
298
|
vc_sec.validate_collection_name = _passthrough # type: ignore[attr-defined]
|
|
187
299
|
vc_sec.validate_metric = _passthrough # type: ignore[attr-defined]
|
|
300
|
+
vc_sec.validate_named_sparse_vector = _passthrough # type: ignore[attr-defined]
|
|
188
301
|
vc_sec.SecurityError = ValueError # type: ignore[attr-defined]
|
|
189
302
|
sys.modules["velesdb_common.security"] = vc_sec
|
|
190
303
|
|
|
304
|
+
vc_fusion = types.ModuleType("velesdb_common.fusion")
|
|
305
|
+
vc_fusion.build_fusion_strategy = _build_fake_fusion # type: ignore[attr-defined]
|
|
306
|
+
sys.modules["velesdb_common.fusion"] = vc_fusion
|
|
307
|
+
|
|
191
308
|
pkg = types.ModuleType("haystack_velesdb")
|
|
192
309
|
pkg.__path__ = [str(root)] # type: ignore[attr-defined]
|
|
193
310
|
sys.modules["haystack_velesdb"] = pkg
|
|
@@ -782,3 +899,169 @@ def test_embedding_retrieval_translates_haystack_filter_to_veles_shape() -> None
|
|
|
782
899
|
}, "embedding_retrieval must translate Haystack filter to VelesDB Filter shape"
|
|
783
900
|
finally:
|
|
784
901
|
_MOD.velesdb = original_velesdb
|
|
902
|
+
|
|
903
|
+
|
|
904
|
+
# ---------------------------------------------------------------------------
|
|
905
|
+
# I1: fusion (RSF / Weighted) on embedding_retrieval
|
|
906
|
+
# ---------------------------------------------------------------------------
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def _store_with_three_docs(name: str) -> Any:
|
|
910
|
+
store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name=name)
|
|
911
|
+
store.write_documents([
|
|
912
|
+
Document(id="d1", content="one", embedding=[0.1]),
|
|
913
|
+
Document(id="d2", content="two", embedding=[0.2]),
|
|
914
|
+
Document(id="d3", content="three", embedding=[0.3]),
|
|
915
|
+
])
|
|
916
|
+
return store
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def test_embedding_retrieval_fusion_changes_ordering_vs_default() -> None:
|
|
920
|
+
"""fusion='rsf' must reorder results relative to the default ranking."""
|
|
921
|
+
store = _store_with_three_docs("t_fusion_rsf")
|
|
922
|
+
default_ids = [d.id for d in store.embedding_retrieval([0.1], top_k=3)]
|
|
923
|
+
rsf_ids = [
|
|
924
|
+
d.id for d in store.embedding_retrieval([0.1], top_k=3, fusion="rsf")
|
|
925
|
+
]
|
|
926
|
+
assert rsf_ids != default_ids, "fusion='rsf' must change result ordering"
|
|
927
|
+
assert sorted(rsf_ids) == sorted(default_ids), "same doc set, different order"
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
def test_embedding_retrieval_rsf_and_weighted_differ() -> None:
|
|
931
|
+
"""rsf and weighted fusion must produce different orderings."""
|
|
932
|
+
store = _store_with_three_docs("t_fusion_pair")
|
|
933
|
+
rsf_ids = [
|
|
934
|
+
d.id for d in store.embedding_retrieval([0.1], top_k=3, fusion="rsf")
|
|
935
|
+
]
|
|
936
|
+
weighted_ids = [
|
|
937
|
+
d.id
|
|
938
|
+
for d in store.embedding_retrieval([0.1], top_k=3, fusion="weighted")
|
|
939
|
+
]
|
|
940
|
+
assert rsf_ids != weighted_ids, "rsf and weighted must differ in ordering"
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def test_embedding_retrieval_fusion_passes_params() -> None:
|
|
944
|
+
"""fusion_params must reach build_fusion_strategy and the collection."""
|
|
945
|
+
captured: dict = {}
|
|
946
|
+
|
|
947
|
+
class _CapturingCollection(_FakeCollection):
|
|
948
|
+
def multi_query_search(
|
|
949
|
+
self,
|
|
950
|
+
vectors: list,
|
|
951
|
+
top_k: int = 10,
|
|
952
|
+
fusion: Any = None,
|
|
953
|
+
filter: Any = None, # pylint: disable=redefined-builtin
|
|
954
|
+
) -> list:
|
|
955
|
+
captured["fusion_name"] = getattr(fusion, "name", None)
|
|
956
|
+
captured["fusion_params"] = getattr(fusion, "params", None)
|
|
957
|
+
return super().multi_query_search(
|
|
958
|
+
vectors, top_k=top_k, fusion=fusion, filter=filter
|
|
959
|
+
)
|
|
960
|
+
|
|
961
|
+
class _CapturingDatabase:
|
|
962
|
+
def __init__(self, path: str) -> None:
|
|
963
|
+
self._col = _CapturingCollection()
|
|
964
|
+
|
|
965
|
+
def get_collection(self, name: str) -> _CapturingCollection:
|
|
966
|
+
return self._col
|
|
967
|
+
|
|
968
|
+
def create_collection(
|
|
969
|
+
self, name: str, dimension: int, metric: str
|
|
970
|
+
) -> _CapturingCollection:
|
|
971
|
+
return self._col
|
|
972
|
+
|
|
973
|
+
original_velesdb = _MOD.velesdb
|
|
974
|
+
try:
|
|
975
|
+
_MOD.velesdb = types.SimpleNamespace( # type: ignore
|
|
976
|
+
Database=_CapturingDatabase,
|
|
977
|
+
SearchOptions=_FakeSearchOptions,
|
|
978
|
+
FusionStrategy=_FakeFusionStrategy,
|
|
979
|
+
)
|
|
980
|
+
store = _MOD.VelesDBDocumentStore(
|
|
981
|
+
path="/tmp/hs", collection_name="t_fusion_params"
|
|
982
|
+
)
|
|
983
|
+
store.write_documents([Document(id="p", content="x", embedding=[0.5])])
|
|
984
|
+
store.embedding_retrieval(
|
|
985
|
+
[0.5],
|
|
986
|
+
top_k=3,
|
|
987
|
+
fusion="rsf",
|
|
988
|
+
fusion_params={"dense_weight": 0.7, "sparse_weight": 0.3},
|
|
989
|
+
)
|
|
990
|
+
assert captured["fusion_name"] == "relative_score"
|
|
991
|
+
assert captured["fusion_params"]["dense_weight"] == 0.7
|
|
992
|
+
finally:
|
|
993
|
+
_MOD.velesdb = original_velesdb
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
# ---------------------------------------------------------------------------
|
|
997
|
+
# I2: named-sparse-index creation on write_documents
|
|
998
|
+
# ---------------------------------------------------------------------------
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
def test_write_documents_forwards_named_sparse_vectors() -> None:
|
|
1002
|
+
"""A named sparse vector dict must reach the upserted point so the
|
|
1003
|
+
underlying named sparse index is created.
|
|
1004
|
+
"""
|
|
1005
|
+
captured: dict = {}
|
|
1006
|
+
|
|
1007
|
+
class _CapturingCollection(_FakeCollection):
|
|
1008
|
+
def upsert(self, points: list) -> int:
|
|
1009
|
+
captured["points"] = points
|
|
1010
|
+
return super().upsert(points)
|
|
1011
|
+
|
|
1012
|
+
class _CapturingDatabase:
|
|
1013
|
+
def __init__(self, path: str) -> None:
|
|
1014
|
+
self._col = _CapturingCollection()
|
|
1015
|
+
|
|
1016
|
+
def get_collection(self, name: str) -> _CapturingCollection:
|
|
1017
|
+
return self._col
|
|
1018
|
+
|
|
1019
|
+
def create_collection(
|
|
1020
|
+
self, name: str, dimension: int, metric: str
|
|
1021
|
+
) -> _CapturingCollection:
|
|
1022
|
+
return self._col
|
|
1023
|
+
|
|
1024
|
+
original_velesdb = _MOD.velesdb
|
|
1025
|
+
try:
|
|
1026
|
+
_MOD.velesdb = types.SimpleNamespace( # type: ignore
|
|
1027
|
+
Database=_CapturingDatabase,
|
|
1028
|
+
SearchOptions=_FakeSearchOptions,
|
|
1029
|
+
FusionStrategy=_FakeFusionStrategy,
|
|
1030
|
+
)
|
|
1031
|
+
store = _MOD.VelesDBDocumentStore(
|
|
1032
|
+
path="/tmp/hs", collection_name="t_named_sparse"
|
|
1033
|
+
)
|
|
1034
|
+
store.write_documents(
|
|
1035
|
+
[Document(id="s1", content="hi", embedding=[0.5])],
|
|
1036
|
+
sparse_vectors=[{"bge_m3": {0: 1.5, 7: 0.8}}],
|
|
1037
|
+
)
|
|
1038
|
+
point = captured["points"][0]
|
|
1039
|
+
assert point["sparse_vector"] == {"bge_m3": {0: 1.5, 7: 0.8}}
|
|
1040
|
+
finally:
|
|
1041
|
+
_MOD.velesdb = original_velesdb
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
def test_id_hashing_uses_canonical_stable_hash_id():
|
|
1045
|
+
"""T3: the store delegates string->int ID hashing to the shared
|
|
1046
|
+
velesdb_common.ids.stable_hash_id, not a forked local copy. This keeps the
|
|
1047
|
+
same logical document mapped to the same VelesDB point ID across every
|
|
1048
|
+
integration (single source of truth) and avoids re-implementing the hash in
|
|
1049
|
+
an MIT package (license hygiene). See docs/planning/CORE_PARITY_REMEDIATION.md.
|
|
1050
|
+
"""
|
|
1051
|
+
import hashlib
|
|
1052
|
+
|
|
1053
|
+
import velesdb_common.ids as canonical_ids
|
|
1054
|
+
|
|
1055
|
+
# the module imported the canonical helper, and the forked copy is gone
|
|
1056
|
+
assert _MOD.stable_hash_id is canonical_ids.stable_hash_id
|
|
1057
|
+
assert not hasattr(_MOD, "_str_id_to_int")
|
|
1058
|
+
assert not hasattr(_MOD, "_INT63_MASK")
|
|
1059
|
+
|
|
1060
|
+
# and it yields the canonical positive-63-bit value
|
|
1061
|
+
for doc_id in ["", "doc-1", "héllo-世界", "Document_42::chunk#3", "a" * 500]:
|
|
1062
|
+
expected = (
|
|
1063
|
+
int.from_bytes(hashlib.sha256(doc_id.encode("utf-8")).digest()[:8], "big")
|
|
1064
|
+
& 0x7FFFFFFFFFFFFFFF
|
|
1065
|
+
)
|
|
1066
|
+
assert _MOD.stable_hash_id(doc_id) == expected
|
|
1067
|
+
assert 0 <= _MOD.stable_hash_id(doc_id) <= 0x7FFFFFFFFFFFFFFF
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/requires.txt
RENAMED
|
File without changes
|
{haystack_velesdb-2.0.0 → haystack_velesdb-3.0.0}/src/haystack_velesdb.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|