haystack-velesdb 2.0.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-velesdb
3
- Version: 2.0.0
3
+ Version: 3.0.0
4
4
  Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
5
5
  Author-email: VelesDB Team <contact@wiscale.fr>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "haystack-velesdb"
7
- version = "2.0.0"
7
+ version = "3.0.0"
8
8
  description = "Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -3,4 +3,4 @@
3
3
  from haystack_velesdb.document_store import VelesDBDocumentStore
4
4
 
5
5
  __all__ = ["VelesDBDocumentStore"]
6
- __version__ = "2.0.0"
6
+ __version__ = "3.0.0"
@@ -5,7 +5,6 @@ as the vector backend in any Haystack 2.x indexing or retrieval pipeline.
5
5
  """
6
6
  from __future__ import annotations
7
7
 
8
- import hashlib
9
8
  import logging
10
9
  from typing import Any, Dict, List, Optional
11
10
 
@@ -15,9 +14,12 @@ from haystack.document_stores.errors import DuplicateDocumentError
15
14
  from haystack.document_stores.types import DuplicatePolicy
16
15
 
17
16
  import velesdb
17
+ from velesdb_common.fusion import build_fusion_strategy
18
+ from velesdb_common.ids import stable_hash_id
18
19
  from velesdb_common.security import (
19
20
  validate_collection_name,
20
21
  validate_metric,
22
+ validate_named_sparse_vector,
21
23
  validate_path,
22
24
  )
23
25
 
@@ -29,7 +31,6 @@ _DEFAULT_COLLECTION = "haystack_documents"
29
31
  _DEFAULT_DIMENSION = 768
30
32
  _DEFAULT_METRIC = "cosine"
31
33
  _DEFAULT_SCROLL_LIMIT = 10_000
32
- _INT63_MASK = (1 << 63) - 1
33
34
  # Reserved keys stored by this integration in the VelesDB payload.
34
35
  _RESERVED_PAYLOAD_KEYS = frozenset({"_doc_id", "content"})
35
36
 
@@ -193,25 +194,17 @@ def _translate_haystack_filter(
193
194
  return {"condition": _translate_condition(filters)}
194
195
 
195
196
 
196
- def _str_id_to_int(doc_id: str) -> int:
197
- """Map a Haystack string document ID to a stable positive 63-bit integer.
198
-
199
- Uses the first 8 bytes of SHA-256, masked to 63 bits (~9.2 × 10¹⁸ slots).
200
- Collision probability for a 1 M-document collection is roughly 5 × 10⁻¹⁴ —
201
- negligible for typical RAG workloads but not zero. If two distinct string
202
- IDs produce the same integer ID, :meth:`write_documents` raises
203
- :class:`ValueError` rather than silently overwriting the existing document.
204
- """
205
- return int.from_bytes(hashlib.sha256(doc_id.encode()).digest()[:8], "big") & _INT63_MASK
206
-
207
-
208
- def _doc_to_point(doc: Document) -> dict:
197
+ def _doc_to_point(doc: Document, sparse_vector: Optional[dict] = None) -> dict:
209
198
  """Convert a Haystack Document to a VelesDB point dict.
210
199
 
211
200
  Reserved payload keys (``_doc_id``, ``content``) are always written from
212
201
  the document's canonical fields, not from ``doc.meta``. Any meta entry
213
202
  that shares a reserved name is silently dropped from the payload to
214
203
  prevent round-trip corruption.
204
+
205
+ When *sparse_vector* is given (a flat ``dict[int, float]`` or a named
206
+ ``dict[str, dict[int, float]]`` mapping) it is attached so the upsert
207
+ creates the matching sparse index for hybrid retrieval.
215
208
  """
216
209
  payload: dict = {}
217
210
  # Merge meta first; reserved keys are excluded so they cannot
@@ -223,9 +216,11 @@ def _doc_to_point(doc: Document) -> dict:
223
216
  payload["_doc_id"] = doc.id
224
217
  if doc.content is not None:
225
218
  payload["content"] = doc.content
226
- point: dict = {"id": _str_id_to_int(doc.id), "payload": payload}
219
+ point: dict = {"id": stable_hash_id(doc.id), "payload": payload}
227
220
  if doc.embedding is not None:
228
221
  point["vector"] = list(doc.embedding)
222
+ if sparse_vector is not None:
223
+ point["sparse_vector"] = sparse_vector
229
224
  return point
230
225
 
231
226
 
@@ -281,7 +276,7 @@ def _build_int_id_map(documents: List[Document]) -> Dict[int, str]:
281
276
  """
282
277
  int_id_map: Dict[int, str] = {}
283
278
  for doc in documents:
284
- iid = _str_id_to_int(doc.id)
279
+ iid = stable_hash_id(doc.id)
285
280
  if iid in int_id_map and int_id_map[iid] != doc.id:
286
281
  raise ValueError(
287
282
  f"SHA-256 collision in write batch: '{int_id_map[iid]}' and "
@@ -342,18 +337,47 @@ def _filter_skip_policy(
342
337
  return [doc for doc in documents if str_to_int[doc.id] not in existing_int_ids]
343
338
 
344
339
 
345
- def _documents_to_points(documents: List[Document]) -> List[dict]:
340
+ def _build_sparse_by_id(
341
+ documents: List[Document],
342
+ sparse_vectors: Optional[List[dict]],
343
+ ) -> Dict[str, dict]:
344
+ """Map each document id to its validated sparse vector.
345
+
346
+ Keying by document id (rather than list position) keeps the sparse
347
+ vectors aligned with their documents even when ``DuplicatePolicy.SKIP``
348
+ drops a subset before upsert. Each entry is validated as a flat
349
+ ``dict[int, float]`` or a named ``dict[str, dict[int, float]]`` mapping.
350
+ """
351
+ if sparse_vectors is None:
352
+ return {}
353
+ sparse_by_id: Dict[str, dict] = {}
354
+ for idx, doc in enumerate(documents):
355
+ if idx >= len(sparse_vectors):
356
+ break
357
+ sparse_by_id[doc.id] = validate_named_sparse_vector(sparse_vectors[idx])
358
+ return sparse_by_id
359
+
360
+
361
+ def _documents_to_points(
362
+ documents: List[Document],
363
+ sparse_by_id: Optional[Dict[str, dict]] = None,
364
+ ) -> List[dict]:
346
365
  """Convert each document to its VelesDB point dict, logging documents
347
366
  that lack an embedding so the caller still gets feedback when the
348
367
  underlying SDK accepts vector-less points.
368
+
369
+ *sparse_by_id* (when given) maps document ids to their sparse vector dict;
370
+ each is attached to its point so the upsert creates the corresponding
371
+ sparse index.
349
372
  """
373
+ sparse_by_id = sparse_by_id or {}
350
374
  points: List[dict] = []
351
375
  for doc in documents:
352
376
  if doc.embedding is None:
353
377
  logger.warning(
354
378
  "Document '%s' has no embedding; stored without vector.", doc.id
355
379
  )
356
- points.append(_doc_to_point(doc))
380
+ points.append(_doc_to_point(doc, sparse_vector=sparse_by_id.get(doc.id)))
357
381
  return points
358
382
 
359
383
 
@@ -460,6 +484,7 @@ class VelesDBDocumentStore:
460
484
  self,
461
485
  documents: List[Document],
462
486
  policy: DuplicatePolicy = DuplicatePolicy.NONE,
487
+ sparse_vectors: Optional[List[dict]] = None,
463
488
  ) -> int:
464
489
  """Write *documents* to VelesDB and return the number written.
465
490
 
@@ -476,6 +501,16 @@ class VelesDBDocumentStore:
476
501
  incoming document already exists. Prefer ``OVERWRITE`` or
477
502
  ``NONE`` for large batches to avoid the pre-scan cost.
478
503
 
504
+ Args:
505
+ documents: Documents to write.
506
+ policy: Duplicate-handling policy (see above).
507
+ sparse_vectors: Optional list aligned with *documents*; each entry
508
+ is a flat ``dict[int, float]`` or a named
509
+ ``dict[str, dict[int, float]]`` mapping (e.g.
510
+ ``{"bge_m3": {0: 1.5}}``). A named mapping creates the named
511
+ sparse index so it can later be queried with
512
+ ``sparse_index_name="bge_m3"``.
513
+
479
514
  Raises:
480
515
  DuplicateDocumentError: When *policy* is ``FAIL`` and at least
481
516
  one document already exists in the store.
@@ -484,6 +519,7 @@ class VelesDBDocumentStore:
484
519
  """
485
520
  if not documents:
486
521
  return 0
522
+ sparse_by_id = _build_sparse_by_id(documents, sparse_vectors)
487
523
  int_id_map = _build_int_id_map(documents)
488
524
  col = self._get_collection()
489
525
  if policy == DuplicatePolicy.FAIL:
@@ -495,7 +531,7 @@ class VelesDBDocumentStore:
495
531
  return 0
496
532
  else:
497
533
  survivors = documents
498
- points = _documents_to_points(survivors)
534
+ points = _documents_to_points(survivors, sparse_by_id)
499
535
  result = col.upsert(points)
500
536
  return result if isinstance(result, int) else len(points)
501
537
 
@@ -506,7 +542,7 @@ class VelesDBDocumentStore:
506
542
  """Delete documents identified by their Haystack string IDs."""
507
543
  if not document_ids:
508
544
  return
509
- int_ids = [_str_id_to_int(did) for did in document_ids]
545
+ int_ids = [stable_hash_id(did) for did in document_ids]
510
546
  self._get_collection().delete(int_ids)
511
547
 
512
548
  def embedding_retrieval(
@@ -516,6 +552,8 @@ class VelesDBDocumentStore:
516
552
  top_k: int = 10,
517
553
  filters: Optional[Dict[str, Any]] = None,
518
554
  scale_score: bool = True,
555
+ fusion: Optional[str] = None,
556
+ fusion_params: Optional[dict] = None,
519
557
  ) -> List[Document]:
520
558
  """Return the *top_k* documents most similar to *query_embedding*.
521
559
 
@@ -527,13 +565,29 @@ class VelesDBDocumentStore:
527
565
  forwarded; ``meta.<key>`` is stripped to ``<key>``.
528
566
  scale_score: When ``True`` and ``metric="cosine"``, scores are
529
567
  normalised from ``[-1, 1]`` to ``[0, 1]``. Ignored for other
530
- metrics, where raw scores are returned unchanged.
568
+ metrics, where raw scores are returned unchanged. Score
569
+ scaling does not apply to fused (``fusion``) results, whose
570
+ scores come from the fusion strategy rather than the metric.
571
+ fusion: Optional fusion strategy name applied to the ranking —
572
+ one of ``"average"``, ``"maximum"``, ``"rrf"``,
573
+ ``"weighted"``, ``"relative_score"`` / ``"rsf"``. When set,
574
+ the query is ranked through the chosen
575
+ :class:`velesdb.FusionStrategy`, which changes the result
576
+ ordering relative to the default dense ranking. ``filters``
577
+ are not supported together with ``fusion``.
578
+ fusion_params: Optional parameters for *fusion* (see
579
+ :func:`velesdb_common.fusion.build_fusion_strategy`).
531
580
 
532
581
  Raises:
533
582
  NotImplementedError: When *filters* uses an operator VelesDB
534
583
  does not support.
535
- ValueError: When *filters* is structurally malformed.
584
+ ValueError: When *filters* is structurally malformed, or when
585
+ *filters* is combined with *fusion*.
536
586
  """
587
+ if fusion is not None:
588
+ return self._fusion_retrieval(
589
+ query_embedding, top_k, filters, fusion, fusion_params
590
+ )
537
591
  veles_filter = _translate_haystack_filter(filters)
538
592
  results: List[dict] = self._get_collection().search_request(
539
593
  velesdb.SearchOptions(
@@ -544,6 +598,36 @@ class VelesDBDocumentStore:
544
598
  )
545
599
  return [_result_to_doc(r, scale_score=scale_score, metric=self._metric) for r in results]
546
600
 
601
+ def _fusion_retrieval(
602
+ self,
603
+ query_embedding: List[float],
604
+ top_k: int,
605
+ filters: Optional[Dict[str, Any]],
606
+ fusion: str,
607
+ fusion_params: Optional[dict],
608
+ ) -> List[Document]:
609
+ """Rank a single query through a :class:`velesdb.FusionStrategy`.
610
+
611
+ Delegates to ``Collection.multi_query_search`` with a one-element
612
+ query list so the chosen strategy decides the fused scores. The
613
+ shared :func:`velesdb_common.fusion.build_fusion_strategy` builder is
614
+ reused (same as the LangChain and LlamaIndex integrations).
615
+ """
616
+ if filters is not None:
617
+ raise ValueError(
618
+ "fusion cannot be combined with filters; apply filters in a "
619
+ "separate dense embedding_retrieval call or omit fusion."
620
+ )
621
+ strategy = build_fusion_strategy(fusion, fusion_params)
622
+ results: List[dict] = self._get_collection().multi_query_search(
623
+ vectors=[query_embedding],
624
+ top_k=top_k,
625
+ fusion=strategy,
626
+ )
627
+ # Fused scores are strategy-derived, not metric similarities, so the
628
+ # cosine [-1, 1] -> [0, 1] rescaling is intentionally not applied.
629
+ return [_result_to_doc(r, metric=self._metric) for r in results]
630
+
547
631
  # ------------------------------------------------------------------
548
632
  # Haystack pipeline serialisation
549
633
  # ------------------------------------------------------------------
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haystack-velesdb
3
- Version: 2.0.0
3
+ Version: 3.0.0
4
4
  Summary: Haystack 2.x DocumentStore for VelesDB: The Local AI Memory Database.
5
5
  Author-email: VelesDB Team <contact@wiscale.fr>
6
6
  License: MIT
@@ -43,6 +43,78 @@ class DuplicateDocumentError(Exception):
43
43
  # ---------------------------------------------------------------------------
44
44
 
45
45
 
46
+ class _FakeFusionStrategy:
47
+ """Minimal stand-in for velesdb.FusionStrategy.
48
+
49
+ Records the strategy name so a fake ``multi_query_search`` can vary its
50
+ result ordering by strategy (mirrors the real binding, where different
51
+ strategies produce different fused scores).
52
+ """
53
+
54
+ def __init__(self, name: str, params: Optional[dict] = None) -> None:
55
+ self.name = name
56
+ self.params = params or {}
57
+
58
+ @staticmethod
59
+ def average() -> "_FakeFusionStrategy":
60
+ return _FakeFusionStrategy("average")
61
+
62
+ @staticmethod
63
+ def maximum() -> "_FakeFusionStrategy":
64
+ return _FakeFusionStrategy("maximum")
65
+
66
+ @staticmethod
67
+ def rrf(k: int = 60) -> "_FakeFusionStrategy":
68
+ return _FakeFusionStrategy("rrf", {"k": k})
69
+
70
+ @staticmethod
71
+ def weighted(
72
+ avg_weight: float = 0.6,
73
+ max_weight: float = 0.3,
74
+ hit_weight: float = 0.1,
75
+ ) -> "_FakeFusionStrategy":
76
+ return _FakeFusionStrategy(
77
+ "weighted",
78
+ {
79
+ "avg_weight": avg_weight,
80
+ "max_weight": max_weight,
81
+ "hit_weight": hit_weight,
82
+ },
83
+ )
84
+
85
+ @staticmethod
86
+ def relative_score(
87
+ dense_weight: float, sparse_weight: float
88
+ ) -> "_FakeFusionStrategy":
89
+ return _FakeFusionStrategy(
90
+ "relative_score",
91
+ {"dense_weight": dense_weight, "sparse_weight": sparse_weight},
92
+ )
93
+
94
+
95
+ def _build_fake_fusion(
96
+ fusion: str, fusion_params: Optional[dict] = None
97
+ ) -> _FakeFusionStrategy:
98
+ """Stand-in for velesdb_common.fusion.build_fusion_strategy.
99
+
100
+ Maps the strategy name to the matching FusionStrategy factory so the
101
+ document store's fusion routing can be exercised without the real
102
+ velesdb_common package.
103
+ """
104
+ params = fusion_params or {}
105
+ if fusion in ("relative_score", "rsf"):
106
+ return _FakeFusionStrategy.relative_score(
107
+ params.get("dense_weight", 0.5), params.get("sparse_weight", 0.5)
108
+ )
109
+ if fusion == "weighted":
110
+ return _FakeFusionStrategy.weighted()
111
+ if fusion == "maximum":
112
+ return _FakeFusionStrategy.maximum()
113
+ if fusion == "average":
114
+ return _FakeFusionStrategy.average()
115
+ return _FakeFusionStrategy.rrf(params.get("k", 60))
116
+
117
+
46
118
  class _FakeSearchOptions:
47
119
  """Minimal stand-in for velesdb.SearchOptions used by search_request."""
48
120
 
@@ -95,6 +167,33 @@ class _FakeCollection:
95
167
  """Canonical search entry point — delegate to the legacy `search`."""
96
168
  return self.search(opts.vector, top_k=opts.top_k, filter=opts.filter)
97
169
 
170
+ def multi_query_search(
171
+ self,
172
+ vectors: list,
173
+ top_k: int = 10,
174
+ fusion: Any = None,
175
+ filter: Any = None, # pylint: disable=redefined-builtin
176
+ ) -> list:
177
+ """Fused multi-query search whose ordering depends on the strategy.
178
+
179
+ The real binding produces strategy-dependent fused scores. This fake
180
+ reproduces that observable behaviour: the points are sorted by a
181
+ per-strategy key so callers can assert that ``fusion='rsf'`` and
182
+ ``fusion='weighted'`` yield different orderings.
183
+ """
184
+ del vectors, filter # the fake ignores these
185
+ points = list(self._points.values())
186
+ name = getattr(fusion, "name", "rrf")
187
+ # Reverse the order for relative_score so the resulting ranking
188
+ # differs from the default (rrf) and from weighted.
189
+ reverse = name in ("relative_score", "rsf")
190
+ ordered = points[::-1] if reverse else points
191
+ results = [
192
+ {"id": p["id"], "score": 0.9, "payload": p.get("payload", {})}
193
+ for p in ordered[:top_k]
194
+ ]
195
+ return results
196
+
98
197
  def scroll( # pylint: disable=redefined-builtin
99
198
  self,
100
199
  *,
@@ -172,7 +271,9 @@ def _load_module() -> types.ModuleType:
172
271
  sys.modules["haystack.document_stores.errors"] = errors_mod
173
272
 
174
273
  sys.modules["velesdb"] = types.SimpleNamespace( # type: ignore
175
- Database=_FakeDatabase, SearchOptions=_FakeSearchOptions
274
+ Database=_FakeDatabase,
275
+ SearchOptions=_FakeSearchOptions,
276
+ FusionStrategy=_FakeFusionStrategy,
176
277
  )
177
278
 
178
279
  # Stub velesdb_common.security with no-op validators (real package has its own tests).
@@ -181,13 +282,29 @@ def _load_module() -> types.ModuleType:
181
282
 
182
283
  vc_mod = types.ModuleType("velesdb_common")
183
284
  sys.modules["velesdb_common"] = vc_mod
285
+
286
+ # Load the REAL velesdb_common.ids (pure stdlib) so the store exercises the
287
+ # canonical stable_hash_id rather than a forked copy (single-source-of-truth
288
+ # + license hygiene — see docs/planning/CORE_PARITY_REMEDIATION.md T3).
289
+ _ids_path = Path(__file__).resolve().parents[2] / "common" / "src" / "velesdb_common" / "ids.py"
290
+ _ids_spec = importlib.util.spec_from_file_location("velesdb_common.ids", _ids_path)
291
+ assert _ids_spec and _ids_spec.loader
292
+ vc_ids = importlib.util.module_from_spec(_ids_spec)
293
+ sys.modules["velesdb_common.ids"] = vc_ids
294
+ _ids_spec.loader.exec_module(vc_ids)
295
+
184
296
  vc_sec = types.ModuleType("velesdb_common.security")
185
297
  vc_sec.validate_path = _passthrough # type: ignore[attr-defined]
186
298
  vc_sec.validate_collection_name = _passthrough # type: ignore[attr-defined]
187
299
  vc_sec.validate_metric = _passthrough # type: ignore[attr-defined]
300
+ vc_sec.validate_named_sparse_vector = _passthrough # type: ignore[attr-defined]
188
301
  vc_sec.SecurityError = ValueError # type: ignore[attr-defined]
189
302
  sys.modules["velesdb_common.security"] = vc_sec
190
303
 
304
+ vc_fusion = types.ModuleType("velesdb_common.fusion")
305
+ vc_fusion.build_fusion_strategy = _build_fake_fusion # type: ignore[attr-defined]
306
+ sys.modules["velesdb_common.fusion"] = vc_fusion
307
+
191
308
  pkg = types.ModuleType("haystack_velesdb")
192
309
  pkg.__path__ = [str(root)] # type: ignore[attr-defined]
193
310
  sys.modules["haystack_velesdb"] = pkg
@@ -782,3 +899,169 @@ def test_embedding_retrieval_translates_haystack_filter_to_veles_shape() -> None
782
899
  }, "embedding_retrieval must translate Haystack filter to VelesDB Filter shape"
783
900
  finally:
784
901
  _MOD.velesdb = original_velesdb
902
+
903
+
904
+ # ---------------------------------------------------------------------------
905
+ # I1: fusion (RSF / Weighted) on embedding_retrieval
906
+ # ---------------------------------------------------------------------------
907
+
908
+
909
+ def _store_with_three_docs(name: str) -> Any:
910
+ store = _MOD.VelesDBDocumentStore(path="/tmp/hs", collection_name=name)
911
+ store.write_documents([
912
+ Document(id="d1", content="one", embedding=[0.1]),
913
+ Document(id="d2", content="two", embedding=[0.2]),
914
+ Document(id="d3", content="three", embedding=[0.3]),
915
+ ])
916
+ return store
917
+
918
+
919
+ def test_embedding_retrieval_fusion_changes_ordering_vs_default() -> None:
920
+ """fusion='rsf' must reorder results relative to the default ranking."""
921
+ store = _store_with_three_docs("t_fusion_rsf")
922
+ default_ids = [d.id for d in store.embedding_retrieval([0.1], top_k=3)]
923
+ rsf_ids = [
924
+ d.id for d in store.embedding_retrieval([0.1], top_k=3, fusion="rsf")
925
+ ]
926
+ assert rsf_ids != default_ids, "fusion='rsf' must change result ordering"
927
+ assert sorted(rsf_ids) == sorted(default_ids), "same doc set, different order"
928
+
929
+
930
+ def test_embedding_retrieval_rsf_and_weighted_differ() -> None:
931
+ """rsf and weighted fusion must produce different orderings."""
932
+ store = _store_with_three_docs("t_fusion_pair")
933
+ rsf_ids = [
934
+ d.id for d in store.embedding_retrieval([0.1], top_k=3, fusion="rsf")
935
+ ]
936
+ weighted_ids = [
937
+ d.id
938
+ for d in store.embedding_retrieval([0.1], top_k=3, fusion="weighted")
939
+ ]
940
+ assert rsf_ids != weighted_ids, "rsf and weighted must differ in ordering"
941
+
942
+
943
+ def test_embedding_retrieval_fusion_passes_params() -> None:
944
+ """fusion_params must reach build_fusion_strategy and the collection."""
945
+ captured: dict = {}
946
+
947
+ class _CapturingCollection(_FakeCollection):
948
+ def multi_query_search(
949
+ self,
950
+ vectors: list,
951
+ top_k: int = 10,
952
+ fusion: Any = None,
953
+ filter: Any = None, # pylint: disable=redefined-builtin
954
+ ) -> list:
955
+ captured["fusion_name"] = getattr(fusion, "name", None)
956
+ captured["fusion_params"] = getattr(fusion, "params", None)
957
+ return super().multi_query_search(
958
+ vectors, top_k=top_k, fusion=fusion, filter=filter
959
+ )
960
+
961
+ class _CapturingDatabase:
962
+ def __init__(self, path: str) -> None:
963
+ self._col = _CapturingCollection()
964
+
965
+ def get_collection(self, name: str) -> _CapturingCollection:
966
+ return self._col
967
+
968
+ def create_collection(
969
+ self, name: str, dimension: int, metric: str
970
+ ) -> _CapturingCollection:
971
+ return self._col
972
+
973
+ original_velesdb = _MOD.velesdb
974
+ try:
975
+ _MOD.velesdb = types.SimpleNamespace( # type: ignore
976
+ Database=_CapturingDatabase,
977
+ SearchOptions=_FakeSearchOptions,
978
+ FusionStrategy=_FakeFusionStrategy,
979
+ )
980
+ store = _MOD.VelesDBDocumentStore(
981
+ path="/tmp/hs", collection_name="t_fusion_params"
982
+ )
983
+ store.write_documents([Document(id="p", content="x", embedding=[0.5])])
984
+ store.embedding_retrieval(
985
+ [0.5],
986
+ top_k=3,
987
+ fusion="rsf",
988
+ fusion_params={"dense_weight": 0.7, "sparse_weight": 0.3},
989
+ )
990
+ assert captured["fusion_name"] == "relative_score"
991
+ assert captured["fusion_params"]["dense_weight"] == 0.7
992
+ finally:
993
+ _MOD.velesdb = original_velesdb
994
+
995
+
996
+ # ---------------------------------------------------------------------------
997
+ # I2: named-sparse-index creation on write_documents
998
+ # ---------------------------------------------------------------------------
999
+
1000
+
1001
+ def test_write_documents_forwards_named_sparse_vectors() -> None:
1002
+ """A named sparse vector dict must reach the upserted point so the
1003
+ underlying named sparse index is created.
1004
+ """
1005
+ captured: dict = {}
1006
+
1007
+ class _CapturingCollection(_FakeCollection):
1008
+ def upsert(self, points: list) -> int:
1009
+ captured["points"] = points
1010
+ return super().upsert(points)
1011
+
1012
+ class _CapturingDatabase:
1013
+ def __init__(self, path: str) -> None:
1014
+ self._col = _CapturingCollection()
1015
+
1016
+ def get_collection(self, name: str) -> _CapturingCollection:
1017
+ return self._col
1018
+
1019
+ def create_collection(
1020
+ self, name: str, dimension: int, metric: str
1021
+ ) -> _CapturingCollection:
1022
+ return self._col
1023
+
1024
+ original_velesdb = _MOD.velesdb
1025
+ try:
1026
+ _MOD.velesdb = types.SimpleNamespace( # type: ignore
1027
+ Database=_CapturingDatabase,
1028
+ SearchOptions=_FakeSearchOptions,
1029
+ FusionStrategy=_FakeFusionStrategy,
1030
+ )
1031
+ store = _MOD.VelesDBDocumentStore(
1032
+ path="/tmp/hs", collection_name="t_named_sparse"
1033
+ )
1034
+ store.write_documents(
1035
+ [Document(id="s1", content="hi", embedding=[0.5])],
1036
+ sparse_vectors=[{"bge_m3": {0: 1.5, 7: 0.8}}],
1037
+ )
1038
+ point = captured["points"][0]
1039
+ assert point["sparse_vector"] == {"bge_m3": {0: 1.5, 7: 0.8}}
1040
+ finally:
1041
+ _MOD.velesdb = original_velesdb
1042
+
1043
+
1044
+ def test_id_hashing_uses_canonical_stable_hash_id():
1045
+ """T3: the store delegates string->int ID hashing to the shared
1046
+ velesdb_common.ids.stable_hash_id, not a forked local copy. This keeps the
1047
+ same logical document mapped to the same VelesDB point ID across every
1048
+ integration (single source of truth) and avoids re-implementing the hash in
1049
+ an MIT package (license hygiene). See docs/planning/CORE_PARITY_REMEDIATION.md.
1050
+ """
1051
+ import hashlib
1052
+
1053
+ import velesdb_common.ids as canonical_ids
1054
+
1055
+ # the module imported the canonical helper, and the forked copy is gone
1056
+ assert _MOD.stable_hash_id is canonical_ids.stable_hash_id
1057
+ assert not hasattr(_MOD, "_str_id_to_int")
1058
+ assert not hasattr(_MOD, "_INT63_MASK")
1059
+
1060
+ # and it yields the canonical positive-63-bit value
1061
+ for doc_id in ["", "doc-1", "héllo-世界", "Document_42::chunk#3", "a" * 500]:
1062
+ expected = (
1063
+ int.from_bytes(hashlib.sha256(doc_id.encode("utf-8")).digest()[:8], "big")
1064
+ & 0x7FFFFFFFFFFFFFFF
1065
+ )
1066
+ assert _MOD.stable_hash_id(doc_id) == expected
1067
+ assert 0 <= _MOD.stable_hash_id(doc_id) <= 0x7FFFFFFFFFFFFFFF