sie-haystack 0.1.9__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/.gitignore +3 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/PKG-INFO +22 -2
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/README.md +21 -1
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/pyproject.toml +2 -2
- sie_haystack-0.2.0/src/haystack_integrations/components/embedders/sie/__init__.py +25 -0
- sie_haystack-0.2.0/src/haystack_integrations/components/extractors/sie/__init__.py +11 -0
- sie_haystack-0.2.0/src/haystack_integrations/components/rankers/sie/__init__.py +5 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/src/sie_haystack/__init__.py +15 -1
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/src/sie_haystack/embedders.py +263 -63
- sie_haystack-0.2.0/src/sie_haystack/extractors.py +255 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/conftest.py +24 -2
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/test_embedders.py +202 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/test_extractors.py +8 -2
- sie_haystack-0.2.0/tests/test_namespace_aliases.py +73 -0
- sie_haystack-0.1.9/src/sie_haystack/extractors.py +0 -144
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/src/sie_haystack/rankers.py +0 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/__init__.py +0 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/test_integration.py +0 -0
- {sie_haystack-0.1.9 → sie_haystack-0.2.0}/tests/test_rankers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sie-haystack
|
|
3
|
-
Version: 0.1.9
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: SIE integration for Haystack
|
|
5
5
|
Author-email: Superlinked <dev@superlinked.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -30,11 +30,31 @@ SIE integration for Haystack.
|
|
|
30
30
|
pip install sie-haystack
|
|
31
31
|
```
|
|
32
32
|
|
|
33
|
+
## Imports
|
|
34
|
+
|
|
35
|
+
Preferred import paths follow Haystack's namespace convention:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from haystack_integrations.components.embedders.sie import (
|
|
39
|
+
SIEDocumentEmbedder,
|
|
40
|
+
SIETextEmbedder,
|
|
41
|
+
)
|
|
42
|
+
from haystack_integrations.components.rankers.sie import SIERanker
|
|
43
|
+
from haystack_integrations.components.extractors.sie import SIEExtractor
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
The legacy flat imports remain supported for compatibility:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from sie_haystack import SIEDocumentEmbedder, SIEExtractor, SIERanker, SIETextEmbedder
|
|
50
|
+
```
|
|
51
|
+
|
|
33
52
|
## Usage
|
|
34
53
|
|
|
35
54
|
```python
|
|
36
55
|
from haystack import Document
|
|
37
|
-
from sie_haystack import SIEDocumentEmbedder, SIERanker, SIETextEmbedder
|
|
56
|
+
from haystack_integrations.components.embedders.sie import SIEDocumentEmbedder, SIETextEmbedder
|
|
57
|
+
from haystack_integrations.components.rankers.sie import SIERanker
|
|
38
58
|
|
|
39
59
|
# Embed a query
|
|
40
60
|
text_embedder = SIETextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
|
|
@@ -8,11 +8,31 @@ SIE integration for Haystack.
|
|
|
8
8
|
pip install sie-haystack
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
+
## Imports
|
|
12
|
+
|
|
13
|
+
Preferred import paths follow Haystack's namespace convention:
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from haystack_integrations.components.embedders.sie import (
|
|
17
|
+
SIEDocumentEmbedder,
|
|
18
|
+
SIETextEmbedder,
|
|
19
|
+
)
|
|
20
|
+
from haystack_integrations.components.rankers.sie import SIERanker
|
|
21
|
+
from haystack_integrations.components.extractors.sie import SIEExtractor
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
The legacy flat imports remain supported for compatibility:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from sie_haystack import SIEDocumentEmbedder, SIEExtractor, SIERanker, SIETextEmbedder
|
|
28
|
+
```
|
|
29
|
+
|
|
11
30
|
## Usage
|
|
12
31
|
|
|
13
32
|
```python
|
|
14
33
|
from haystack import Document
|
|
15
|
-
from sie_haystack import SIEDocumentEmbedder, SIERanker, SIETextEmbedder
|
|
34
|
+
from haystack_integrations.components.embedders.sie import SIEDocumentEmbedder, SIETextEmbedder
|
|
35
|
+
from haystack_integrations.components.rankers.sie import SIERanker
|
|
16
36
|
|
|
17
37
|
# Embed a query
|
|
18
38
|
text_embedder = SIETextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "sie-haystack"
|
|
3
|
-
version = "0.1.9"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "SIE integration for Haystack"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -33,7 +33,7 @@ requires = ["hatchling"]
|
|
|
33
33
|
build-backend = "hatchling.build"
|
|
34
34
|
|
|
35
35
|
[tool.hatch.build.targets.wheel]
|
|
36
|
-
packages = ["src/sie_haystack"]
|
|
36
|
+
packages = ["src/sie_haystack", "src/haystack_integrations"]
|
|
37
37
|
|
|
38
38
|
[tool.pytest.ini_options]
|
|
39
39
|
asyncio_mode = "auto"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Haystack namespace exports for SIE embedders.
|
|
2
|
+
|
|
3
|
+
This mirrors Haystack's `haystack_integrations.components.*` convention
|
|
4
|
+
while keeping the existing `sie_haystack` imports available.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from sie_haystack.embedders import (
|
|
8
|
+
SIEDocumentEmbedder,
|
|
9
|
+
SIEImageEmbedder,
|
|
10
|
+
SIEMultivectorDocumentEmbedder,
|
|
11
|
+
SIEMultivectorTextEmbedder,
|
|
12
|
+
SIESparseDocumentEmbedder,
|
|
13
|
+
SIESparseTextEmbedder,
|
|
14
|
+
SIETextEmbedder,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"SIEDocumentEmbedder",
|
|
19
|
+
"SIEImageEmbedder",
|
|
20
|
+
"SIEMultivectorDocumentEmbedder",
|
|
21
|
+
"SIEMultivectorTextEmbedder",
|
|
22
|
+
"SIESparseDocumentEmbedder",
|
|
23
|
+
"SIESparseTextEmbedder",
|
|
24
|
+
"SIETextEmbedder",
|
|
25
|
+
]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Haystack namespace exports for SIE extractors."""
|
|
2
|
+
|
|
3
|
+
from sie_haystack.extractors import Classification, DetectedObject, Entity, Relation, SIEExtractor
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"Classification",
|
|
7
|
+
"DetectedObject",
|
|
8
|
+
"Entity",
|
|
9
|
+
"Relation",
|
|
10
|
+
"SIEExtractor",
|
|
11
|
+
]
|
|
@@ -10,6 +10,10 @@ Sparse Embedders (for hybrid search):
|
|
|
10
10
|
- SIESparseTextEmbedder: Sparse embeddings for queries
|
|
11
11
|
- SIESparseDocumentEmbedder: Sparse embeddings for documents
|
|
12
12
|
|
|
13
|
+
Multivector Embedders (ColBERT):
|
|
14
|
+
- SIEMultivectorTextEmbedder: Per-token embeddings for queries
|
|
15
|
+
- SIEMultivectorDocumentEmbedder: Per-token embeddings for documents
|
|
16
|
+
|
|
13
17
|
Rankers and Extractors:
|
|
14
18
|
- SIERanker: Reranks documents by relevance to a query
|
|
15
19
|
- SIEExtractor: Extracts entities from text
|
|
@@ -45,16 +49,26 @@ Hybrid search example:
|
|
|
45
49
|
|
|
46
50
|
from sie_haystack.embedders import (
|
|
47
51
|
SIEDocumentEmbedder,
|
|
52
|
+
SIEImageEmbedder,
|
|
53
|
+
SIEMultivectorDocumentEmbedder,
|
|
54
|
+
SIEMultivectorTextEmbedder,
|
|
48
55
|
SIESparseDocumentEmbedder,
|
|
49
56
|
SIESparseTextEmbedder,
|
|
50
57
|
SIETextEmbedder,
|
|
51
58
|
)
|
|
52
|
-
from sie_haystack.extractors import SIEExtractor
|
|
59
|
+
from sie_haystack.extractors import Classification, DetectedObject, Entity, Relation, SIEExtractor
|
|
53
60
|
from sie_haystack.rankers import SIERanker
|
|
54
61
|
|
|
55
62
|
__all__ = [
|
|
63
|
+
"Classification",
|
|
64
|
+
"DetectedObject",
|
|
65
|
+
"Entity",
|
|
66
|
+
"Relation",
|
|
56
67
|
"SIEDocumentEmbedder",
|
|
57
68
|
"SIEExtractor",
|
|
69
|
+
"SIEImageEmbedder",
|
|
70
|
+
"SIEMultivectorDocumentEmbedder",
|
|
71
|
+
"SIEMultivectorTextEmbedder",
|
|
58
72
|
"SIERanker",
|
|
59
73
|
"SIESparseDocumentEmbedder",
|
|
60
74
|
"SIESparseTextEmbedder",
|
|
@@ -5,6 +5,8 @@ Provides embedder components following Haystack's conventions:
|
|
|
5
5
|
- SIEDocumentEmbedder: For embedding documents - dense embeddings
|
|
6
6
|
- SIESparseTextEmbedder: For sparse embeddings of queries (hybrid search)
|
|
7
7
|
- SIESparseDocumentEmbedder: For sparse embeddings of documents (hybrid search)
|
|
8
|
+
- SIEMultivectorTextEmbedder: For multivector (ColBERT) embeddings of queries
|
|
9
|
+
- SIEMultivectorDocumentEmbedder: For multivector (ColBERT) embeddings of documents
|
|
8
10
|
"""
|
|
9
11
|
|
|
10
12
|
from __future__ import annotations
|
|
@@ -12,6 +14,9 @@ from __future__ import annotations
|
|
|
12
14
|
from typing import Any
|
|
13
15
|
|
|
14
16
|
from haystack import Document, component
|
|
17
|
+
from sie_sdk import SIEClient
|
|
18
|
+
from sie_sdk.encoding import dense_embedding, multivector_embedding, sparse_embedding
|
|
19
|
+
from sie_sdk.types import Item
|
|
15
20
|
|
|
16
21
|
|
|
17
22
|
@component
|
|
@@ -55,8 +60,6 @@ class SIETextEmbedder:
|
|
|
55
60
|
def client(self) -> Any:
|
|
56
61
|
"""Lazily initialize the SIE client."""
|
|
57
62
|
if self._client is None:
|
|
58
|
-
from sie_sdk import SIEClient
|
|
59
|
-
|
|
60
63
|
self._client = SIEClient(
|
|
61
64
|
self._base_url,
|
|
62
65
|
timeout_s=self._timeout_s,
|
|
@@ -79,24 +82,13 @@ class SIETextEmbedder:
|
|
|
79
82
|
Returns:
|
|
80
83
|
Dictionary with "embedding" key containing the embedding vector.
|
|
81
84
|
"""
|
|
82
|
-
from sie_sdk.types import Item
|
|
83
|
-
|
|
84
85
|
result = self.client.encode(
|
|
85
86
|
self._model,
|
|
86
87
|
Item(text=text),
|
|
87
88
|
output_types=["dense"],
|
|
88
89
|
options={"is_query": True},
|
|
89
90
|
)
|
|
90
|
-
embedding = self._extract_dense(result)
|
|
91
|
-
return {"embedding": embedding}
|
|
92
|
-
|
|
93
|
-
def _extract_dense(self, result: Any) -> list[float]:
|
|
94
|
-
"""Extract dense embedding from SDK result."""
|
|
95
|
-
# SDK returns {"dense": np.ndarray, ...}
|
|
96
|
-
dense = result.get("dense") if isinstance(result, dict) else getattr(result, "dense", None)
|
|
97
|
-
if dense is None:
|
|
98
|
-
return []
|
|
99
|
-
return dense.tolist() if hasattr(dense, "tolist") else list(dense)
|
|
91
|
+
return {"embedding": dense_embedding(result)}
|
|
100
92
|
|
|
101
93
|
|
|
102
94
|
@component
|
|
@@ -146,8 +138,6 @@ class SIEDocumentEmbedder:
|
|
|
146
138
|
def client(self) -> Any:
|
|
147
139
|
"""Lazily initialize the SIE client."""
|
|
148
140
|
if self._client is None:
|
|
149
|
-
from sie_sdk import SIEClient
|
|
150
|
-
|
|
151
141
|
self._client = SIEClient(
|
|
152
142
|
self._base_url,
|
|
153
143
|
timeout_s=self._timeout_s,
|
|
@@ -173,8 +163,6 @@ class SIEDocumentEmbedder:
|
|
|
173
163
|
if not documents:
|
|
174
164
|
return {"documents": []}
|
|
175
165
|
|
|
176
|
-
from sie_sdk.types import Item
|
|
177
|
-
|
|
178
166
|
# Build text to embed for each document
|
|
179
167
|
texts = [self._build_text(doc) for doc in documents]
|
|
180
168
|
items = [Item(text=text) for text in texts]
|
|
@@ -188,7 +176,7 @@ class SIEDocumentEmbedder:
|
|
|
188
176
|
|
|
189
177
|
# Store embeddings on documents
|
|
190
178
|
for doc, result in zip(documents, results, strict=True):
|
|
191
|
-
doc.embedding = self._extract_dense(result)
|
|
179
|
+
doc.embedding = dense_embedding(result)
|
|
192
180
|
|
|
193
181
|
return {"documents": documents}
|
|
194
182
|
|
|
@@ -201,14 +189,6 @@ class SIEDocumentEmbedder:
|
|
|
201
189
|
parts.append(doc.content or "")
|
|
202
190
|
return " ".join(parts)
|
|
203
191
|
|
|
204
|
-
def _extract_dense(self, result: Any) -> list[float]:
|
|
205
|
-
"""Extract dense embedding from SDK result."""
|
|
206
|
-
# SDK returns {"dense": np.ndarray, ...}
|
|
207
|
-
dense = result.get("dense") if isinstance(result, dict) else getattr(result, "dense", None)
|
|
208
|
-
if dense is None:
|
|
209
|
-
return []
|
|
210
|
-
return dense.tolist() if hasattr(dense, "tolist") else list(dense)
|
|
211
|
-
|
|
212
192
|
|
|
213
193
|
@component
|
|
214
194
|
class SIESparseTextEmbedder:
|
|
@@ -252,8 +232,6 @@ class SIESparseTextEmbedder:
|
|
|
252
232
|
def client(self) -> Any:
|
|
253
233
|
"""Lazily initialize the SIE client."""
|
|
254
234
|
if self._client is None:
|
|
255
|
-
from sie_sdk import SIEClient
|
|
256
|
-
|
|
257
235
|
self._client = SIEClient(
|
|
258
236
|
self._base_url,
|
|
259
237
|
timeout_s=self._timeout_s,
|
|
@@ -276,29 +254,13 @@ class SIESparseTextEmbedder:
|
|
|
276
254
|
Returns:
|
|
277
255
|
Dictionary with "sparse_embedding" key containing dict with "indices" and "values" lists.
|
|
278
256
|
"""
|
|
279
|
-
from sie_sdk.types import Item
|
|
280
|
-
|
|
281
257
|
result = self.client.encode(
|
|
282
258
|
self._model,
|
|
283
259
|
Item(text=text),
|
|
284
260
|
output_types=["sparse"],
|
|
285
261
|
options={"is_query": True},
|
|
286
262
|
)
|
|
287
|
-
sparse_embedding = self._extract_sparse(result)
|
|
288
|
-
return {"sparse_embedding": sparse_embedding}
|
|
289
|
-
|
|
290
|
-
def _extract_sparse(self, result: Any) -> dict[str, list]:
|
|
291
|
-
"""Extract sparse embedding from SDK result."""
|
|
292
|
-
# SDK returns {"sparse": {"indices": np.ndarray, "values": np.ndarray}, ...}
|
|
293
|
-
sparse = result.get("sparse") if isinstance(result, dict) else getattr(result, "sparse", None)
|
|
294
|
-
if sparse is None:
|
|
295
|
-
return {"indices": [], "values": []}
|
|
296
|
-
indices = sparse.get("indices") if isinstance(sparse, dict) else getattr(sparse, "indices", None)
|
|
297
|
-
values = sparse.get("values") if isinstance(sparse, dict) else getattr(sparse, "values", None)
|
|
298
|
-
return {
|
|
299
|
-
"indices": indices.tolist() if hasattr(indices, "tolist") else list(indices or []),
|
|
300
|
-
"values": values.tolist() if hasattr(values, "tolist") else list(values or []),
|
|
301
|
-
}
|
|
263
|
+
return {"sparse_embedding": sparse_embedding(result)}
|
|
302
264
|
|
|
303
265
|
|
|
304
266
|
@component
|
|
@@ -349,8 +311,6 @@ class SIESparseDocumentEmbedder:
|
|
|
349
311
|
def client(self) -> Any:
|
|
350
312
|
"""Lazily initialize the SIE client."""
|
|
351
313
|
if self._client is None:
|
|
352
|
-
from sie_sdk import SIEClient
|
|
353
|
-
|
|
354
314
|
self._client = SIEClient(
|
|
355
315
|
self._base_url,
|
|
356
316
|
timeout_s=self._timeout_s,
|
|
@@ -376,8 +336,6 @@ class SIESparseDocumentEmbedder:
|
|
|
376
336
|
if not documents:
|
|
377
337
|
return {"documents": []}
|
|
378
338
|
|
|
379
|
-
from sie_sdk.types import Item
|
|
380
|
-
|
|
381
339
|
# Build text to embed for each document
|
|
382
340
|
texts = [self._build_text(doc) for doc in documents]
|
|
383
341
|
items = [Item(text=text) for text in texts]
|
|
@@ -391,7 +349,7 @@ class SIESparseDocumentEmbedder:
|
|
|
391
349
|
|
|
392
350
|
# Store sparse embeddings on documents in meta
|
|
393
351
|
for doc, result in zip(documents, results, strict=True):
|
|
394
|
-
doc.meta["_sparse_embedding"] = self._extract_sparse(result)
|
|
352
|
+
doc.meta["_sparse_embedding"] = sparse_embedding(result)
|
|
395
353
|
|
|
396
354
|
return {"documents": documents}
|
|
397
355
|
|
|
@@ -404,15 +362,257 @@ class SIESparseDocumentEmbedder:
|
|
|
404
362
|
parts.append(doc.content or "")
|
|
405
363
|
return " ".join(parts)
|
|
406
364
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
365
|
+
|
|
366
|
+
@component
|
|
367
|
+
class SIEImageEmbedder:
|
|
368
|
+
"""Embeds images using SIE multimodal models (CLIP, SigLIP, ColPali).
|
|
369
|
+
|
|
370
|
+
Use this component in Haystack pipelines for image embedding with models
|
|
371
|
+
that support image input.
|
|
372
|
+
|
|
373
|
+
Example:
|
|
374
|
+
>>> embedder = SIEImageEmbedder(
|
|
375
|
+
... base_url="http://localhost:8080",
|
|
376
|
+
... model="openai/clip-vit-large-patch14",
|
|
377
|
+
... )
|
|
378
|
+
>>> result = embedder.run(images=["/path/to/photo.jpg"])
|
|
379
|
+
>>> embeddings = result["embeddings"] # list[list[float]]
|
|
380
|
+
|
|
381
|
+
Args:
|
|
382
|
+
base_url: URL of the SIE server.
|
|
383
|
+
model: Model name to use for encoding. Must support image input.
|
|
384
|
+
gpu: GPU type to use (e.g., "l4", "a100"). Passed to SDK as default.
|
|
385
|
+
options: Model-specific options. Passed to SDK as default.
|
|
386
|
+
timeout_s: Request timeout in seconds.
|
|
387
|
+
"""
|
|
388
|
+
|
|
389
|
+
def __init__(
|
|
390
|
+
self,
|
|
391
|
+
base_url: str = "http://localhost:8080",
|
|
392
|
+
model: str = "openai/clip-vit-large-patch14",
|
|
393
|
+
*,
|
|
394
|
+
gpu: str | None = None,
|
|
395
|
+
options: dict[str, Any] | None = None,
|
|
396
|
+
timeout_s: float = 180.0,
|
|
397
|
+
) -> None:
|
|
398
|
+
self._base_url = base_url
|
|
399
|
+
self._model = model
|
|
400
|
+
self._gpu = gpu
|
|
401
|
+
self._options = options
|
|
402
|
+
self._timeout_s = timeout_s
|
|
403
|
+
self._client: Any = None
|
|
404
|
+
|
|
405
|
+
@property
|
|
406
|
+
def client(self) -> Any:
|
|
407
|
+
"""Lazily initialize the SIE client."""
|
|
408
|
+
if self._client is None:
|
|
409
|
+
self._client = SIEClient(
|
|
410
|
+
self._base_url,
|
|
411
|
+
timeout_s=self._timeout_s,
|
|
412
|
+
gpu=self._gpu,
|
|
413
|
+
options=self._options,
|
|
414
|
+
)
|
|
415
|
+
return self._client
|
|
416
|
+
|
|
417
|
+
def warm_up(self) -> None:
|
|
418
|
+
"""Warm up the component by initializing the client."""
|
|
419
|
+
_ = self.client
|
|
420
|
+
|
|
421
|
+
@component.output_types(embeddings=list[list[float]])
|
|
422
|
+
def run(self, images: list[str | bytes]) -> dict[str, list[list[float]]]:
|
|
423
|
+
"""Embed images and return their embeddings.
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
images: List of image file paths (str) or raw image bytes.
|
|
427
|
+
|
|
428
|
+
Returns:
|
|
429
|
+
Dictionary with "embeddings" key containing list of embedding vectors.
|
|
430
|
+
"""
|
|
431
|
+
if not images:
|
|
432
|
+
return {"embeddings": []}
|
|
433
|
+
|
|
434
|
+
items = [Item(images=[img]) for img in images]
|
|
435
|
+
|
|
436
|
+
results = self.client.encode(
|
|
437
|
+
self._model,
|
|
438
|
+
items,
|
|
439
|
+
output_types=["dense"],
|
|
440
|
+
)
|
|
441
|
+
|
|
442
|
+
return {"embeddings": [dense_embedding(result) for result in results]}
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
@component
|
|
446
|
+
class SIEMultivectorTextEmbedder:
|
|
447
|
+
"""Embeds a single text string using SIE multivector (ColBERT) embeddings.
|
|
448
|
+
|
|
449
|
+
Produces per-token embeddings for late-interaction retrieval and scoring.
|
|
450
|
+
Use this component for embedding queries in ColBERT pipelines.
|
|
451
|
+
|
|
452
|
+
Example:
|
|
453
|
+
>>> embedder = SIEMultivectorTextEmbedder(
|
|
454
|
+
... base_url="http://localhost:8080",
|
|
455
|
+
... model="jinaai/jina-colbert-v2",
|
|
456
|
+
... )
|
|
457
|
+
>>> result = embedder.run(text="What is vector search?")
|
|
458
|
+
>>> multivector = result["multivector_embedding"] # list[list[float]]
|
|
459
|
+
"""
|
|
460
|
+
|
|
461
|
+
def __init__(
|
|
462
|
+
self,
|
|
463
|
+
base_url: str = "http://localhost:8080",
|
|
464
|
+
model: str = "jinaai/jina-colbert-v2",
|
|
465
|
+
*,
|
|
466
|
+
gpu: str | None = None,
|
|
467
|
+
options: dict[str, Any] | None = None,
|
|
468
|
+
timeout_s: float = 180.0,
|
|
469
|
+
) -> None:
|
|
470
|
+
"""Initialize the multivector text embedder.
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
base_url: URL of the SIE server.
|
|
474
|
+
model: Model name to use for encoding. Must support multivector output
|
|
475
|
+
(e.g., jinaai/jina-colbert-v2).
|
|
476
|
+
gpu: GPU type to use (e.g., "l4", "a100"). Passed to SDK as default.
|
|
477
|
+
options: Model-specific options. Passed to SDK as default.
|
|
478
|
+
timeout_s: Request timeout in seconds.
|
|
479
|
+
"""
|
|
480
|
+
self._base_url = base_url
|
|
481
|
+
self._model = model
|
|
482
|
+
self._gpu = gpu
|
|
483
|
+
self._options = options
|
|
484
|
+
self._timeout_s = timeout_s
|
|
485
|
+
self._client: Any = None
|
|
486
|
+
|
|
487
|
+
@property
|
|
488
|
+
def client(self) -> Any:
|
|
489
|
+
"""Lazily initialize the SIE client."""
|
|
490
|
+
if self._client is None:
|
|
491
|
+
self._client = SIEClient(
|
|
492
|
+
self._base_url,
|
|
493
|
+
timeout_s=self._timeout_s,
|
|
494
|
+
gpu=self._gpu,
|
|
495
|
+
options=self._options,
|
|
496
|
+
)
|
|
497
|
+
return self._client
|
|
498
|
+
|
|
499
|
+
def warm_up(self) -> None:
|
|
500
|
+
"""Warm up the component by initializing the client."""
|
|
501
|
+
_ = self.client
|
|
502
|
+
|
|
503
|
+
@component.output_types(multivector_embedding=list)
|
|
504
|
+
def run(self, text: str) -> dict[str, list[list[float]]]:
|
|
505
|
+
"""Embed a single text string with multivector (ColBERT) embeddings.
|
|
506
|
+
|
|
507
|
+
Args:
|
|
508
|
+
text: The text to embed.
|
|
509
|
+
|
|
510
|
+
Returns:
|
|
511
|
+
Dictionary with "multivector_embedding" key containing per-token embeddings.
|
|
512
|
+
"""
|
|
513
|
+
result = self.client.encode(
|
|
514
|
+
self._model,
|
|
515
|
+
Item(text=text),
|
|
516
|
+
output_types=["multivector"],
|
|
517
|
+
options={"is_query": True},
|
|
518
|
+
)
|
|
519
|
+
return {"multivector_embedding": multivector_embedding(result["multivector"])}
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
@component
|
|
523
|
+
class SIEMultivectorDocumentEmbedder:
|
|
524
|
+
"""Embeds documents using SIE multivector (ColBERT) embeddings.
|
|
525
|
+
|
|
526
|
+
Produces per-token embeddings for late-interaction retrieval. Stores
|
|
527
|
+
multivector embeddings on each document's metadata.
|
|
528
|
+
|
|
529
|
+
Example:
|
|
530
|
+
>>> from haystack import Document
|
|
531
|
+
>>> embedder = SIEMultivectorDocumentEmbedder(
|
|
532
|
+
... base_url="http://localhost:8080",
|
|
533
|
+
... model="jinaai/jina-colbert-v2",
|
|
534
|
+
... )
|
|
535
|
+
>>> docs = [Document(content="Python is a programming language.")]
|
|
536
|
+
>>> result = embedder.run(documents=docs)
|
|
537
|
+
>>> embedded_docs = result["documents"]
|
|
538
|
+
>>> print(embedded_docs[0].meta["_multivector_embedding"]) # list[list[float]]
|
|
539
|
+
"""
|
|
540
|
+
|
|
541
|
+
def __init__(
|
|
542
|
+
self,
|
|
543
|
+
base_url: str = "http://localhost:8080",
|
|
544
|
+
model: str = "jinaai/jina-colbert-v2",
|
|
545
|
+
*,
|
|
546
|
+
gpu: str | None = None,
|
|
547
|
+
options: dict[str, Any] | None = None,
|
|
548
|
+
timeout_s: float = 180.0,
|
|
549
|
+
meta_fields_to_embed: list[str] | None = None,
|
|
550
|
+
) -> None:
|
|
551
|
+
"""Initialize the multivector document embedder.
|
|
552
|
+
|
|
553
|
+
Args:
|
|
554
|
+
base_url: URL of the SIE server.
|
|
555
|
+
model: Model name to use for encoding. Must support multivector output
|
|
556
|
+
(e.g., jinaai/jina-colbert-v2).
|
|
557
|
+
gpu: GPU type to use (e.g., "l4", "a100"). Passed to SDK as default.
|
|
558
|
+
options: Model-specific options. Passed to SDK as default.
|
|
559
|
+
timeout_s: Request timeout in seconds.
|
|
560
|
+
meta_fields_to_embed: List of metadata fields to include in embedding.
|
|
561
|
+
"""
|
|
562
|
+
self._base_url = base_url
|
|
563
|
+
self._model = model
|
|
564
|
+
self._gpu = gpu
|
|
565
|
+
self._options = options
|
|
566
|
+
self._timeout_s = timeout_s
|
|
567
|
+
self._meta_fields_to_embed = meta_fields_to_embed or []
|
|
568
|
+
self._client: Any = None
|
|
569
|
+
|
|
570
|
+
@property
|
|
571
|
+
def client(self) -> Any:
|
|
572
|
+
"""Lazily initialize the SIE client."""
|
|
573
|
+
if self._client is None:
|
|
574
|
+
self._client = SIEClient(
|
|
575
|
+
self._base_url,
|
|
576
|
+
timeout_s=self._timeout_s,
|
|
577
|
+
gpu=self._gpu,
|
|
578
|
+
options=self._options,
|
|
579
|
+
)
|
|
580
|
+
return self._client
|
|
581
|
+
|
|
582
|
+
def warm_up(self) -> None:
|
|
583
|
+
"""Warm up the component by initializing the client."""
|
|
584
|
+
_ = self.client
|
|
585
|
+
|
|
586
|
+
@component.output_types(documents=list[Document])
|
|
587
|
+
def run(self, documents: list[Document]) -> dict[str, list[Document]]:
|
|
588
|
+
"""Embed documents with multivector (ColBERT) embeddings.
|
|
589
|
+
|
|
590
|
+
Args:
|
|
591
|
+
documents: List of documents to embed.
|
|
592
|
+
|
|
593
|
+
Returns:
|
|
594
|
+
Dictionary with "documents" key containing documents with multivector embeddings
|
|
595
|
+
stored in meta["_multivector_embedding"].
|
|
596
|
+
"""
|
|
597
|
+
if not documents:
|
|
598
|
+
return {"documents": []}
|
|
599
|
+
|
|
600
|
+
texts = [self._build_text(doc) for doc in documents]
|
|
601
|
+
items = [Item(text=text) for text in texts]
|
|
602
|
+
|
|
603
|
+
results = self.client.encode(
|
|
604
|
+
self._model,
|
|
605
|
+
items,
|
|
606
|
+
output_types=["multivector"],
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
for doc, result in zip(documents, results, strict=True):
|
|
610
|
+
doc.meta["_multivector_embedding"] = multivector_embedding(result["multivector"])
|
|
611
|
+
|
|
612
|
+
return {"documents": documents}
|
|
613
|
+
|
|
614
|
+
def _build_text(self, doc: Document) -> str:
|
|
615
|
+
"""Build the text to embed for a document, optionally including metadata fields."""
|
|
616
|
+
parts = [str(doc.meta[field]) for field in self._meta_fields_to_embed if field in doc.meta]
|
|
617
|
+
parts.append(doc.content or "")
|
|
618
|
+
return " ".join(parts)
|