sie-haystack 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
"""SIE integration for Haystack.

Haystack components that delegate inference to a SIE server:

Dense embedders:
    - SIETextEmbedder: embeds a single text string (queries).
    - SIEDocumentEmbedder: embeds documents and stores the vectors on them.

Sparse embedders (for hybrid search):
    - SIESparseTextEmbedder: sparse embeddings for queries.
    - SIESparseDocumentEmbedder: sparse embeddings for documents.

Rankers and extractors:
    - SIERanker: reranks documents by relevance to a query.
    - SIEExtractor: extracts entities from text.

Example usage:
    from haystack import Document
    from sie_haystack import SIETextEmbedder, SIEDocumentEmbedder, SIERanker

    # Embed a query
    text_embedder = SIETextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
    result = text_embedder.run(text="What is machine learning?")
    query_embedding = result["embedding"]

    # Embed documents
    doc_embedder = SIEDocumentEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
    docs = [Document(content="Python is a programming language.")]
    result = doc_embedder.run(documents=docs)
    embedded_docs = result["documents"]

    # Rerank documents
    ranker = SIERanker(base_url="http://localhost:8080", model="jinaai/jina-reranker-v2-base-multilingual")
    result = ranker.run(query="What is Python?", documents=embedded_docs, top_k=3)
    ranked_docs = result["documents"]

Hybrid search example:
    from sie_haystack import SIESparseTextEmbedder, SIESparseDocumentEmbedder

    # Sparse embeddings for hybrid search with Qdrant
    sparse_text_embedder = SIESparseTextEmbedder(model="BAAI/bge-m3")
    result = sparse_text_embedder.run(text="What is machine learning?")
    sparse_embedding = result["sparse_embedding"]  # {"indices": [...], "values": [...]}
"""

from sie_haystack.embedders import (
    SIEDocumentEmbedder,
    SIESparseDocumentEmbedder,
    SIESparseTextEmbedder,
    SIETextEmbedder,
)
from sie_haystack.extractors import SIEExtractor
from sie_haystack.rankers import SIERanker

# Public API, kept in alphabetical order.
__all__ = [
    "SIEDocumentEmbedder",
    "SIEExtractor",
    "SIERanker",
    "SIESparseDocumentEmbedder",
    "SIESparseTextEmbedder",
    "SIETextEmbedder",
]
@@ -0,0 +1,418 @@
1
+ """SIE embedding components for Haystack.
2
+
3
+ Provides embedder components following Haystack's conventions:
4
+ - SIETextEmbedder: For embedding single text strings (queries) - dense embeddings
5
+ - SIEDocumentEmbedder: For embedding documents - dense embeddings
6
+ - SIESparseTextEmbedder: For sparse embeddings of queries (hybrid search)
7
+ - SIESparseDocumentEmbedder: For sparse embeddings of documents (hybrid search)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
+ from haystack import Document, component
15
+
16
+
17
@component
class SIETextEmbedder:
    """Dense query embedder backed by a SIE server.

    Intended for the query side of retrieval pipelines: feed it one string
    and read the resulting dense vector from ``result["embedding"]``.

    Example:
        >>> embedder = SIETextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
        >>> result = embedder.run(text="What is vector search?")
        >>> embedding = result["embedding"]  # list[float]
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "BAAI/bge-m3",
        *,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for encoding.
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
        """
        self._base_url = base_url
        self._model = model
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(embedding=list[float])
    def run(self, text: str) -> dict[str, list[float]]:
        """Embed one text string.

        Args:
            text: The text to embed.

        Returns:
            Dictionary with an "embedding" key holding the dense vector.
        """
        from sie_sdk.types import Item

        response = self.client.encode(
            self._model,
            Item(text=text),
            output_types=["dense"],
            options={"is_query": True},
        )
        return {"embedding": self._extract_dense(response)}

    def _extract_dense(self, result: Any) -> list[float]:
        """Pull the dense vector out of an SDK result; [] when absent."""
        # The SDK result exposes the vector under "dense" (mapping or attribute).
        if isinstance(result, dict):
            dense = result.get("dense")
        else:
            dense = getattr(result, "dense", None)
        if dense is None:
            return []
        if hasattr(dense, "tolist"):  # e.g. a numpy array
            return dense.tolist()
        return list(dense)
100
+
101
+
102
@component
class SIEDocumentEmbedder:
    """Dense document embedder backed by a SIE server.

    Embeds a batch of Haystack documents and writes the resulting dense
    vector onto each document's ``embedding`` attribute, ready for indexing.

    Example:
        >>> from haystack import Document
        >>> embedder = SIEDocumentEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
        >>> docs = [Document(content="Python is a programming language.")]
        >>> result = embedder.run(documents=docs)
        >>> embedded_docs = result["documents"]
        >>> print(embedded_docs[0].embedding)  # list[float]
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "BAAI/bge-m3",
        *,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
        meta_fields_to_embed: list[str] | None = None,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for encoding.
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
            meta_fields_to_embed: Metadata fields whose values are prepended to
                the document content before embedding.
        """
        self._base_url = base_url
        self._model = model
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._meta_fields_to_embed = meta_fields_to_embed or []
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
        """Embed the documents in one batched call and attach the vectors.

        Args:
            documents: Documents to embed; their ``embedding`` attribute is
                set in place.

        Returns:
            Dictionary with a "documents" key holding the same documents,
            each carrying its embedding.
        """
        if not documents:
            return {"documents": []}

        from sie_sdk.types import Item

        # One Item per document; selected metadata may be folded into the text.
        items = [Item(text=self._build_text(doc)) for doc in documents]

        # Single batched request for all documents.
        results = self.client.encode(
            self._model,
            items,
            output_types=["dense"],
        )

        # strict=True guards against the SDK returning a mismatched count.
        for doc, result in zip(documents, results, strict=True):
            doc.embedding = self._extract_dense(result)

        return {"documents": documents}

    def _build_text(self, doc: Document) -> str:
        """Join selected metadata values and the content into one string."""
        pieces = [
            str(doc.meta[field])
            for field in self._meta_fields_to_embed
            if field in doc.meta
        ]
        pieces.append(doc.content or "")
        return " ".join(pieces)

    def _extract_dense(self, result: Any) -> list[float]:
        """Pull the dense vector out of an SDK result; [] when absent."""
        # The SDK result exposes the vector under "dense" (mapping or attribute).
        if isinstance(result, dict):
            dense = result.get("dense")
        else:
            dense = getattr(result, "dense", None)
        if dense is None:
            return []
        if hasattr(dense, "tolist"):  # e.g. a numpy array
            return dense.tolist()
        return list(dense)
211
+
212
+
213
@component
class SIESparseTextEmbedder:
    """Sparse query embedder backed by a SIE server.

    Produces sparse (indices/values) embeddings for the query side of
    hybrid-search pipelines; works with QdrantHybridRetriever and other
    hybrid retrievers.

    Example:
        >>> embedder = SIESparseTextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
        >>> result = embedder.run(text="What is vector search?")
        >>> sparse_embedding = result["sparse_embedding"]  # dict with indices/values
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "BAAI/bge-m3",
        *,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for encoding. Must support sparse
                output (e.g., BAAI/bge-m3).
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
        """
        self._base_url = base_url
        self._model = model
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(sparse_embedding=dict)
    def run(self, text: str) -> dict[str, dict[str, list]]:
        """Embed one text string with sparse embeddings.

        Args:
            text: The text to embed.

        Returns:
            Dictionary with a "sparse_embedding" key holding a dict with
            "indices" and "values" lists.
        """
        from sie_sdk.types import Item

        response = self.client.encode(
            self._model,
            Item(text=text),
            output_types=["sparse"],
            options={"is_query": True},
        )
        return {"sparse_embedding": self._extract_sparse(response)}

    def _extract_sparse(self, result: Any) -> dict[str, list]:
        """Pull the sparse payload out of an SDK result as plain lists."""
        # The SDK result exposes the payload under "sparse" (mapping or
        # attribute) with "indices" and "values" entries.
        if isinstance(result, dict):
            sparse = result.get("sparse")
        else:
            sparse = getattr(result, "sparse", None)
        if sparse is None:
            return {"indices": [], "values": []}

        def as_list(name: str) -> list:
            if isinstance(sparse, dict):
                raw = sparse.get(name)
            else:
                raw = getattr(sparse, name, None)
            if hasattr(raw, "tolist"):  # e.g. a numpy array
                return raw.tolist()
            return list(raw or [])

        return {"indices": as_list("indices"), "values": as_list("values")}
302
+
303
+
304
@component
class SIESparseDocumentEmbedder:
    """Sparse document embedder backed by a SIE server.

    Embeds a batch of documents with sparse (indices/values) embeddings for
    hybrid-search indexing and stores each result in the document's meta
    under "_sparse_embedding". Works with
    QdrantDocumentStore(use_sparse_embeddings=True).

    Example:
        >>> from haystack import Document
        >>> embedder = SIESparseDocumentEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
        >>> docs = [Document(content="Python is a programming language.")]
        >>> result = embedder.run(documents=docs)
        >>> embedded_docs = result["documents"]
        >>> print(embedded_docs[0].meta["_sparse_embedding"])  # dict with indices/values
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "BAAI/bge-m3",
        *,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
        meta_fields_to_embed: list[str] | None = None,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for encoding. Must support sparse
                output (e.g., BAAI/bge-m3).
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
            meta_fields_to_embed: Metadata fields whose values are prepended to
                the document content before embedding.
        """
        self._base_url = base_url
        self._model = model
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._meta_fields_to_embed = meta_fields_to_embed or []
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
        """Embed the documents with sparse vectors and store them in meta.

        Args:
            documents: Documents to embed; each gets a "_sparse_embedding"
                entry in its meta.

        Returns:
            Dictionary with a "documents" key holding the same documents,
            each carrying its sparse embedding.
        """
        if not documents:
            return {"documents": []}

        from sie_sdk.types import Item

        # One Item per document; selected metadata may be folded into the text.
        items = [Item(text=self._build_text(doc)) for doc in documents]

        # Single batched request with sparse output.
        results = self.client.encode(
            self._model,
            items,
            output_types=["sparse"],
        )

        # strict=True guards against the SDK returning a mismatched count.
        for doc, result in zip(documents, results, strict=True):
            doc.meta["_sparse_embedding"] = self._extract_sparse(result)

        return {"documents": documents}

    def _build_text(self, doc: Document) -> str:
        """Join selected metadata values and the content into one string."""
        pieces = [
            str(doc.meta[field])
            for field in self._meta_fields_to_embed
            if field in doc.meta
        ]
        pieces.append(doc.content or "")
        return " ".join(pieces)

    def _extract_sparse(self, result: Any) -> dict[str, list]:
        """Pull the sparse payload out of an SDK result as plain lists."""
        # The SDK result exposes the payload under "sparse" (mapping or
        # attribute) with "indices" and "values" entries.
        if isinstance(result, dict):
            sparse = result.get("sparse")
        else:
            sparse = getattr(result, "sparse", None)
        if sparse is None:
            return {"indices": [], "values": []}

        def as_list(name: str) -> list:
            if isinstance(sparse, dict):
                raw = sparse.get(name)
            else:
                raw = getattr(sparse, name, None)
            if hasattr(raw, "tolist"):  # e.g. a numpy array
                return raw.tolist()
            return list(raw or [])

        return {"indices": as_list("indices"), "values": as_list("values")}
@@ -0,0 +1,144 @@
1
+ """SIE extractor component for Haystack.
2
+
3
+ Provides SIEExtractor for extracting entities from text.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ from haystack import component
12
+
13
+
14
@dataclass
class Entity:
    """A single extracted entity span.

    Plain data holder: the matched surface text, the predicted label, the
    model confidence, and the character offsets into the source string.
    """

    # Surface form of the entity as it appears in the input text.
    text: str
    # Predicted entity label (e.g. "person").
    label: str
    # Model confidence score for this span.
    score: float
    # Start character offset in the input text.
    start: int
    # End character offset in the input text.
    end: int
23
+
24
+
25
@component
class SIEExtractor:
    """Entity extractor backed by a SIE server.

    Extracts named entities or custom entity types from text using GLiNER
    or similar extraction models.

    Example:
        >>> extractor = SIEExtractor(
        ...     base_url="http://localhost:8080",
        ...     model="urchade/gliner_multi-v2.1",
        ...     labels=["person", "organization", "location"],
        ... )
        >>> result = extractor.run(text="John Smith works at Google in New York.")
        >>> entities = result["entities"]
        >>> for entity in entities:
        ...     print(f"{entity.text} ({entity.label}): {entity.score:.2f}")
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "urchade/gliner_multi-v2.1",
        labels: list[str] | None = None,
        *,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for extraction.
            labels: Entity labels to extract (e.g., ["person", "organization"]).
                Defaults to person/organization/location when omitted.
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
        """
        self._base_url = base_url
        self._model = model
        self._labels = labels or ["person", "organization", "location"]
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(entities=list[Entity])
    def run(
        self,
        text: str,
        labels: list[str] | None = None,
    ) -> dict[str, list[Entity]]:
        """Extract entities from text.

        Args:
            text: The text to extract entities from.
            labels: Override the configured labels for this call.

        Returns:
            Dictionary with an "entities" key holding the extracted entities.
        """
        from sie_sdk.types import Item

        response = self.client.extract(
            self._model,
            Item(text=text),
            # Per-call labels win over the ones set at construction time.
            labels=self._labels if labels is None else labels,
        )

        return {"entities": self._build_entities(response)}

    def _build_entities(self, result: Any) -> list[Entity]:
        """Convert the SDK extraction result into Entity objects."""
        # Anything other than a list is treated as "no entities".
        if not isinstance(result, list):
            return []
        return [self._to_entity(item) for item in result]

    @staticmethod
    def _to_entity(item: Any) -> Entity:
        """Map one SDK entity (mapping or object) onto an Entity."""
        if isinstance(item, dict):
            read = item.get
        else:
            def read(name: str, default: Any) -> Any:
                return getattr(item, name, default)
        return Entity(
            text=read("text", ""),
            label=read("label", ""),
            score=float(read("score", 0.0)),
            start=int(read("start", 0)),
            end=int(read("end", 0)),
        )
@@ -0,0 +1,139 @@
1
+ """SIE ranker component for Haystack.
2
+
3
+ Provides SIERanker for reranking documents by relevance to a query.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ from haystack import Document, component
11
+
12
+
13
@component
class SIERanker:
    """Cross-encoder reranker backed by a SIE server.

    Improves retrieval precision by rescoring candidate documents against
    a query with a cross-encoder model and returning them best-first.

    Example:
        >>> from haystack import Document
        >>> ranker = SIERanker(
        ...     base_url="http://localhost:8080",
        ...     model="jinaai/jina-reranker-v2-base-multilingual",
        ...     top_k=3,
        ... )
        >>> docs = [
        ...     Document(content="Python is a programming language."),
        ...     Document(content="The weather is sunny today."),
        ...     Document(content="Machine learning uses statistical models."),
        ... ]
        >>> result = ranker.run(query="What is Python?", documents=docs)
        >>> ranked_docs = result["documents"]  # Top 3 most relevant
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8080",
        model: str = "jinaai/jina-reranker-v2-base-multilingual",
        *,
        top_k: int | None = None,
        gpu: str | None = None,
        options: dict[str, Any] | None = None,
        timeout_s: float = 180.0,
    ) -> None:
        """Store connection settings; the SDK client is created lazily.

        Args:
            base_url: URL of the SIE server.
            model: Model name to use for scoring.
            top_k: Maximum number of documents to return. If None, returns all.
            gpu: GPU type (e.g., "l4", "a100"), forwarded to the SDK as a default.
            options: Model-specific options, forwarded to the SDK as a default.
            timeout_s: Request timeout in seconds.
        """
        self._base_url = base_url
        self._model = model
        self._top_k = top_k
        self._gpu = gpu
        self._options = options
        self._timeout_s = timeout_s
        self._client: Any = None

    @property
    def client(self) -> Any:
        """Create the SIE client on first access and cache it."""
        if self._client is not None:
            return self._client
        # Imported lazily so the package can be imported without the SDK in use.
        from sie_sdk import SIEClient

        self._client = SIEClient(
            self._base_url,
            timeout_s=self._timeout_s,
            gpu=self._gpu,
            options=self._options,
        )
        return self._client

    def warm_up(self) -> None:
        """Haystack warm-up hook: force client construction up front."""
        _ = self.client

    @component.output_types(documents=list[Document])
    def run(
        self,
        query: str,
        documents: list[Document],
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """Rerank documents by relevance to the query.

        Args:
            query: The query string to rank against.
            documents: List of documents to rerank.
            top_k: Override the configured top_k for this call.

        Returns:
            Dictionary with a "documents" key holding the reranked documents
            (copies of the inputs with "score" added to their meta).
        """
        if not documents:
            return {"documents": []}

        from sie_sdk.types import Item

        # Score every document against the query in one call.
        results = self.client.score(
            self._model,
            Item(text=query),
            [Item(text=doc.content or "") for doc in documents],
        )

        # Pair each score with a copy of the document carrying it in meta.
        # strict=True guards against the SDK returning a mismatched count.
        rescored: list[tuple[float, Document]] = []
        for doc, result in zip(documents, results, strict=True):
            score = self._extract_score(result)
            rescored.append(
                (
                    score,
                    Document(
                        id=doc.id,
                        content=doc.content,
                        meta={**doc.meta, "score": score},
                        embedding=doc.embedding,
                    ),
                )
            )

        # Best-first; key on the score only so Documents are never compared.
        rescored.sort(key=lambda pair: pair[0], reverse=True)

        ordered = [doc for _, doc in rescored]
        limit = self._top_k if top_k is None else top_k
        if limit is not None:
            ordered = ordered[:limit]

        return {"documents": ordered}

    def _extract_score(self, result: Any) -> float:
        """Read the relevance score from an SDK result (mapping or object)."""
        if isinstance(result, dict):
            return float(result.get("score", 0.0))
        return float(getattr(result, "score", 0.0))
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 2.4
2
+ Name: sie-haystack
3
+ Version: 0.1.7
4
+ Summary: SIE integration for Haystack
5
+ Author-email: Superlinked <dev@superlinked.com>
6
+ License: Apache-2.0
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: haystack-ai>=2.0.0
15
+ Requires-Dist: sie-sdk>=0.1.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: chroma-haystack>=2.0.0; extra == 'dev'
18
+ Requires-Dist: chromadb>=0.4.0; extra == 'dev'
19
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
20
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
21
+ Description-Content-Type: text/markdown
22
+
23
+ # sie-haystack
24
+
25
+ SIE integration for Haystack.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install sie-haystack
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```python
36
+ from haystack import Document
37
+ from sie_haystack import SIETextEmbedder, SIEDocumentEmbedder, SIERanker
38
+
39
+ # Embed a query
40
+ text_embedder = SIETextEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
41
+ result = text_embedder.run(text="What is machine learning?")
42
+ query_embedding = result["embedding"]
43
+
44
+ # Embed documents
45
+ doc_embedder = SIEDocumentEmbedder(base_url="http://localhost:8080", model="BAAI/bge-m3")
46
+ docs = [Document(content="Python is a programming language.")]
47
+ result = doc_embedder.run(documents=docs)
48
+ embedded_docs = result["documents"]
49
+
50
+ # Rerank documents
51
+ ranker = SIERanker(
52
+ base_url="http://localhost:8080",
53
+ model="jinaai/jina-reranker-v2-base-multilingual"
54
+ )
55
+ result = ranker.run(query="What is Python?", documents=embedded_docs, top_k=3)
56
+ ranked_docs = result["documents"]
57
+ ```
@@ -0,0 +1,7 @@
1
+ sie_haystack/__init__.py,sha256=wFzY72JlWncGhYsOLI3QQjRiDZt_o2XN_LbMDYCOZfo,2188
2
+ sie_haystack/embedders.py,sha256=LxoPZggHVs1VLSMWN5uRYqjXAR-HbhqRiA9KrUDnsRg,14941
3
+ sie_haystack/extractors.py,sha256=-nLhiD0pXBexUVlS3BpmWntJ90XUTnY3Ip2ymU1adTU,4424
4
+ sie_haystack/rankers.py,sha256=6TBEkw4apUqkct-jHCZGNtQ1N3inLLQpk0PfVO-FdqQ,4544
5
+ sie_haystack-0.1.7.dist-info/METADATA,sha256=ZgQeGO0GMuezRIWyeu3saa2nIGs5v0PYLOAKFvrxQ4k,1747
6
+ sie_haystack-0.1.7.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
7
+ sie_haystack-0.1.7.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any