haystack-experimental 0.14.3__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_experimental/chat_message_stores/__init__.py +1 -1
- haystack_experimental/chat_message_stores/in_memory.py +176 -31
- haystack_experimental/chat_message_stores/types.py +33 -21
- haystack_experimental/components/agents/agent.py +147 -44
- haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
- haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
- haystack_experimental/components/embedders/types/protocol.py +2 -2
- haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
- haystack_experimental/components/retrievers/__init__.py +1 -3
- haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
- haystack_experimental/components/writers/__init__.py +1 -1
- haystack_experimental/components/writers/chat_message_writer.py +25 -22
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/METADATA +24 -31
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/RECORD +17 -24
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/WHEEL +1 -1
- haystack_experimental/components/query/__init__.py +0 -18
- haystack_experimental/components/query/query_expander.py +0 -294
- haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -173
- haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -150
- haystack_experimental/super_components/__init__.py +0 -3
- haystack_experimental/super_components/indexers/__init__.py +0 -11
- haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE +0 -0
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE-MIT.txt +0 -0
haystack_experimental/components/retrievers/multi_query_text_retriever.py
DELETED
@@ -1,150 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Optional
-
-from haystack import Document, component, default_from_dict, default_to_dict
-from haystack.core.serialization import component_to_dict
-from haystack.utils.deserialization import deserialize_component_inplace
-
-from haystack_experimental.components.retrievers.types import TextRetriever
-
-
-@component
-class MultiQueryTextRetriever:
-    """
-    A component that retrieves documents using multiple queries in parallel with a text-based retriever.
-
-    This component takes a list of text queries and uses a text-based retriever to find relevant documents for each
-    query in parallel, using a thread pool to manage concurrent execution. The results are combined and sorted by
-    relevance score.
-
-    You can use this component in combination with QueryExpander component to enhance the retrieval process.
-
-    ### Usage example
-    ```python
-    from haystack import Document
-    from haystack.components.writers import DocumentWriter
-    from haystack.document_stores.in_memory import InMemoryDocumentStore
-    from haystack.document_stores.types import DuplicatePolicy
-    from haystack.components.retrievers import InMemoryBM25Retriever
-    from haystack_experimental.components.query import QueryExpander
-    from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever
-
-    documents = [
-        Document(content="Renewable energy is energy that is collected from renewable resources."),
-        Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
-        Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
-        Document(content="Hydropower is a form of renewable energy using the flow of water to generate electricity."),
-        Document(content="Geothermal energy is heat that comes from the sub-surface of the earth.")
-    ]
-
-    document_store = InMemoryDocumentStore()
-    doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
-    doc_writer.run(documents=documents)
-
-    in_memory_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)
-    multiquery_retriever = MultiQueryTextRetriever(retriever=in_memory_retriever)
-    results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
-    for doc in results["documents"]:
-        print(f"Content: {doc.content}, Score: {doc.score}")
-    >>
-    >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
-    >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.6157822790079805
-    >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
-    ```
-    """  # noqa E501
-
-    def __init__(self, *, retriever: TextRetriever, max_workers: int = 3) -> None:
-        """
-        Initialize MultiQueryTextRetriever.
-
-        :param retriever: The text-based retriever to use for document retrieval.
-        :param max_workers: Maximum number of worker threads for parallel processing. Default is 3.
-        """
-        self.retriever = retriever
-        self.max_workers = max_workers
-        self._is_warmed_up = False
-
-    def warm_up(self) -> None:
-        """
-        Warm up the retriever if it has a warm_up method.
-        """
-        if not self._is_warmed_up:
-            if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
-                self.retriever.warm_up()
-            self._is_warmed_up = True
-
-    @component.output_types(documents=list[Document])
-    def run(self, queries: list[str], retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, list[Document]]:
-        """
-        Retrieve documents using multiple queries in parallel.
-
-        :param queries: List of text queries to process.
-        :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-        :returns:
-            A dictionary containing:
-            `documents`: List of retrieved documents sorted by relevance score.
-        """
-        docs: list[Document] = []
-        seen_contents = set()
-        retriever_kwargs = retriever_kwargs or {}
-
-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
-            for result in queries_results:
-                if not result:
-                    continue
-                # deduplicate based on content
-                for doc in result:
-                    if doc.content not in seen_contents:
-                        docs.append(doc)
-                        seen_contents.add(doc.content)
-
-        docs.sort(key=lambda x: x.score or 0.0, reverse=True)
-        return {"documents": docs}
-
-    def _run_on_thread(
-        self,
-        query: str,
-        retriever_kwargs: Optional[dict[str, Any]] = None,
-    ) -> Optional[list[Document]]:
-        """
-        Process a single query on a separate thread.
-
-        :param query: The text query to process.
-        :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-        :returns:
-            List of retrieved documents or None if no results.
-        """
-        result = self.retriever.run(query=query, **(retriever_kwargs or {}))
-        if result and "documents" in result:
-            return result["documents"]
-        return None
-
-    def to_dict(self) -> dict[str, Any]:
-        """
-        Serializes the component to a dictionary.
-
-        :returns:
-            The serialized component as a dictionary.
-        """
-        return default_to_dict(
-            self,
-            retriever=component_to_dict(obj=self.retriever, name="retriever"),
-            max_workers=self.max_workers,
-        )
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "MultiQueryTextRetriever":
-        """
-        Deserializes the component from a dictionary.
-
-        :param data: The dictionary to deserialize from.
-        :returns:
-            The deserialized component.
-        """
-        deserialize_component_inplace(data["init_parameters"], key="retriever")
-        return default_from_dict(cls, data)
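Note: the removed MultiQueryTextRetriever fans a list of queries out over a thread pool, deduplicates the hits by document content, and sorts the merged list by score. Code that depended on it can reproduce the pattern with stable Haystack components; the sketch below is illustrative only (the run_queries helper is ours, not part of any release) and assumes the standard InMemoryBM25Retriever run(query=...) interface.

```python
from concurrent.futures import ThreadPoolExecutor

from haystack import Document
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Solar energy is harnessed from the sun."),
    Document(content="Geothermal energy is heat from the sub-surface of the earth."),
])
retriever = InMemoryBM25Retriever(document_store=document_store, top_k=3)

def run_queries(queries: list[str], max_workers: int = 3) -> list[Document]:
    """Run each query on a worker thread, deduplicate by content, sort by score."""
    docs: list[Document] = []
    seen: set[str] = set()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Fan the queries out exactly as the removed component did.
        for result in executor.map(lambda q: retriever.run(query=q), queries):
            for doc in result.get("documents", []):
                if doc.content not in seen:
                    seen.add(doc.content)
                    docs.append(doc)
    docs.sort(key=lambda d: d.score or 0.0, reverse=True)
    return docs

for doc in run_queries(["solar power", "geothermal heat"]):
    print(doc.content, doc.score)
```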
haystack_experimental/super_components/indexers/__init__.py
DELETED
@@ -1,11 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from haystack_experimental.super_components.indexers.sentence_transformers_document_indexer import (
-    SentenceTransformersDocumentIndexer,
-)
-
-__all__ = [
-    "SentenceTransformersDocumentIndexer",
-]
haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py
DELETED
@@ -1,199 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Any, Dict, List, Literal, Optional
-
-from haystack import Pipeline, component, default_from_dict, default_to_dict
-from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-from haystack.components.writers import DocumentWriter
-from haystack.core.super_component import SuperComponent
-from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-from haystack.utils import (
-    ComponentDevice,
-    Secret,
-    deserialize_document_store_in_init_params_inplace,
-    deserialize_secrets_inplace,
-)
-from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
-
-
-@component
-class SentenceTransformersDocumentIndexer(SuperComponent):
-    """
-    A document indexer that takes a list of documents, embeds them using SentenceTransformers, and stores them.
-
-    Usage:
-
-    ```python
-    >>> from haystack import Document
-    >>> from haystack.document_stores.in_memory import InMemoryDocumentStore
-    >>> document_store = InMemoryDocumentStore()
-    >>> doc = Document(content="I love pizza!")
-    >>> indexer = SentenceTransformersDocumentIndexer(document_store=document_store)
-    >>> indexer.warm_up()
-    >>> result = indexer.run(documents=[doc])
-    >>> print(result)
-    {'documents_written': 1}
-    >>> document_store.count_documents()
-    1
-    ```
-    """
-
-    def __init__(  # pylint: disable=R0917
-        self,
-        document_store: DocumentStore,
-        model: str = "sentence-transformers/all-mpnet-base-v2",
-        device: Optional[ComponentDevice] = None,
-        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-        prefix: str = "",
-        suffix: str = "",
-        batch_size: int = 32,
-        progress_bar: bool = True,
-        normalize_embeddings: bool = False,
-        meta_fields_to_embed: Optional[List[str]] = None,
-        embedding_separator: str = "\n",
-        trust_remote_code: bool = False,
-        truncate_dim: Optional[int] = None,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
-        config_kwargs: Optional[Dict[str, Any]] = None,
-        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
-        duplicate_policy: DuplicatePolicy = DuplicatePolicy.OVERWRITE,
-    ) -> None:
-        """
-        Initialize the SentenceTransformersDocumentIndexer component.
-
-        :param document_store: The document store where the documents should be stored.
-        :param model: The embedding model to use (local path or Hugging Face model ID).
-        :param device: The device to use for loading the model.
-        :param token: The API token to download private models from Hugging Face.
-        :param prefix: String to add at the beginning of each document text.
-        :param suffix: String to add at the end of each document text.
-        :param batch_size: Number of documents to embed at once.
-        :param progress_bar: If True, shows a progress bar when embedding documents.
-        :param normalize_embeddings: If True, embeddings are L2 normalized.
-        :param meta_fields_to_embed: List of metadata fields to embed along with the document text.
-        :param embedding_separator: Separator used to concatenate metadata fields to document text.
-        :param trust_remote_code: If True, allows custom models and scripts.
-        :param truncate_dim: Dimension to truncate sentence embeddings to.
-        :param model_kwargs: Additional keyword arguments for model initialization.
-        :param tokenizer_kwargs: Additional keyword arguments for tokenizer initialization.
-        :param config_kwargs: Additional keyword arguments for model configuration.
-        :param precision: The precision to use for the embeddings.
-        :param duplicate_policy: The duplicate policy to use when writing documents.
-        """
-        self.document_store = document_store
-        self.model = model
-        self.device = device
-        self.token = token
-        self.prefix = prefix
-        self.suffix = suffix
-        self.batch_size = batch_size
-        self.progress_bar = progress_bar
-        self.normalize_embeddings = normalize_embeddings
-        self.meta_fields_to_embed = meta_fields_to_embed
-        self.embedding_separator = embedding_separator
-        self.trust_remote_code = trust_remote_code
-        self.truncate_dim = truncate_dim
-        self.model_kwargs = model_kwargs
-        self.tokenizer_kwargs = tokenizer_kwargs
-        self.config_kwargs = config_kwargs
-        self.precision = precision
-        self.duplicate_policy = duplicate_policy
-
-        pipeline = Pipeline()
-
-        pipeline.add_component(
-            "embedder",
-            SentenceTransformersDocumentEmbedder(
-                model=self.model,
-                device=self.device,
-                token=self.token,
-                prefix=self.prefix,
-                suffix=self.suffix,
-                batch_size=self.batch_size,
-                progress_bar=self.progress_bar,
-                normalize_embeddings=self.normalize_embeddings,
-                meta_fields_to_embed=self.meta_fields_to_embed,
-                embedding_separator=self.embedding_separator,
-                trust_remote_code=self.trust_remote_code,
-                truncate_dim=self.truncate_dim,
-                model_kwargs=self.model_kwargs,
-                tokenizer_kwargs=self.tokenizer_kwargs,
-                config_kwargs=self.config_kwargs,
-                precision=self.precision,
-            ),
-        )
-        pipeline.add_component(
-            "writer",
-            DocumentWriter(
-                document_store=self.document_store,
-                policy=self.duplicate_policy,
-            ),
-        )
-
-        pipeline.connect("embedder.documents", "writer.documents")
-
-        super(SentenceTransformersDocumentIndexer, self).__init__(
-            pipeline=pipeline,
-            input_mapping={"documents": ["embedder.documents"]},
-            output_mapping={"writer.documents_written": "documents_written"},
-        )
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serialize this instance to a dictionary.
-        """
-        serialization_dict = default_to_dict(
-            self,
-            document_store=self.document_store.to_dict(),
-            model=self.model,
-            device=self.device.to_dict() if self.device else None,
-            token=self.token.to_dict() if self.token else None,
-            prefix=self.prefix,
-            suffix=self.suffix,
-            batch_size=self.batch_size,
-            progress_bar=self.progress_bar,
-            normalize_embeddings=self.normalize_embeddings,
-            meta_fields_to_embed=self.meta_fields_to_embed,
-            embedding_separator=self.embedding_separator,
-            trust_remote_code=self.trust_remote_code,
-            truncate_dim=self.truncate_dim,
-            model_kwargs=self.model_kwargs,
-            tokenizer_kwargs=self.tokenizer_kwargs,
-            config_kwargs=self.config_kwargs,
-            precision=self.precision,
-            duplicate_policy=self.duplicate_policy.value,
-        )
-
-        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
-            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
-
-        return serialization_dict
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentIndexer":
-        """
-        Load an instance of this component from a dictionary.
-        """
-        deserialize_document_store_in_init_params_inplace(data)
-        init_params = data.get("init_parameters", {})
-
-        # Handle device deserialization
-        if init_params.get("device") is not None:
-            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
-
-        # Handle secrets deserialization
-        deserialize_secrets_inplace(init_params, keys=["token"])
-
-        # Handle model kwargs deserialization
-        if init_params.get("model_kwargs") is not None:
-            deserialize_hf_model_kwargs(init_params["model_kwargs"])
-
-        # Handle duplicate policy deserialization
-        if policy_value := init_params.get("duplicate_policy"):
-            init_params["duplicate_policy"] = DuplicatePolicy(policy_value)
-
-        data["init_parameters"] = init_params
-        return default_from_dict(cls, data)
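Note: as its removed __init__ shows, SentenceTransformersDocumentIndexer was a thin SuperComponent around a two-step pipeline, a SentenceTransformersDocumentEmbedder feeding a DocumentWriter. A minimal sketch of an equivalent pipeline built from stable Haystack parts, offered as one plausible migration path rather than an official one:

```python
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

document_store = InMemoryDocumentStore()

# The same embedder -> writer wiring the removed SuperComponent set up internally.
indexing = Pipeline()
indexing.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2"),
)
indexing.add_component(
    "writer",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)
indexing.connect("embedder.documents", "writer.documents")

result = indexing.run({"embedder": {"documents": [Document(content="I love pizza!")]}})
print(result["writer"]["documents_written"])  # 1
```

The removed component's input_mapping/output_mapping only renamed these sockets, so the explicit pipeline exposes the same data flow directly.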
{haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE
RENAMED
File without changes

{haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE-MIT.txt
RENAMED
File without changes