haystack-experimental 0.14.3__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (24)
  1. haystack_experimental/chat_message_stores/__init__.py +1 -1
  2. haystack_experimental/chat_message_stores/in_memory.py +176 -31
  3. haystack_experimental/chat_message_stores/types.py +33 -21
  4. haystack_experimental/components/agents/agent.py +147 -44
  5. haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
  6. haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
  7. haystack_experimental/components/embedders/types/protocol.py +2 -2
  8. haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
  9. haystack_experimental/components/retrievers/__init__.py +1 -3
  10. haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
  11. haystack_experimental/components/writers/__init__.py +1 -1
  12. haystack_experimental/components/writers/chat_message_writer.py +25 -22
  13. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/METADATA +24 -31
  14. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/RECORD +17 -24
  15. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/WHEEL +1 -1
  16. haystack_experimental/components/query/__init__.py +0 -18
  17. haystack_experimental/components/query/query_expander.py +0 -294
  18. haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -173
  19. haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -150
  20. haystack_experimental/super_components/__init__.py +0 -3
  21. haystack_experimental/super_components/indexers/__init__.py +0 -11
  22. haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
  23. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE +0 -0
  24. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE-MIT.txt +0 -0
haystack_experimental/components/retrievers/multi_query_text_retriever.py
@@ -1,150 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from concurrent.futures import ThreadPoolExecutor
- from typing import Any, Optional
-
- from haystack import Document, component, default_from_dict, default_to_dict
- from haystack.core.serialization import component_to_dict
- from haystack.utils.deserialization import deserialize_component_inplace
-
- from haystack_experimental.components.retrievers.types import TextRetriever
-
-
- @component
- class MultiQueryTextRetriever:
-     """
-     A component that retrieves documents using multiple queries in parallel with a text-based retriever.
-
-     This component takes a list of text queries and uses a text-based retriever to find relevant documents for each
-     query in parallel, using a thread pool to manage concurrent execution. The results are combined and sorted by
-     relevance score.
-
-     You can use this component in combination with the QueryExpander component to enhance the retrieval process.
-
-     ### Usage example
-     ```python
-     from haystack import Document
-     from haystack.components.writers import DocumentWriter
-     from haystack.document_stores.in_memory import InMemoryDocumentStore
-     from haystack.document_stores.types import DuplicatePolicy
-     from haystack.components.retrievers import InMemoryBM25Retriever
-     from haystack_experimental.components.query import QueryExpander
-     from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever
-
-     documents = [
-         Document(content="Renewable energy is energy that is collected from renewable resources."),
-         Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
-         Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
-         Document(content="Hydropower is a form of renewable energy using the flow of water to generate electricity."),
-         Document(content="Geothermal energy is heat that comes from the sub-surface of the earth.")
-     ]
-
-     document_store = InMemoryDocumentStore()
-     doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
-     doc_writer.run(documents=documents)
-
-     in_memory_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)
-     multiquery_retriever = MultiQueryTextRetriever(retriever=in_memory_retriever)
-     results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
-     for doc in results["documents"]:
-         print(f"Content: {doc.content}, Score: {doc.score}")
-     >>
-     >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
-     >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.6157822790079805
-     >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
-     ```
-     """ # noqa E501
-
-     def __init__(self, *, retriever: TextRetriever, max_workers: int = 3) -> None:
-         """
-         Initialize MultiQueryTextRetriever.
-
-         :param retriever: The text-based retriever to use for document retrieval.
-         :param max_workers: Maximum number of worker threads for parallel processing. Default is 3.
-         """
-         self.retriever = retriever
-         self.max_workers = max_workers
-         self._is_warmed_up = False
-
-     def warm_up(self) -> None:
-         """
-         Warm up the retriever if it has a warm_up method.
-         """
-         if not self._is_warmed_up:
-             if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
-                 self.retriever.warm_up()
-             self._is_warmed_up = True
-
-     @component.output_types(documents=list[Document])
-     def run(self, queries: list[str], retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, list[Document]]:
-         """
-         Retrieve documents using multiple queries in parallel.
-
-         :param queries: List of text queries to process.
-         :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-         :returns:
-             A dictionary containing:
-             `documents`: List of retrieved documents sorted by relevance score.
-         """
-         docs: list[Document] = []
-         seen_contents = set()
-         retriever_kwargs = retriever_kwargs or {}
-
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
-             for result in queries_results:
-                 if not result:
-                     continue
-                 # deduplicate based on content
-                 for doc in result:
-                     if doc.content not in seen_contents:
-                         docs.append(doc)
-                         seen_contents.add(doc.content)
-
-         docs.sort(key=lambda x: x.score or 0.0, reverse=True)
-         return {"documents": docs}
-
-     def _run_on_thread(
-         self,
-         query: str,
-         retriever_kwargs: Optional[dict[str, Any]] = None,
-     ) -> Optional[list[Document]]:
-         """
-         Process a single query on a separate thread.
-
-         :param query: The text query to process.
-         :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-         :returns:
-             List of retrieved documents or None if no results.
-         """
-         result = self.retriever.run(query=query, **(retriever_kwargs or {}))
-         if result and "documents" in result:
-             return result["documents"]
-         return None
-
-     def to_dict(self) -> dict[str, Any]:
-         """
-         Serializes the component to a dictionary.
-
-         :returns:
-             The serialized component as a dictionary.
-         """
-         return default_to_dict(
-             self,
-             retriever=component_to_dict(obj=self.retriever, name="retriever"),
-             max_workers=self.max_workers,
-         )
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "MultiQueryTextRetriever":
-         """
-         Deserializes the component from a dictionary.
-
-         :param data: The dictionary to deserialize from.
-         :returns:
-             The deserialized component.
-         """
-         deserialize_component_inplace(data["init_parameters"], key="retriever")
-         return default_from_dict(cls, data)
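`MultiQueryTextRetriever` is deleted in 0.15.1 with no in-package replacement. As the removed source above shows, the component only fans queries out over a `ThreadPoolExecutor`, deduplicates hits by document content, and re-sorts by score, so the behavior can be reproduced with stable Haystack APIs. A minimal sketch, assuming `haystack-ai` is installed and any retriever whose `run(query=...)` returns `{"documents": [...]}` (the helper name `multi_query_retrieve` is illustrative, not a released API):

```python
from concurrent.futures import ThreadPoolExecutor

from haystack import Document
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Solar energy is harnessed from the sun."),
    Document(content="Wind energy is generated by wind turbines."),
])
retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)


def multi_query_retrieve(queries: list[str], max_workers: int = 3) -> list[Document]:
    """Run one retrieval per query in parallel, deduplicate by content, sort by score."""
    docs: list[Document] = []
    seen_contents: set = set()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One retriever.run call per query, executed concurrently.
        for result in executor.map(lambda q: retriever.run(query=q), queries):
            for doc in result.get("documents", []):
                if doc.content not in seen_contents:  # deduplicate on content
                    seen_contents.add(doc.content)
                    docs.append(doc)
    docs.sort(key=lambda d: d.score or 0.0, reverse=True)
    return docs


for doc in multi_query_retrieve(["solar power", "wind turbines"]):
    print(doc.content, doc.score)
```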
haystack_experimental/super_components/__init__.py
@@ -1,3 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
haystack_experimental/super_components/indexers/__init__.py
@@ -1,11 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from haystack_experimental.super_components.indexers.sentence_transformers_document_indexer import (
-     SentenceTransformersDocumentIndexer,
- )
-
- __all__ = [
-     "SentenceTransformersDocumentIndexer",
- ]
haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py
@@ -1,199 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from typing import Any, Dict, List, Literal, Optional
-
- from haystack import Pipeline, component, default_from_dict, default_to_dict
- from haystack.components.embedders import SentenceTransformersDocumentEmbedder
- from haystack.components.writers import DocumentWriter
- from haystack.core.super_component import SuperComponent
- from haystack.document_stores.types import DocumentStore, DuplicatePolicy
- from haystack.utils import (
-     ComponentDevice,
-     Secret,
-     deserialize_document_store_in_init_params_inplace,
-     deserialize_secrets_inplace,
- )
- from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
-
-
- @component
- class SentenceTransformersDocumentIndexer(SuperComponent):
-     """
-     A document indexer that takes a list of documents, embeds them using SentenceTransformers, and stores them.
-
-     Usage:
-
-     ```python
-     >>> from haystack import Document
-     >>> from haystack.document_stores.in_memory import InMemoryDocumentStore
-     >>> document_store = InMemoryDocumentStore()
-     >>> doc = Document(content="I love pizza!")
-     >>> indexer = SentenceTransformersDocumentIndexer(document_store=document_store)
-     >>> indexer.warm_up()
-     >>> result = indexer.run(documents=[doc])
-     >>> print(result)
-     {'documents_written': 1}
-     >>> document_store.count_documents()
-     1
-     ```
-     """
-
-     def __init__( # pylint: disable=R0917
-         self,
-         document_store: DocumentStore,
-         model: str = "sentence-transformers/all-mpnet-base-v2",
-         device: Optional[ComponentDevice] = None,
-         token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-         prefix: str = "",
-         suffix: str = "",
-         batch_size: int = 32,
-         progress_bar: bool = True,
-         normalize_embeddings: bool = False,
-         meta_fields_to_embed: Optional[List[str]] = None,
-         embedding_separator: str = "\n",
-         trust_remote_code: bool = False,
-         truncate_dim: Optional[int] = None,
-         model_kwargs: Optional[Dict[str, Any]] = None,
-         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
-         config_kwargs: Optional[Dict[str, Any]] = None,
-         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
-         duplicate_policy: DuplicatePolicy = DuplicatePolicy.OVERWRITE,
-     ) -> None:
-         """
-         Initialize the SentenceTransformersDocumentIndexer component.
-
-         :param document_store: The document store where the documents should be stored.
-         :param model: The embedding model to use (local path or Hugging Face model ID).
-         :param device: The device to use for loading the model.
-         :param token: The API token to download private models from Hugging Face.
-         :param prefix: String to add at the beginning of each document text.
-         :param suffix: String to add at the end of each document text.
-         :param batch_size: Number of documents to embed at once.
-         :param progress_bar: If True, shows a progress bar when embedding documents.
-         :param normalize_embeddings: If True, embeddings are L2 normalized.
-         :param meta_fields_to_embed: List of metadata fields to embed along with the document text.
-         :param embedding_separator: Separator used to concatenate metadata fields to document text.
-         :param trust_remote_code: If True, allows custom models and scripts.
-         :param truncate_dim: Dimension to truncate sentence embeddings to.
-         :param model_kwargs: Additional keyword arguments for model initialization.
-         :param tokenizer_kwargs: Additional keyword arguments for tokenizer initialization.
-         :param config_kwargs: Additional keyword arguments for model configuration.
-         :param precision: The precision to use for the embeddings.
-         :param duplicate_policy: The duplicate policy to use when writing documents.
-         """
-         self.document_store = document_store
-         self.model = model
-         self.device = device
-         self.token = token
-         self.prefix = prefix
-         self.suffix = suffix
-         self.batch_size = batch_size
-         self.progress_bar = progress_bar
-         self.normalize_embeddings = normalize_embeddings
-         self.meta_fields_to_embed = meta_fields_to_embed
-         self.embedding_separator = embedding_separator
-         self.trust_remote_code = trust_remote_code
-         self.truncate_dim = truncate_dim
-         self.model_kwargs = model_kwargs
-         self.tokenizer_kwargs = tokenizer_kwargs
-         self.config_kwargs = config_kwargs
-         self.precision = precision
-         self.duplicate_policy = duplicate_policy
-
-         pipeline = Pipeline()
-
-         pipeline.add_component(
-             "embedder",
-             SentenceTransformersDocumentEmbedder(
-                 model=self.model,
-                 device=self.device,
-                 token=self.token,
-                 prefix=self.prefix,
-                 suffix=self.suffix,
-                 batch_size=self.batch_size,
-                 progress_bar=self.progress_bar,
-                 normalize_embeddings=self.normalize_embeddings,
-                 meta_fields_to_embed=self.meta_fields_to_embed,
-                 embedding_separator=self.embedding_separator,
-                 trust_remote_code=self.trust_remote_code,
-                 truncate_dim=self.truncate_dim,
-                 model_kwargs=self.model_kwargs,
-                 tokenizer_kwargs=self.tokenizer_kwargs,
-                 config_kwargs=self.config_kwargs,
-                 precision=self.precision,
-             ),
-         )
-         pipeline.add_component(
-             "writer",
-             DocumentWriter(
-                 document_store=self.document_store,
-                 policy=self.duplicate_policy,
-             ),
-         )
-
-         pipeline.connect("embedder.documents", "writer.documents")
-
-         super(SentenceTransformersDocumentIndexer, self).__init__(
-             pipeline=pipeline,
-             input_mapping={"documents": ["embedder.documents"]},
-             output_mapping={"writer.documents_written": "documents_written"},
-         )
-
-     def to_dict(self) -> Dict[str, Any]:
-         """
-         Serialize this instance to a dictionary.
-         """
-         serialization_dict = default_to_dict(
-             self,
-             document_store=self.document_store.to_dict(),
-             model=self.model,
-             device=self.device.to_dict() if self.device else None,
-             token=self.token.to_dict() if self.token else None,
-             prefix=self.prefix,
-             suffix=self.suffix,
-             batch_size=self.batch_size,
-             progress_bar=self.progress_bar,
-             normalize_embeddings=self.normalize_embeddings,
-             meta_fields_to_embed=self.meta_fields_to_embed,
-             embedding_separator=self.embedding_separator,
-             trust_remote_code=self.trust_remote_code,
-             truncate_dim=self.truncate_dim,
-             model_kwargs=self.model_kwargs,
-             tokenizer_kwargs=self.tokenizer_kwargs,
-             config_kwargs=self.config_kwargs,
-             precision=self.precision,
-             duplicate_policy=self.duplicate_policy.value,
-         )
-
-         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
-             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
-
-         return serialization_dict
-
-     @classmethod
-     def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentIndexer":
-         """
-         Load an instance of this component from a dictionary.
-         """
-         deserialize_document_store_in_init_params_inplace(data)
-         init_params = data.get("init_parameters", {})
-
-         # Handle device deserialization
-         if init_params.get("device") is not None:
-             init_params["device"] = ComponentDevice.from_dict(init_params["device"])
-
-         # Handle secrets deserialization
-         deserialize_secrets_inplace(init_params, keys=["token"])
-
-         # Handle model kwargs deserialization
-         if init_params.get("model_kwargs") is not None:
-             deserialize_hf_model_kwargs(init_params["model_kwargs"])
-
-         # Handle duplicate policy deserialization
-         if policy_value := init_params.get("duplicate_policy"):
-             init_params["duplicate_policy"] = DuplicatePolicy(policy_value)
-
-         data["init_parameters"] = init_params
-         return default_from_dict(cls, data)
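`SentenceTransformersDocumentIndexer` is likewise removed in 0.15.1. The deleted source above shows it was a thin `SuperComponent` wrapper around a `SentenceTransformersDocumentEmbedder` connected to a `DocumentWriter`, so the same indexing flow can be rebuilt directly from those stable components. A minimal sketch, assuming `haystack-ai` and `sentence-transformers` are installed; the arguments shown mirror the removed component's defaults, and the rest of its parameters pass through to the embedder the same way:

```python
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

document_store = InMemoryDocumentStore()

# The same two-component graph the removed SuperComponent built internally:
# embed each document, then write it to the store.
indexing = Pipeline()
indexing.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2"),
)
indexing.add_component(
    "writer",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)
indexing.connect("embedder.documents", "writer.documents")

result = indexing.run({"embedder": {"documents": [Document(content="I love pizza!")]}})
print(result["writer"]["documents_written"])  # 1
```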