haystack-experimental 0.14.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. haystack_experimental/chat_message_stores/__init__.py +1 -1
  2. haystack_experimental/chat_message_stores/in_memory.py +176 -31
  3. haystack_experimental/chat_message_stores/types.py +33 -21
  4. haystack_experimental/components/agents/agent.py +184 -35
  5. haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
  6. haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
  7. haystack_experimental/components/embedders/types/protocol.py +2 -2
  8. haystack_experimental/components/preprocessors/__init__.py +2 -0
  9. haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
  10. haystack_experimental/components/preprocessors/md_header_level_inferrer.py +2 -2
  11. haystack_experimental/components/retrievers/__init__.py +1 -3
  12. haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
  13. haystack_experimental/components/writers/__init__.py +1 -1
  14. haystack_experimental/components/writers/chat_message_writer.py +25 -22
  15. haystack_experimental/core/pipeline/breakpoint.py +5 -3
  16. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/METADATA +24 -31
  17. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/RECORD +20 -27
  18. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/WHEEL +1 -1
  19. haystack_experimental/components/query/__init__.py +0 -18
  20. haystack_experimental/components/query/query_expander.py +0 -299
  21. haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -180
  22. haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -158
  23. haystack_experimental/super_components/__init__.py +0 -3
  24. haystack_experimental/super_components/indexers/__init__.py +0 -11
  25. haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
  26. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/licenses/LICENSE +0 -0
  27. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/licenses/LICENSE-MIT.txt +0 -0
haystack_experimental/components/query/query_expander.py
@@ -1,299 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- import json
- from typing import Any, Dict, List, Optional
-
- from haystack import default_from_dict, default_to_dict, logging
- from haystack.components.builders.prompt_builder import PromptBuilder
- from haystack.components.generators.chat.openai import OpenAIChatGenerator
- from haystack.components.generators.chat.types import ChatGenerator
- from haystack.core.component import component
- from haystack.core.serialization import component_to_dict
- from haystack.dataclasses.chat_message import ChatMessage
- from haystack.utils.deserialization import deserialize_chatgenerator_inplace
-
- logger = logging.getLogger(__name__)
-
-
- DEFAULT_PROMPT_TEMPLATE = """
- You are part of an information system that processes user queries for retrieval.
- You have to expand a given query into {{ n_expansions }} queries that are
- semantically similar to improve retrieval recall.
-
- Structure:
- Follow the structure shown below in examples to generate expanded queries.
-
- Examples:
- 1. Query: "climate change effects"
- {"queries": ["impact of climate change", "consequences of global warming", "effects of environmental changes"]}
-
- 2. Query: "machine learning algorithms"
- {"queries": ["neural networks", "clustering techniques", "supervised learning methods", "deep learning models"]}
-
- 3. Query: "open source NLP frameworks"
- {"queries": ["natural language processing tools", "free nlp libraries", "open-source NLP platforms"]}
-
- Guidelines:
- - Generate queries that use different words and phrasings
- - Include synonyms and related terms
- - Maintain the same core meaning and intent
- - Make queries that are likely to retrieve relevant information the original might miss
- - Focus on variations that would work well with keyword-based search
- - Respond in the same language as the input query
-
- Your Task:
- Query: "{{ query }}"
-
- You *must* respond with a JSON object containing a "queries" array with the expanded queries.
- Example: {"queries": ["query1", "query2", "query3"]}"""
-
-
- @component
- class QueryExpander:
-     """
-     A component that returns a list of semantically similar queries to improve retrieval recall in RAG systems.
-
-     The component uses a chat generator to expand queries. The chat generator is expected to return a JSON response
-     with the following structure:
-     ```json
-     {"queries": ["expanded query 1", "expanded query 2", "expanded query 3"]}
-     ```
-
-     ### Usage example
-
-     ```python
-     from haystack.components.generators.chat.openai import OpenAIChatGenerator
-     from haystack_experimental.components.query import QueryExpander
-
-     expander = QueryExpander(
-         chat_generator=OpenAIChatGenerator(model="gpt-4.1-mini"),
-         n_expansions=3
-     )
-
-     result = expander.run(query="green energy sources")
-     print(result["queries"])
-     # Output: ['alternative query 1', 'alternative query 2', 'alternative query 3', 'green energy sources']
-     # Note: Up to 3 additional queries + 1 original query (if include_original_query=True)
-
-     # To control total number of queries:
-     expander = QueryExpander(n_expansions=2, include_original_query=True) # Up to 3 total
-     # or
-     expander = QueryExpander(n_expansions=3, include_original_query=False) # Exactly 3 total
-     ```
-     """
-
-     def __init__(
-         self,
-         *,
-         chat_generator: Optional[ChatGenerator] = None,
-         prompt_template: Optional[str] = None,
-         n_expansions: int = 4,
-         include_original_query: bool = True,
-     ):
-         """
-         Initialize the QueryExpander component.
-
-         :param chat_generator: The chat generator component to use for query expansion.
-             If None, a default OpenAIChatGenerator with gpt-4.1-mini model is used.
-         :param prompt_template: Custom [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder)
-             template for query expansion. The template should instruct the LLM to return a JSON response with the
-             structure: `{"queries": ["query1", "query2", "query3"]}`. The template should include 'query' and
-             'n_expansions' variables.
-         :param n_expansions: Number of alternative queries to generate (default: 4).
-         :param include_original_query: Whether to include the original query in the output.
-         """
-         if n_expansions <= 0:
-             raise ValueError("n_expansions must be positive")
-
-         self.n_expansions = n_expansions
-         self.include_original_query = include_original_query
-
-         if chat_generator is None:
-             self.chat_generator: ChatGenerator = OpenAIChatGenerator(
-                 model="gpt-4.1-mini",
-                 generation_kwargs={
-                     "temperature": 0.7,
-                     "response_format": {
-                         "type": "json_schema",
-                         "json_schema": {
-                             "name": "query_expansion",
-                             "schema": {
-                                 "type": "object",
-                                 "properties": {"queries": {"type": "array", "items": {"type": "string"}}},
-                                 "required": ["queries"],
-                                 "additionalProperties": False,
-                             },
-                         },
-                     },
-                     "seed": 42,
-                 },
-             )
-         else:
-             self.chat_generator = chat_generator
-
-         self._is_warmed_up = False
-         self._supports_warm_up = hasattr(self.chat_generator, "warm_up")
-         self.prompt_template = prompt_template or DEFAULT_PROMPT_TEMPLATE
-
-         # Check if required variables are present in the template
-         if "query" not in self.prompt_template:
-             logger.warning(
-                 "The prompt template does not contain the 'query' variable. This may cause issues during execution."
-             )
-         if "n_expansions" not in self.prompt_template:
-             logger.warning(
-                 "The prompt template does not contain the 'n_expansions' variable. "
-                 "This may cause issues during execution."
-             )
-
-         self._prompt_builder = PromptBuilder(
-             template=self.prompt_template,
-             required_variables=["n_expansions", "query"],
-         )
-
-     def to_dict(self) -> Dict[str, Any]:
-         """
-         Serializes the component to a dictionary.
-
-         :return: Dictionary with serialized data.
-         """
-         return default_to_dict(
-             self,
-             chat_generator=component_to_dict(self.chat_generator, name="chat_generator"),
-             prompt_template=self.prompt_template,
-             n_expansions=self.n_expansions,
-             include_original_query=self.include_original_query,
-         )
-
-     @classmethod
-     def from_dict(cls, data: Dict[str, Any]) -> "QueryExpander":
-         """
-         Deserializes the component from a dictionary.
-
-         :param data: Dictionary with serialized data.
-         :return: Deserialized component.
-         """
-         init_params = data.get("init_parameters", {})
-
-         deserialize_chatgenerator_inplace(init_params, key="chat_generator")
-
-         return default_from_dict(cls, data)
-
-     @component.output_types(queries=List[str])
-     def run(
-         self,
-         query: str,
-         n_expansions: Optional[int] = None,
-     ) -> Dict[str, List[str]]:
-         """
-         Expand the input query into multiple semantically similar queries.
-
-         The language of the original query is preserved in the expanded queries.
-
-         :param query: The original query to expand.
-         :param n_expansions: Number of additional queries to generate (not including the original).
-             If None, uses the value from initialization. Can be 0 to generate no additional queries.
-         :return: Dictionary with "queries" key containing the list of expanded queries.
-             If include_original_query=True, the original query will be included in addition
-             to the n_expansions alternative queries.
-         :raises ValueError: If n_expansions is not positive (less than or equal to 0).
-         :raises RuntimeError: If the component is not warmed up and the chat generator does not support warm up.
-         """
-
-         if not self._is_warmed_up and self._supports_warm_up:
-             raise RuntimeError("The component is not warmed up. Please call the `warm_up` method first.")
-
-         response = {"queries": [query] if self.include_original_query else []}
-
-         if not query.strip():
-             logger.warning("Empty query provided to QueryExpander")
-             return response
-
-         expansion_count = n_expansions if n_expansions is not None else self.n_expansions
-         if expansion_count <= 0:
-             raise ValueError("n_expansions must be positive")
-
-         try:
-             prompt_result = self._prompt_builder.run(query=query.strip(), n_expansions=expansion_count)
-             generator_result = self.chat_generator.run(messages=[ChatMessage.from_user(prompt_result["prompt"])])
-
-             if not generator_result.get("replies") or len(generator_result["replies"]) == 0:
-                 logger.warning("ChatGenerator returned no replies for query: {query}", query=query)
-                 return response
-
-             expanded_text = generator_result["replies"][0].text.strip()
-             expanded_queries = self._parse_expanded_queries(expanded_text)
-
-             # Limit the number of expanded queries to the requested amount
-             if len(expanded_queries) > expansion_count:
-                 logger.warning(
-                     "Generated {generated_count} queries but only {requested_count} were requested. "
-                     "Truncating to the first {requested_count} queries. ",
-                     generated_count=len(expanded_queries),
-                     requested_count=expansion_count,
-                 )
-                 expanded_queries = expanded_queries[:expansion_count]
-
-             # Add original query if requested and remove duplicates
-             if self.include_original_query:
-                 expanded_queries_lower = [q.lower() for q in expanded_queries]
-                 if query.lower() not in expanded_queries_lower:
-                     expanded_queries.append(query)
-
-             response["queries"] = expanded_queries
-             return response
-
-         except Exception as e:
-             # Fallback: return original query to maintain pipeline functionality
-             logger.error("Failed to expand query {query}: {error}", query=query, error=str(e))
-             return response
-
-     def warm_up(self):
-         """
-         Warm up the underlying LLM if it supports it.
-         """
-         if not self._is_warmed_up and self._supports_warm_up:
-             self.chat_generator.warm_up()  # type: ignore[attr-defined]
-             self._is_warmed_up = True
-
-     @staticmethod
-     def _parse_expanded_queries(generator_response: str) -> List[str]:
-         """
-         Parse the generator response to extract individual expanded queries.
-
-         :param generator_response: The raw text response from the generator.
-         :return: List of parsed expanded queries.
-         """
-         if not generator_response.strip():
-             return []
-
-         try:
-             parsed = json.loads(generator_response)
-             if not isinstance(parsed, dict) or "queries" not in parsed:
-                 logger.warning(
-                     "Generator response is not a JSON object containing a 'queries' array: {response}",
-                     response=generator_response[:100],
-                 )
-                 return []
-
-             queries = []
-             for item in parsed["queries"]:
-                 if isinstance(item, str) and item.strip():
-                     queries.append(item.strip())
-                 else:
-                     logger.warning(
-                         "Skipping non-string or empty query in response: {item}",
-                         item=item,
-                     )
-
-             return queries
-
-         except json.JSONDecodeError as e:
-             logger.warning(
-                 "Failed to parse JSON response: {error}. Response: {response}",
-                 error=str(e),
-                 response=generator_response[:100],
-             )
-             return []
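The `prompt_template` parameter documented in `__init__` above expects a PromptBuilder (Jinja) template that references both `query` and `n_expansions` and asks the model for a `{"queries": [...]}` JSON object. Below is a minimal sketch of a custom template against the 0.14.x API shown in this hunk; the template wording is illustrative and not part of the package, and the default OpenAIChatGenerator requires `OPENAI_API_KEY` to be set.

```python
# Sketch only: custom prompt_template for the removed QueryExpander, per its __init__ docstring.
# The template must contain the 'query' and 'n_expansions' variables and instruct the model
# to reply with a {"queries": [...]} JSON object; the exact wording here is illustrative.
from haystack_experimental.components.query import QueryExpander

CUSTOM_TEMPLATE = """
Rewrite the query below into {{ n_expansions }} alternative search queries that keep the
same intent but use different wording and synonyms.

Query: "{{ query }}"

Respond only with a JSON object such as {"queries": ["query1", "query2"]}.
"""

expander = QueryExpander(prompt_template=CUSTOM_TEMPLATE, n_expansions=3)
expander.warm_up()  # no-op unless the chat generator defines warm_up
print(expander.run(query="open source NLP frameworks")["queries"])
```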
haystack_experimental/components/retrievers/multi_query_embedding_retriever.py
@@ -1,180 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from concurrent.futures import ThreadPoolExecutor
- from typing import Any, List, Optional
-
- from haystack import Document, component, default_from_dict, default_to_dict
- from haystack.components.embedders.types.protocol import TextEmbedder
- from haystack.core.serialization import component_to_dict
- from haystack.utils.deserialization import deserialize_component_inplace
-
- from haystack_experimental.components.retrievers.types import EmbeddingRetriever
-
-
- @component
- class MultiQueryEmbeddingRetriever:
-     """
-     A component that retrieves documents using multiple queries in parallel with an embedding-based retriever.
-
-     This component takes a list of text queries, converts them to embeddings using a query embedder,
-     and then uses an embedding-based retriever to find relevant documents for each query in parallel.
-     The results are combined and sorted by relevance score.
-
-     ### Usage example
-
-     ```python
-     from haystack import Document
-     from haystack.document_stores.in_memory import InMemoryDocumentStore
-     from haystack.document_stores.types import DuplicatePolicy
-     from haystack.components.embedders import SentenceTransformersTextEmbedder
-     from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-     from haystack.components.retrievers import InMemoryEmbeddingRetriever
-     from haystack.components.writers import DocumentWriter
-     from haystack_experimental.components.retrievers import MultiQueryEmbeddingRetriever
-
-     documents = [
-         Document(content="Renewable energy is energy that is collected from renewable resources."),
-         Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
-         Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
-         Document(content="Geothermal energy is heat that comes from the sub-surface of the earth."),
-         Document(content="Biomass energy is produced from organic materials, such as plant and animal waste."),
-         Document(content="Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources."),
-     ]
-
-     # Populate the document store
-     doc_store = InMemoryDocumentStore()
-     doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
-     doc_embedder.warm_up()
-     doc_writer = DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP)
-     documents = doc_embedder.run(documents)["documents"]
-     doc_writer.run(documents=documents)
-
-     # Run the multi-query retriever
-     in_memory_retriever = InMemoryEmbeddingRetriever(document_store=doc_store, top_k=1)
-     query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
-
-     multi_query_retriever = MultiQueryEmbeddingRetriever(
-         retriever=in_memory_retriever,
-         query_embedder=query_embedder,
-         max_workers=3
-     )
-
-     queries = ["Geothermal energy", "natural gas", "turbines"]
-     result = multi_query_retriever.run(queries=queries)
-     for doc in result["documents"]:
-         print(f"Content: {doc.content}, Score: {doc.score}")
-     >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
-     >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
-     >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
-     >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680995796
-     >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.3091423972562246
-     >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243668087
-     ```
-     """ # noqa E501
-
-     def __init__(
-         self,
-         *,
-         retriever: EmbeddingRetriever,
-         query_embedder: TextEmbedder,
-         max_workers: int = 3,
-     ):
-         """
-         Initialize MultiQueryEmbeddingRetriever.
-
-         :param retriever: The embedding-based retriever to use for document retrieval.
-         :param query_embedder: The query embedder to convert text queries to embeddings.
-         :param max_workers: Maximum number of worker threads for parallel processing.
-         """
-         self.retriever = retriever
-         self.query_embedder = query_embedder
-         self.max_workers = max_workers
-         self._is_warmed_up = False
-
-     def warm_up(self) -> None:
-         """
-         Warm up the query embedder and the retriever if any has a warm_up method.
-         """
-         if not self._is_warmed_up:
-             if hasattr(self.query_embedder, "warm_up") and callable(getattr(self.query_embedder, "warm_up")):
-                 self.query_embedder.warm_up()
-             if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
-                 self.retriever.warm_up()
-             self._is_warmed_up = True
-
-     @component.output_types(documents=List[Document])
-     def run(
-         self,
-         queries: List[str],
-         retriever_kwargs: Optional[dict[str, Any]] = None,
-     ) -> dict[str, Any]:
-         """
-         Retrieve documents using multiple queries in parallel.
-
-         :param queries: List of text queries to process.
-         :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-         :returns:
-             A dictionary containing:
-             - `documents`: List of retrieved documents sorted by relevance score.
-         """
-         docs: list[Document] = []
-         seen_contents = set()
-         retriever_kwargs = retriever_kwargs or {}
-
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
-             for result in queries_results:
-                 if not result:
-                     continue
-                 for doc in result:
-                     # deduplicate based on content
-                     if doc.content not in seen_contents:
-                         docs.append(doc)
-                         seen_contents.add(doc.content)
-
-         docs.sort(key=lambda x: x.score or 0.0, reverse=True)
-         return {"documents": docs}
-
-     def _run_on_thread(self, query: str, retriever_kwargs: Optional[dict[str, Any]] = None) -> Optional[List[Document]]:
-         """
-         Process a single query on a separate thread.
-
-         :param query: The text query to process.
-         :returns:
-             List of retrieved documents or None if no results.
-         """
-         embedding_result = self.query_embedder.run(text=query)
-         query_embedding = embedding_result["embedding"]
-         result = self.retriever.run(query_embedding=query_embedding, **(retriever_kwargs or {}))
-         if result and "documents" in result:
-             return result["documents"]
-         return None
-
-     def to_dict(self) -> dict[str, Any]:
-         """
-         Serializes the component to a dictionary.
-
-         :returns:
-             A dictionary representing the serialized component.
-         """
-         return default_to_dict(
-             self,
-             retriever=component_to_dict(obj=self.retriever, name="retriever"),
-             query_embedder=component_to_dict(obj=self.query_embedder, name="query_embedder"),
-             max_workers=self.max_workers,
-         )
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "MultiQueryEmbeddingRetriever":
-         """
-         Deserializes the component from a dictionary.
-
-         :param data: The dictionary to deserialize from.
-         :returns:
-             The deserialized component.
-         """
-         deserialize_component_inplace(data["init_parameters"], key="retriever")
-         deserialize_component_inplace(data["init_parameters"], key="query_embedder")
-         return default_from_dict(cls, data)
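The `run()` method above forwards `retriever_kwargs` unchanged to the wrapped retriever's `run()` for every query, so its keys must match parameters that retriever accepts. A small sketch continuing the docstring example (the `multi_query_retriever` wrapping an `InMemoryEmbeddingRetriever`, which accepts `top_k` at query time); values are illustrative:

```python
# Sketch only: per-call retriever options via retriever_kwargs, continuing the docstring example.
result = multi_query_retriever.run(
    queries=["Geothermal energy", "natural gas", "turbines"],
    retriever_kwargs={"top_k": 2},  # forwarded to InMemoryEmbeddingRetriever.run for each query
)
# Results from all queries are deduplicated by document content and sorted by score.
for doc in result["documents"]:
    print(doc.score, doc.content)
```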
haystack_experimental/components/retrievers/multi_query_text_retriever.py
@@ -1,158 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from concurrent.futures import ThreadPoolExecutor
- from typing import Any, List, Optional
-
- from haystack import Document, component, default_from_dict, default_to_dict
- from haystack.core.serialization import component_to_dict
- from haystack.utils.deserialization import deserialize_component_inplace
-
- from haystack_experimental.components.retrievers.types import TextRetriever
-
-
- @component
- class MultiQueryTextRetriever:
-     """
-     A component that retrieves documents using multiple queries in parallel with a text-based retriever.
-
-     This component takes a list of text queries and uses a text-based retriever to find relevant documents for each
-     query in parallel, using a thread pool to manage concurrent execution. The results are combined and sorted by
-     relevance score.
-
-     You can use this component in combination with QueryExpander component to enhance the retrieval process.
-
-     ### Usage example
-     ```python
-     from haystack import Document
-     from haystack.components.writers import DocumentWriter
-     from haystack.document_stores.in_memory import InMemoryDocumentStore
-     from haystack.document_stores.types import DuplicatePolicy
-     from haystack.components.retrievers import InMemoryBM25Retriever
-     from haystack_experimental.components.query import QueryExpander
-     from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever
-
-     documents = [
-         Document(content="Renewable energy is energy that is collected from renewable resources."),
-         Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
-         Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
-         Document(content="Hydropower is a form of renewable energy using the flow of water to generate electricity."),
-         Document(content="Geothermal energy is heat that comes from the sub-surface of the earth.")
-     ]
-
-     document_store = InMemoryDocumentStore()
-     doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
-     doc_writer.run(documents=documents)
-
-     in_memory_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)
-     multiquery_retriever = MultiQueryTextRetriever(retriever=in_memory_retriever)
-     results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
-     for doc in results["documents"]:
-         print(f"Content: {doc.content}, Score: {doc.score}")
-     >>
-     >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
-     >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.6157822790079805
-     >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
-     ```
-     """ # noqa E501
-
-     def __init__(
-         self,
-         retriever: TextRetriever,
-         max_workers: int = 3,
-     ):
-         """
-         Initialize MultiQueryTextRetriever.
-
-         :param retriever: The text-based retriever to use for document retrieval.
-         :param max_workers: Maximum number of worker threads for parallel processing. Default is 3.
-         """
-         self.retriever = retriever
-         self.max_workers = max_workers
-         self._is_warmed_up = False
-
-     def warm_up(self) -> None:
-         """
-         Warm up the retriever if it has a warm_up method.
-         """
-         if not self._is_warmed_up:
-             if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
-                 self.retriever.warm_up()
-             self._is_warmed_up = True
-
-     @component.output_types(documents=list[Document])
-     def run(
-         self,
-         queries: List[str],
-         retriever_kwargs: Optional[dict[str, Any]] = None,
-     ) -> dict[str, Any]:
-         """
-         Retrieve documents using multiple queries in parallel.
-
-         :param queries: List of text queries to process.
-         :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-         :returns:
-             A dictionary containing:
-             `documents`: List of retrieved documents sorted by relevance score.
-         """
-         docs: list[Document] = []
-         seen_contents = set()
-         retriever_kwargs = retriever_kwargs or {}
-
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
-             for result in queries_results:
-                 if not result:
-                     continue
-                 # deduplicate based on content
-                 for doc in result:
-                     if doc.content not in seen_contents:
-                         docs.append(doc)
-                         seen_contents.add(doc.content)
-
-         docs.sort(key=lambda x: x.score or 0.0, reverse=True)
-         return {"documents": docs}
-
-     def _run_on_thread(
-         self,
-         query: str,
-         retriever_kwargs: Optional[dict[str, Any]] = None,
-     ) -> Optional[list[Document]]:
-         """
-         Process a single query on a separate thread.
-
-         :param query: The text query to process.
-         :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
-         :returns:
-             List of retrieved documents or None if no results.
-         """
-         result = self.retriever.run(query=query, **(retriever_kwargs or {}))
-         if result and "documents" in result:
-             return result["documents"]
-         return None
-
-     def to_dict(self) -> dict[str, Any]:
-         """
-         Serializes the component to a dictionary.
-
-         :returns:
-             The serialized component as a dictionary.
-         """
-         return default_to_dict(
-             self,
-             retriever=component_to_dict(obj=self.retriever, name="retriever"),
-             max_workers=self.max_workers,
-         )
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "MultiQueryTextRetriever":
-         """
-         Deserializes the component from a dictionary.
-
-         :param data: The dictionary to deserialize from.
-         :returns:
-             The deserialized component.
-         """
-         deserialize_component_inplace(data["init_parameters"], key="retriever")
-         return default_from_dict(cls, data)
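The class docstring above suggests pairing this retriever with `QueryExpander`. A hedged sketch of that combination as a Haystack `Pipeline`, using the 0.14.x import paths shown in this diff; the component names and sample documents are illustrative, and the expander's default OpenAIChatGenerator requires `OPENAI_API_KEY` to be set:

```python
# Sketch only: QueryExpander feeding MultiQueryTextRetriever inside one pipeline,
# as suggested by the MultiQueryTextRetriever docstring. Names and data are illustrative.
from haystack import Document, Pipeline
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_experimental.components.query import QueryExpander
from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
    Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
])

pipeline = Pipeline()
pipeline.add_component("expander", QueryExpander(n_expansions=3))
pipeline.add_component(
    "retriever",
    MultiQueryTextRetriever(retriever=InMemoryBM25Retriever(document_store=document_store, top_k=2)),
)
# QueryExpander outputs "queries" (List[str]); MultiQueryTextRetriever consumes "queries".
pipeline.connect("expander.queries", "retriever.queries")

result = pipeline.run({"expander": {"query": "green energy sources"}})
print(result["retriever"]["documents"])
```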
haystack_experimental/super_components/__init__.py
@@ -1,3 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
haystack_experimental/super_components/indexers/__init__.py
@@ -1,11 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from haystack_experimental.super_components.indexers.sentence_transformers_document_indexer import (
-     SentenceTransformersDocumentIndexer,
- )
-
- __all__ = [
-     "SentenceTransformersDocumentIndexer",
- ]