haystack-experimental 0.14.3__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. haystack_experimental/chat_message_stores/__init__.py +1 -1
  2. haystack_experimental/chat_message_stores/in_memory.py +176 -31
  3. haystack_experimental/chat_message_stores/types.py +33 -21
  4. haystack_experimental/components/agents/agent.py +147 -44
  5. haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
  6. haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
  7. haystack_experimental/components/embedders/types/protocol.py +2 -2
  8. haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
  9. haystack_experimental/components/retrievers/__init__.py +1 -3
  10. haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
  11. haystack_experimental/components/writers/__init__.py +1 -1
  12. haystack_experimental/components/writers/chat_message_writer.py +25 -22
  13. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/METADATA +24 -31
  14. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/RECORD +17 -24
  15. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/WHEEL +1 -1
  16. haystack_experimental/components/query/__init__.py +0 -18
  17. haystack_experimental/components/query/query_expander.py +0 -294
  18. haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -173
  19. haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -150
  20. haystack_experimental/super_components/__init__.py +0 -3
  21. haystack_experimental/super_components/indexers/__init__.py +0 -11
  22. haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
  23. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE +0 -0
  24. {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE-MIT.txt +0 -0
@@ -1,55 +1,48 @@
1
1
  haystack_experimental/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
2
- haystack_experimental/chat_message_stores/__init__.py,sha256=sQhjMoaVfC0XsL257egtZageGg_NgzntRF71rPS0lvc,238
3
- haystack_experimental/chat_message_stores/in_memory.py,sha256=nc_B_70KOvtgsul4QJb7HihHZrO220HRFsm-aXjYFJk,2305
4
- haystack_experimental/chat_message_stores/types.py,sha256=QzjA5-A08PlMAy7MMLNNUpob8S60Ypec74gSbz_l49E,2101
2
+ haystack_experimental/chat_message_stores/__init__.py,sha256=zu1bbMQDv9xUbGadIKWrC8v-87w_Xxg6KQnTb6K0k-Q,240
3
+ haystack_experimental/chat_message_stores/in_memory.py,sha256=i4ZK5W0Q0rDpgoRCYdIjPoJV8UJBr1PlUBH4ul5Adxk,8688
4
+ haystack_experimental/chat_message_stores/types.py,sha256=mXz6QTCyNomSBzr1eU76oHVrxQEwPDuiTIGTWvKwYJM,2739
5
5
  haystack_experimental/components/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
6
6
  haystack_experimental/components/agents/__init__.py,sha256=Sxu9LxPpQ5cljgoTgUeNC0GY8CwUdiSy1JWkd_-RRJ4,414
7
- haystack_experimental/components/agents/agent.py,sha256=861fVcgFXePbxJZ0EYTkR_EhqUXBWoEwajrcjo81nvE,35589
7
+ haystack_experimental/components/agents/agent.py,sha256=ZIrzAQygva8zFhEl1Tu7WRJCz0W_MersPjtesqx8HQE,42128
8
8
  haystack_experimental/components/agents/human_in_the_loop/__init__.py,sha256=xLr1G9pNWMmCpKN9mbv6yqeFfwMcbZyaVfCkzlwMxhY,1674
9
9
  haystack_experimental/components/agents/human_in_the_loop/breakpoint.py,sha256=GhNdGdFNDnwSiTukD4WVp6-1YgGjq5oqCEcGMC2dcog,2902
10
10
  haystack_experimental/components/agents/human_in_the_loop/dataclasses.py,sha256=OakB0PXBSG0LbQixcuo-d7IC-A3_k6qi80pB8hwY23o,2563
11
11
  haystack_experimental/components/agents/human_in_the_loop/errors.py,sha256=HAjD_MCOTBirqnJdxpc2MhqIm-XnU3Soev29wRBWoMw,1066
12
12
  haystack_experimental/components/agents/human_in_the_loop/policies.py,sha256=nzblePptT4Fg2GFHa4_SDIK_d7hZ_70qPhkteZBRXWk,3172
13
- haystack_experimental/components/agents/human_in_the_loop/strategies.py,sha256=KXfMLky27EuxOUhqbHO7oQ3KkL_3lzkwvk1Gk1EMXYY,19643
14
- haystack_experimental/components/agents/human_in_the_loop/types.py,sha256=aY93Wsd-5BgOiJaaSEGB_bGC-BTx_V_UT1faXtbNNdo,3072
13
+ haystack_experimental/components/agents/human_in_the_loop/strategies.py,sha256=kX_3T6DWh4l3_-baOtJPwR0rZi4ZYWUibRA1Myeikp8,28645
14
+ haystack_experimental/components/agents/human_in_the_loop/types.py,sha256=zdKGZ0vgaq-e0akYV_aGvCBpXplRiorpWcBmEbOQvT8,4604
15
15
  haystack_experimental/components/agents/human_in_the_loop/user_interfaces.py,sha256=HlJ3-CYNrQGsHOtpvrQE4ayQls8Q3EkLFUkOoRnLVC4,8707
16
16
  haystack_experimental/components/embedders/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
17
17
  haystack_experimental/components/embedders/types/__init__.py,sha256=HGR8aavwIEx7v-8nm5JxFIw47EWn7vAUmywhakTNDCo,182
18
- haystack_experimental/components/embedders/types/protocol.py,sha256=EEVtggoYWZL6zF-vbasJollCxLbheMYIISAh7hJ8LkA,1038
18
+ haystack_experimental/components/embedders/types/protocol.py,sha256=nVMo2x_sFP9T_DN-q-_HKGrLRd3rj27m7ZLxtigY4UQ,1026
19
19
  haystack_experimental/components/generators/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
20
20
  haystack_experimental/components/generators/chat/__init__.py,sha256=LEKI1mMtltVbSiU40QgBfnWC-z3_660TWuV-cVHhdTw,465
21
21
  haystack_experimental/components/generators/chat/openai.py,sha256=gX6UI4yfY0pzKhWErquvPF_gV-3Ut0y6wSJytAD07Jk,9855
22
22
  haystack_experimental/components/preprocessors/__init__.py,sha256=qZPFKpRxdw_VZ8fZ4T3GIKOObsbeOf_pKnZbLHR9AFU,653
23
- haystack_experimental/components/preprocessors/embedding_based_document_splitter.py,sha256=VyQ--gaMsWid-IRBVXi5YPJpwbFlaK-2mRFvRF8MSBQ,17616
23
+ haystack_experimental/components/preprocessors/embedding_based_document_splitter.py,sha256=NLi9e-aVJkZEvwQVzeWduyvR74wlYRHe6ZviDBx2rTk,17604
24
24
  haystack_experimental/components/preprocessors/md_header_level_inferrer.py,sha256=vyJWAFN-uhBkb5nCuJm0p29H75gGeaomOlHolD-fj5Q,5604
25
- haystack_experimental/components/query/__init__.py,sha256=quaqe16cbtgIdJx7d56CMdk1zZQ6f_3_TICsU0HF_U8,446
26
- haystack_experimental/components/query/query_expander.py,sha256=yyBrtYey9EhaWVNgtUDhYVBIylBt7JTWF7QmSKGDXbM,11961
27
- haystack_experimental/components/retrievers/__init__.py,sha256=CqPvqyvGp5L3Y1gTVQC8DD_xHzbIfTzGlj3oCsZM3J8,528
28
- haystack_experimental/components/retrievers/chat_message_retriever.py,sha256=CaAgW1qzzhMYyKNOyk-eIBgSsO7Bg7uDqAtgcorCE60,4030
29
- haystack_experimental/components/retrievers/multi_query_embedding_retriever.py,sha256=CQAWAGSBeMrLg1PxEPQNCHplYnOgvpXA0f-zbJtugKw,8101
30
- haystack_experimental/components/retrievers/multi_query_text_retriever.py,sha256=CfaqzBHHgnh79eSGU4Iada_gcnYwavj8_Ou6Bc4ndiA,6511
25
+ haystack_experimental/components/retrievers/__init__.py,sha256=7NLOg-A7LmwxskDYebB_bDzawByCb7cXn67hVN_3e6I,245
26
+ haystack_experimental/components/retrievers/chat_message_retriever.py,sha256=O94cNAbiTQ1Jkwsk6qjt9ty4JJwd_3uBh1odJjcvG2I,6046
31
27
  haystack_experimental/components/retrievers/types/__init__.py,sha256=iOngs3gs5enY8y6AWGeyQANTB_9qpXQ0QHSFFDDeEGc,218
32
28
  haystack_experimental/components/retrievers/types/protocol.py,sha256=oUdX_P_pTszzamrkUz3YZsXL3bb4mAYIXsPCtKDH1tw,2375
33
29
  haystack_experimental/components/summarizers/__init__.py,sha256=BqnfB0ZMb9ufYUjJ4qmmmRLPXa9FT8XKhMWW8G9Zg9Y,221
34
30
  haystack_experimental/components/summarizers/llm_summarizer.py,sha256=Rzl3DKWENBKoAiHvgYPsc4ev0WHZGJZj4PBF-FDHiXI,14392
35
- haystack_experimental/components/writers/__init__.py,sha256=iMdeAaZozza8E6dQ4Lc2uOYIFaR95K7bR9mSeuDqSAA,233
36
- haystack_experimental/components/writers/chat_message_writer.py,sha256=iu8gmvmRXlqd9S2-9B8p-7C0Y5GTuOI1AqcVKAkrzDc,3502
31
+ haystack_experimental/components/writers/__init__.py,sha256=DNVIwIEUi6HKsGM5UcIUPjVH7P3I8Hzc8e4PO7tjoPM,235
32
+ haystack_experimental/components/writers/chat_message_writer.py,sha256=Mkv9nShsPFAw1PPC6cK-tyYmXjWydCOl62boPNr7KkU,4042
37
33
  haystack_experimental/core/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
38
34
  haystack_experimental/core/pipeline/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
39
35
  haystack_experimental/core/pipeline/breakpoint.py,sha256=x6EW1lAv4em1z90Ezr0oKNOZGiR8jorzZBI4MOU6qKg,5239
40
36
  haystack_experimental/dataclasses/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
41
37
  haystack_experimental/dataclasses/breakpoints.py,sha256=f0kxYXJRHzk6jAW5Na51MZfUuRIlulhN4oTrGWTpSFE,2095
42
- haystack_experimental/super_components/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
43
- haystack_experimental/super_components/indexers/__init__.py,sha256=4VPKnuzVb89Zb4PT6ejYT4s0zJ4I3rwFtcLwsCdQKJA,313
44
- haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py,sha256=hfXznLVTgO39xO4GRYgi2Xy-pl4EFKtt13JrGncjvXQ,8519
45
38
  haystack_experimental/utils/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
46
39
  haystack_experimental/utils/hallucination_risk_calculator/__init__.py,sha256=kCd-qceud_T8P1XJHgRMaOnljyDjfFQ5UIdxEb5t6V0,219
47
40
  haystack_experimental/utils/hallucination_risk_calculator/core_math.py,sha256=8XIa2gX1B7U400KutPgxfIUHrOggkBPAm9gIkwhF7UM,4079
48
41
  haystack_experimental/utils/hallucination_risk_calculator/dataclasses.py,sha256=3vk9jsbW-7C9n408Qe730qgdXxIOzsTigf4TMLpryvI,2318
49
42
  haystack_experimental/utils/hallucination_risk_calculator/openai_planner.py,sha256=-yVQsGzM5rXsAVwolE6sp5W6q1yDw66SiIUuUbPk1ng,11413
50
43
  haystack_experimental/utils/hallucination_risk_calculator/skeletonization.py,sha256=qNdBUoFiBjQsI3ovrhd4RyTFmIbv51Goai1Z_l9lG28,5488
51
- haystack_experimental-0.14.3.dist-info/METADATA,sha256=9qsaqiWgCWrmALaybmA37OSLZjYXS8H95YlqIoP6DMA,18566
52
- haystack_experimental-0.14.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
53
- haystack_experimental-0.14.3.dist-info/licenses/LICENSE,sha256=93_5nS97uHxptHvK9E8BZgKxLGeIS-rBWT2swIv-X5Y,11368
54
- haystack_experimental-0.14.3.dist-info/licenses/LICENSE-MIT.txt,sha256=knmLkIKj_6tTrTSVRg9Tq88Kww4UCPLt2I1RGXJv9sQ,1037
55
- haystack_experimental-0.14.3.dist-info/RECORD,,
44
+ haystack_experimental-0.15.1.dist-info/METADATA,sha256=bjMAG3cFa5-gq7l8sVKD58gITsPKgvv0wJjb_Mq_TUM,17581
45
+ haystack_experimental-0.15.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
46
+ haystack_experimental-0.15.1.dist-info/licenses/LICENSE,sha256=93_5nS97uHxptHvK9E8BZgKxLGeIS-rBWT2swIv-X5Y,11368
47
+ haystack_experimental-0.15.1.dist-info/licenses/LICENSE-MIT.txt,sha256=knmLkIKj_6tTrTSVRg9Tq88Kww4UCPLt2I1RGXJv9sQ,1037
48
+ haystack_experimental-0.15.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,18 +0,0 @@
1
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import sys
6
- from typing import TYPE_CHECKING
7
-
8
- from lazy_imports import LazyImporter
9
-
10
- _import_structure = {
11
- "query_expander": ["QueryExpander"],
12
- }
13
-
14
- if TYPE_CHECKING:
15
- from .query_expander import QueryExpander
16
-
17
- else:
18
- sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
@@ -1,294 +0,0 @@
1
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import json
6
- from typing import Any, Optional
7
-
8
- from haystack import default_from_dict, default_to_dict, logging
9
- from haystack.components.builders.prompt_builder import PromptBuilder
10
- from haystack.components.generators.chat.openai import OpenAIChatGenerator
11
- from haystack.components.generators.chat.types import ChatGenerator
12
- from haystack.core.component import component
13
- from haystack.core.serialization import component_to_dict
14
- from haystack.dataclasses.chat_message import ChatMessage
15
- from haystack.utils.deserialization import deserialize_chatgenerator_inplace
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- DEFAULT_PROMPT_TEMPLATE = """
21
- You are part of an information system that processes user queries for retrieval.
22
- You have to expand a given query into {{ n_expansions }} queries that are
23
- semantically similar to improve retrieval recall.
24
-
25
- Structure:
26
- Follow the structure shown below in examples to generate expanded queries.
27
-
28
- Examples:
29
- 1. Query: "climate change effects"
30
- {"queries": ["impact of climate change", "consequences of global warming", "effects of environmental changes"]}
31
-
32
- 2. Query: "machine learning algorithms"
33
- {"queries": ["neural networks", "clustering techniques", "supervised learning methods", "deep learning models"]}
34
-
35
- 3. Query: "open source NLP frameworks"
36
- {"queries": ["natural language processing tools", "free nlp libraries", "open-source NLP platforms"]}
37
-
38
- Guidelines:
39
- - Generate queries that use different words and phrasings
40
- - Include synonyms and related terms
41
- - Maintain the same core meaning and intent
42
- - Make queries that are likely to retrieve relevant information the original might miss
43
- - Focus on variations that would work well with keyword-based search
44
- - Respond in the same language as the input query
45
-
46
- Your Task:
47
- Query: "{{ query }}"
48
-
49
- You *must* respond with a JSON object containing a "queries" array with the expanded queries.
50
- Example: {"queries": ["query1", "query2", "query3"]}"""
51
-
52
-
53
- @component
54
- class QueryExpander:
55
- """
56
- A component that returns a list of semantically similar queries to improve retrieval recall in RAG systems.
57
-
58
- The component uses a chat generator to expand queries. The chat generator is expected to return a JSON response
59
- with the following structure:
60
- ```json
61
- {"queries": ["expanded query 1", "expanded query 2", "expanded query 3"]}
62
- ```
63
-
64
- ### Usage example
65
-
66
- ```python
67
- from haystack.components.generators.chat.openai import OpenAIChatGenerator
68
- from haystack_experimental.components.query import QueryExpander
69
-
70
- expander = QueryExpander(
71
- chat_generator=OpenAIChatGenerator(model="gpt-4.1-mini"),
72
- n_expansions=3
73
- )
74
-
75
- result = expander.run(query="green energy sources")
76
- print(result["queries"])
77
- # Output: ['alternative query 1', 'alternative query 2', 'alternative query 3', 'green energy sources']
78
- # Note: Up to 3 additional queries + 1 original query (if include_original_query=True)
79
-
80
- # To control total number of queries:
81
- expander = QueryExpander(n_expansions=2, include_original_query=True) # Up to 3 total
82
- # or
83
- expander = QueryExpander(n_expansions=3, include_original_query=False) # Exactly 3 total
84
- ```
85
- """
86
-
87
- def __init__(
88
- self,
89
- *,
90
- chat_generator: Optional[ChatGenerator] = None,
91
- prompt_template: Optional[str] = None,
92
- n_expansions: int = 4,
93
- include_original_query: bool = True,
94
- ) -> None:
95
- """
96
- Initialize the QueryExpander component.
97
-
98
- :param chat_generator: The chat generator component to use for query expansion.
99
- If None, a default OpenAIChatGenerator with gpt-4.1-mini model is used.
100
- :param prompt_template: Custom [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder)
101
- template for query expansion. The template should instruct the LLM to return a JSON response with the
102
- structure: `{"queries": ["query1", "query2", "query3"]}`. The template should include 'query' and
103
- 'n_expansions' variables.
104
- :param n_expansions: Number of alternative queries to generate (default: 4).
105
- :param include_original_query: Whether to include the original query in the output.
106
- """
107
- if n_expansions <= 0:
108
- raise ValueError("n_expansions must be positive")
109
-
110
- self.n_expansions = n_expansions
111
- self.include_original_query = include_original_query
112
-
113
- if chat_generator is None:
114
- self.chat_generator: ChatGenerator = OpenAIChatGenerator(
115
- model="gpt-4.1-mini",
116
- generation_kwargs={
117
- "temperature": 0.7,
118
- "response_format": {
119
- "type": "json_schema",
120
- "json_schema": {
121
- "name": "query_expansion",
122
- "schema": {
123
- "type": "object",
124
- "properties": {"queries": {"type": "array", "items": {"type": "string"}}},
125
- "required": ["queries"],
126
- "additionalProperties": False,
127
- },
128
- },
129
- },
130
- "seed": 42,
131
- },
132
- )
133
- else:
134
- self.chat_generator = chat_generator
135
-
136
- self._is_warmed_up = False
137
- self.prompt_template = prompt_template or DEFAULT_PROMPT_TEMPLATE
138
-
139
- # Check if required variables are present in the template
140
- if "query" not in self.prompt_template:
141
- logger.warning(
142
- "The prompt template does not contain the 'query' variable. This may cause issues during execution."
143
- )
144
- if "n_expansions" not in self.prompt_template:
145
- logger.warning(
146
- "The prompt template does not contain the 'n_expansions' variable. "
147
- "This may cause issues during execution."
148
- )
149
-
150
- self._prompt_builder = PromptBuilder(
151
- template=self.prompt_template,
152
- required_variables=["n_expansions", "query"],
153
- )
154
-
155
- def to_dict(self) -> dict[str, Any]:
156
- """
157
- Serializes the component to a dictionary.
158
-
159
- :return: Dictionary with serialized data.
160
- """
161
- return default_to_dict(
162
- self,
163
- chat_generator=component_to_dict(self.chat_generator, name="chat_generator"),
164
- prompt_template=self.prompt_template,
165
- n_expansions=self.n_expansions,
166
- include_original_query=self.include_original_query,
167
- )
168
-
169
- @classmethod
170
- def from_dict(cls, data: dict[str, Any]) -> "QueryExpander":
171
- """
172
- Deserializes the component from a dictionary.
173
-
174
- :param data: Dictionary with serialized data.
175
- :return: Deserialized component.
176
- """
177
- init_params = data.get("init_parameters", {})
178
-
179
- deserialize_chatgenerator_inplace(init_params, key="chat_generator")
180
-
181
- return default_from_dict(cls, data)
182
-
183
- @component.output_types(queries=list[str])
184
- def run(self, query: str, n_expansions: Optional[int] = None) -> dict[str, list[str]]:
185
- """
186
- Expand the input query into multiple semantically similar queries.
187
-
188
- The language of the original query is preserved in the expanded queries.
189
-
190
- :param query: The original query to expand.
191
- :param n_expansions: Number of additional queries to generate (not including the original).
192
- If None, uses the value from initialization. Can be 0 to generate no additional queries.
193
- :return: Dictionary with "queries" key containing the list of expanded queries.
194
- If include_original_query=True, the original query will be included in addition
195
- to the n_expansions alternative queries.
196
- :raises ValueError: If n_expansions is not positive (less than or equal to 0).
197
- """
198
-
199
- if not self._is_warmed_up:
200
- self.warm_up()
201
-
202
- response = {"queries": [query] if self.include_original_query else []}
203
-
204
- if not query.strip():
205
- logger.warning("Empty query provided to QueryExpander")
206
- return response
207
-
208
- expansion_count = n_expansions if n_expansions is not None else self.n_expansions
209
- if expansion_count <= 0:
210
- raise ValueError("n_expansions must be positive")
211
-
212
- try:
213
- prompt_result = self._prompt_builder.run(query=query.strip(), n_expansions=expansion_count)
214
- generator_result = self.chat_generator.run(messages=[ChatMessage.from_user(prompt_result["prompt"])])
215
-
216
- if not generator_result.get("replies") or len(generator_result["replies"]) == 0:
217
- logger.warning("ChatGenerator returned no replies for query: {query}", query=query)
218
- return response
219
-
220
- expanded_text = generator_result["replies"][0].text.strip()
221
- expanded_queries = self._parse_expanded_queries(expanded_text)
222
-
223
- # Limit the number of expanded queries to the requested amount
224
- if len(expanded_queries) > expansion_count:
225
- logger.warning(
226
- "Generated {generated_count} queries but only {requested_count} were requested. "
227
- "Truncating to the first {requested_count} queries. ",
228
- generated_count=len(expanded_queries),
229
- requested_count=expansion_count,
230
- )
231
- expanded_queries = expanded_queries[:expansion_count]
232
-
233
- # Add original query if requested and remove duplicates
234
- if self.include_original_query:
235
- expanded_queries_lower = [q.lower() for q in expanded_queries]
236
- if query.lower() not in expanded_queries_lower:
237
- expanded_queries.append(query)
238
-
239
- response["queries"] = expanded_queries
240
- return response
241
-
242
- except Exception as e:
243
- # Fallback: return original query to maintain pipeline functionality
244
- logger.error("Failed to expand query {query}: {error}", query=query, error=str(e))
245
- return response
246
-
247
- def warm_up(self):
248
- """
249
- Warm up the LLM provider component.
250
- """
251
- if not self._is_warmed_up:
252
- if hasattr(self.chat_generator, "warm_up"):
253
- self.chat_generator.warm_up()
254
- self._is_warmed_up = True
255
-
256
- @staticmethod
257
- def _parse_expanded_queries(generator_response: str) -> list[str]:
258
- """
259
- Parse the generator response to extract individual expanded queries.
260
-
261
- :param generator_response: The raw text response from the generator.
262
- :return: List of parsed expanded queries.
263
- """
264
- if not generator_response.strip():
265
- return []
266
-
267
- try:
268
- parsed = json.loads(generator_response)
269
- if not isinstance(parsed, dict) or "queries" not in parsed:
270
- logger.warning(
271
- "Generator response is not a JSON object containing a 'queries' array: {response}",
272
- response=generator_response[:100],
273
- )
274
- return []
275
-
276
- queries = []
277
- for item in parsed["queries"]:
278
- if isinstance(item, str) and item.strip():
279
- queries.append(item.strip())
280
- else:
281
- logger.warning(
282
- "Skipping non-string or empty query in response: {item}",
283
- item=item,
284
- )
285
-
286
- return queries
287
-
288
- except json.JSONDecodeError as e:
289
- logger.warning(
290
- "Failed to parse JSON response: {error}. Response: {response}",
291
- error=str(e),
292
- response=generator_response[:100],
293
- )
294
- return []
@@ -1,173 +0,0 @@
1
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- from concurrent.futures import ThreadPoolExecutor
6
- from typing import Any, Optional
7
-
8
- from haystack import Document, component, default_from_dict, default_to_dict
9
- from haystack.components.embedders.types.protocol import TextEmbedder
10
- from haystack.core.serialization import component_to_dict
11
- from haystack.utils.deserialization import deserialize_component_inplace
12
-
13
- from haystack_experimental.components.retrievers.types import EmbeddingRetriever
14
-
15
-
16
- @component
17
- class MultiQueryEmbeddingRetriever:
18
- """
19
- A component that retrieves documents using multiple queries in parallel with an embedding-based retriever.
20
-
21
- This component takes a list of text queries, converts them to embeddings using a query embedder,
22
- and then uses an embedding-based retriever to find relevant documents for each query in parallel.
23
- The results are combined and sorted by relevance score.
24
-
25
- ### Usage example
26
-
27
- ```python
28
- from haystack import Document
29
- from haystack.document_stores.in_memory import InMemoryDocumentStore
30
- from haystack.document_stores.types import DuplicatePolicy
31
- from haystack.components.embedders import SentenceTransformersTextEmbedder
32
- from haystack.components.embedders import SentenceTransformersDocumentEmbedder
33
- from haystack.components.retrievers import InMemoryEmbeddingRetriever
34
- from haystack.components.writers import DocumentWriter
35
- from haystack_experimental.components.retrievers import MultiQueryEmbeddingRetriever
36
-
37
- documents = [
38
- Document(content="Renewable energy is energy that is collected from renewable resources."),
39
- Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
40
- Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
41
- Document(content="Geothermal energy is heat that comes from the sub-surface of the earth."),
42
- Document(content="Biomass energy is produced from organic materials, such as plant and animal waste."),
43
- Document(content="Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources."),
44
- ]
45
-
46
- # Populate the document store
47
- doc_store = InMemoryDocumentStore()
48
- doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
49
- doc_embedder.warm_up()
50
- doc_writer = DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP)
51
- documents = doc_embedder.run(documents)["documents"]
52
- doc_writer.run(documents=documents)
53
-
54
- # Run the multi-query retriever
55
- in_memory_retriever = InMemoryEmbeddingRetriever(document_store=doc_store, top_k=1)
56
- query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
57
-
58
- multi_query_retriever = MultiQueryEmbeddingRetriever(
59
- retriever=in_memory_retriever,
60
- query_embedder=query_embedder,
61
- max_workers=3
62
- )
63
-
64
- queries = ["Geothermal energy", "natural gas", "turbines"]
65
- result = multi_query_retriever.run(queries=queries)
66
- for doc in result["documents"]:
67
- print(f"Content: {doc.content}, Score: {doc.score}")
68
- >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
69
- >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
70
- >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
71
- >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680995796
72
- >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.3091423972562246
73
- >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243668087
74
- ```
75
- """ # noqa E501
76
-
77
- def __init__(self, *, retriever: EmbeddingRetriever, query_embedder: TextEmbedder, max_workers: int = 3) -> None:
78
- """
79
- Initialize MultiQueryEmbeddingRetriever.
80
-
81
- :param retriever: The embedding-based retriever to use for document retrieval.
82
- :param query_embedder: The query embedder to convert text queries to embeddings.
83
- :param max_workers: Maximum number of worker threads for parallel processing.
84
- """
85
- self.retriever = retriever
86
- self.query_embedder = query_embedder
87
- self.max_workers = max_workers
88
- self._is_warmed_up = False
89
-
90
- def warm_up(self) -> None:
91
- """
92
- Warm up the query embedder and the retriever if any has a warm_up method.
93
- """
94
- if not self._is_warmed_up:
95
- if hasattr(self.query_embedder, "warm_up") and callable(getattr(self.query_embedder, "warm_up")):
96
- self.query_embedder.warm_up()
97
- if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
98
- self.retriever.warm_up()
99
- self._is_warmed_up = True
100
-
101
- @component.output_types(documents=list[Document])
102
- def run(self, queries: list[str], retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, list[Document]]:
103
- """
104
- Retrieve documents using multiple queries in parallel.
105
-
106
- :param queries: List of text queries to process.
107
- :param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
108
- :returns:
109
- A dictionary containing:
110
- - `documents`: List of retrieved documents sorted by relevance score.
111
- """
112
- docs: list[Document] = []
113
- seen_contents = set()
114
- retriever_kwargs = retriever_kwargs or {}
115
-
116
- if not self._is_warmed_up:
117
- self.warm_up()
118
-
119
- with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
120
- queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
121
- for result in queries_results:
122
- if not result:
123
- continue
124
- for doc in result:
125
- # deduplicate based on content
126
- if doc.content not in seen_contents:
127
- docs.append(doc)
128
- seen_contents.add(doc.content)
129
-
130
- docs.sort(key=lambda x: x.score or 0.0, reverse=True)
131
- return {"documents": docs}
132
-
133
- def _run_on_thread(self, query: str, retriever_kwargs: Optional[dict[str, Any]] = None) -> Optional[list[Document]]:
134
- """
135
- Process a single query on a separate thread.
136
-
137
- :param query: The text query to process.
138
- :returns:
139
- List of retrieved documents or None if no results.
140
- """
141
- embedding_result = self.query_embedder.run(text=query)
142
- query_embedding = embedding_result["embedding"]
143
- result = self.retriever.run(query_embedding=query_embedding, **(retriever_kwargs or {}))
144
- if result and "documents" in result:
145
- return result["documents"]
146
- return None
147
-
148
- def to_dict(self) -> dict[str, Any]:
149
- """
150
- Serializes the component to a dictionary.
151
-
152
- :returns:
153
- A dictionary representing the serialized component.
154
- """
155
- return default_to_dict(
156
- self,
157
- retriever=component_to_dict(obj=self.retriever, name="retriever"),
158
- query_embedder=component_to_dict(obj=self.query_embedder, name="query_embedder"),
159
- max_workers=self.max_workers,
160
- )
161
-
162
- @classmethod
163
- def from_dict(cls, data: dict[str, Any]) -> "MultiQueryEmbeddingRetriever":
164
- """
165
- Deserializes the component from a dictionary.
166
-
167
- :param data: The dictionary to deserialize from.
168
- :returns:
169
- The deserialized component.
170
- """
171
- deserialize_component_inplace(data["init_parameters"], key="retriever")
172
- deserialize_component_inplace(data["init_parameters"], key="query_embedder")
173
- return default_from_dict(cls, data)