haystack-experimental 0.14.3__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_experimental/chat_message_stores/__init__.py +1 -1
- haystack_experimental/chat_message_stores/in_memory.py +176 -31
- haystack_experimental/chat_message_stores/types.py +33 -21
- haystack_experimental/components/agents/agent.py +147 -44
- haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
- haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
- haystack_experimental/components/embedders/types/protocol.py +2 -2
- haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
- haystack_experimental/components/retrievers/__init__.py +1 -3
- haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
- haystack_experimental/components/writers/__init__.py +1 -1
- haystack_experimental/components/writers/chat_message_writer.py +25 -22
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/METADATA +24 -31
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/RECORD +17 -24
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/WHEEL +1 -1
- haystack_experimental/components/query/__init__.py +0 -18
- haystack_experimental/components/query/query_expander.py +0 -294
- haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -173
- haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -150
- haystack_experimental/super_components/__init__.py +0 -3
- haystack_experimental/super_components/indexers/__init__.py +0 -11
- haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE +0 -0
- {haystack_experimental-0.14.3.dist-info → haystack_experimental-0.15.1.dist-info}/licenses/LICENSE-MIT.txt +0 -0
|
@@ -1,55 +1,48 @@
|
|
|
1
1
|
haystack_experimental/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
2
|
-
haystack_experimental/chat_message_stores/__init__.py,sha256=
|
|
3
|
-
haystack_experimental/chat_message_stores/in_memory.py,sha256=
|
|
4
|
-
haystack_experimental/chat_message_stores/types.py,sha256=
|
|
2
|
+
haystack_experimental/chat_message_stores/__init__.py,sha256=zu1bbMQDv9xUbGadIKWrC8v-87w_Xxg6KQnTb6K0k-Q,240
|
|
3
|
+
haystack_experimental/chat_message_stores/in_memory.py,sha256=i4ZK5W0Q0rDpgoRCYdIjPoJV8UJBr1PlUBH4ul5Adxk,8688
|
|
4
|
+
haystack_experimental/chat_message_stores/types.py,sha256=mXz6QTCyNomSBzr1eU76oHVrxQEwPDuiTIGTWvKwYJM,2739
|
|
5
5
|
haystack_experimental/components/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
6
6
|
haystack_experimental/components/agents/__init__.py,sha256=Sxu9LxPpQ5cljgoTgUeNC0GY8CwUdiSy1JWkd_-RRJ4,414
|
|
7
|
-
haystack_experimental/components/agents/agent.py,sha256=
|
|
7
|
+
haystack_experimental/components/agents/agent.py,sha256=ZIrzAQygva8zFhEl1Tu7WRJCz0W_MersPjtesqx8HQE,42128
|
|
8
8
|
haystack_experimental/components/agents/human_in_the_loop/__init__.py,sha256=xLr1G9pNWMmCpKN9mbv6yqeFfwMcbZyaVfCkzlwMxhY,1674
|
|
9
9
|
haystack_experimental/components/agents/human_in_the_loop/breakpoint.py,sha256=GhNdGdFNDnwSiTukD4WVp6-1YgGjq5oqCEcGMC2dcog,2902
|
|
10
10
|
haystack_experimental/components/agents/human_in_the_loop/dataclasses.py,sha256=OakB0PXBSG0LbQixcuo-d7IC-A3_k6qi80pB8hwY23o,2563
|
|
11
11
|
haystack_experimental/components/agents/human_in_the_loop/errors.py,sha256=HAjD_MCOTBirqnJdxpc2MhqIm-XnU3Soev29wRBWoMw,1066
|
|
12
12
|
haystack_experimental/components/agents/human_in_the_loop/policies.py,sha256=nzblePptT4Fg2GFHa4_SDIK_d7hZ_70qPhkteZBRXWk,3172
|
|
13
|
-
haystack_experimental/components/agents/human_in_the_loop/strategies.py,sha256=
|
|
14
|
-
haystack_experimental/components/agents/human_in_the_loop/types.py,sha256=
|
|
13
|
+
haystack_experimental/components/agents/human_in_the_loop/strategies.py,sha256=kX_3T6DWh4l3_-baOtJPwR0rZi4ZYWUibRA1Myeikp8,28645
|
|
14
|
+
haystack_experimental/components/agents/human_in_the_loop/types.py,sha256=zdKGZ0vgaq-e0akYV_aGvCBpXplRiorpWcBmEbOQvT8,4604
|
|
15
15
|
haystack_experimental/components/agents/human_in_the_loop/user_interfaces.py,sha256=HlJ3-CYNrQGsHOtpvrQE4ayQls8Q3EkLFUkOoRnLVC4,8707
|
|
16
16
|
haystack_experimental/components/embedders/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
17
17
|
haystack_experimental/components/embedders/types/__init__.py,sha256=HGR8aavwIEx7v-8nm5JxFIw47EWn7vAUmywhakTNDCo,182
|
|
18
|
-
haystack_experimental/components/embedders/types/protocol.py,sha256=
|
|
18
|
+
haystack_experimental/components/embedders/types/protocol.py,sha256=nVMo2x_sFP9T_DN-q-_HKGrLRd3rj27m7ZLxtigY4UQ,1026
|
|
19
19
|
haystack_experimental/components/generators/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
20
20
|
haystack_experimental/components/generators/chat/__init__.py,sha256=LEKI1mMtltVbSiU40QgBfnWC-z3_660TWuV-cVHhdTw,465
|
|
21
21
|
haystack_experimental/components/generators/chat/openai.py,sha256=gX6UI4yfY0pzKhWErquvPF_gV-3Ut0y6wSJytAD07Jk,9855
|
|
22
22
|
haystack_experimental/components/preprocessors/__init__.py,sha256=qZPFKpRxdw_VZ8fZ4T3GIKOObsbeOf_pKnZbLHR9AFU,653
|
|
23
|
-
haystack_experimental/components/preprocessors/embedding_based_document_splitter.py,sha256=
|
|
23
|
+
haystack_experimental/components/preprocessors/embedding_based_document_splitter.py,sha256=NLi9e-aVJkZEvwQVzeWduyvR74wlYRHe6ZviDBx2rTk,17604
|
|
24
24
|
haystack_experimental/components/preprocessors/md_header_level_inferrer.py,sha256=vyJWAFN-uhBkb5nCuJm0p29H75gGeaomOlHolD-fj5Q,5604
|
|
25
|
-
haystack_experimental/components/
|
|
26
|
-
haystack_experimental/components/
|
|
27
|
-
haystack_experimental/components/retrievers/__init__.py,sha256=CqPvqyvGp5L3Y1gTVQC8DD_xHzbIfTzGlj3oCsZM3J8,528
|
|
28
|
-
haystack_experimental/components/retrievers/chat_message_retriever.py,sha256=CaAgW1qzzhMYyKNOyk-eIBgSsO7Bg7uDqAtgcorCE60,4030
|
|
29
|
-
haystack_experimental/components/retrievers/multi_query_embedding_retriever.py,sha256=CQAWAGSBeMrLg1PxEPQNCHplYnOgvpXA0f-zbJtugKw,8101
|
|
30
|
-
haystack_experimental/components/retrievers/multi_query_text_retriever.py,sha256=CfaqzBHHgnh79eSGU4Iada_gcnYwavj8_Ou6Bc4ndiA,6511
|
|
25
|
+
haystack_experimental/components/retrievers/__init__.py,sha256=7NLOg-A7LmwxskDYebB_bDzawByCb7cXn67hVN_3e6I,245
|
|
26
|
+
haystack_experimental/components/retrievers/chat_message_retriever.py,sha256=O94cNAbiTQ1Jkwsk6qjt9ty4JJwd_3uBh1odJjcvG2I,6046
|
|
31
27
|
haystack_experimental/components/retrievers/types/__init__.py,sha256=iOngs3gs5enY8y6AWGeyQANTB_9qpXQ0QHSFFDDeEGc,218
|
|
32
28
|
haystack_experimental/components/retrievers/types/protocol.py,sha256=oUdX_P_pTszzamrkUz3YZsXL3bb4mAYIXsPCtKDH1tw,2375
|
|
33
29
|
haystack_experimental/components/summarizers/__init__.py,sha256=BqnfB0ZMb9ufYUjJ4qmmmRLPXa9FT8XKhMWW8G9Zg9Y,221
|
|
34
30
|
haystack_experimental/components/summarizers/llm_summarizer.py,sha256=Rzl3DKWENBKoAiHvgYPsc4ev0WHZGJZj4PBF-FDHiXI,14392
|
|
35
|
-
haystack_experimental/components/writers/__init__.py,sha256=
|
|
36
|
-
haystack_experimental/components/writers/chat_message_writer.py,sha256=
|
|
31
|
+
haystack_experimental/components/writers/__init__.py,sha256=DNVIwIEUi6HKsGM5UcIUPjVH7P3I8Hzc8e4PO7tjoPM,235
|
|
32
|
+
haystack_experimental/components/writers/chat_message_writer.py,sha256=Mkv9nShsPFAw1PPC6cK-tyYmXjWydCOl62boPNr7KkU,4042
|
|
37
33
|
haystack_experimental/core/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
38
34
|
haystack_experimental/core/pipeline/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
39
35
|
haystack_experimental/core/pipeline/breakpoint.py,sha256=x6EW1lAv4em1z90Ezr0oKNOZGiR8jorzZBI4MOU6qKg,5239
|
|
40
36
|
haystack_experimental/dataclasses/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
41
37
|
haystack_experimental/dataclasses/breakpoints.py,sha256=f0kxYXJRHzk6jAW5Na51MZfUuRIlulhN4oTrGWTpSFE,2095
|
|
42
|
-
haystack_experimental/super_components/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
43
|
-
haystack_experimental/super_components/indexers/__init__.py,sha256=4VPKnuzVb89Zb4PT6ejYT4s0zJ4I3rwFtcLwsCdQKJA,313
|
|
44
|
-
haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py,sha256=hfXznLVTgO39xO4GRYgi2Xy-pl4EFKtt13JrGncjvXQ,8519
|
|
45
38
|
haystack_experimental/utils/__init__.py,sha256=eHD7xrty2PCky_gG3ty19rpM4WfV32TyytM7gJODwl4,110
|
|
46
39
|
haystack_experimental/utils/hallucination_risk_calculator/__init__.py,sha256=kCd-qceud_T8P1XJHgRMaOnljyDjfFQ5UIdxEb5t6V0,219
|
|
47
40
|
haystack_experimental/utils/hallucination_risk_calculator/core_math.py,sha256=8XIa2gX1B7U400KutPgxfIUHrOggkBPAm9gIkwhF7UM,4079
|
|
48
41
|
haystack_experimental/utils/hallucination_risk_calculator/dataclasses.py,sha256=3vk9jsbW-7C9n408Qe730qgdXxIOzsTigf4TMLpryvI,2318
|
|
49
42
|
haystack_experimental/utils/hallucination_risk_calculator/openai_planner.py,sha256=-yVQsGzM5rXsAVwolE6sp5W6q1yDw66SiIUuUbPk1ng,11413
|
|
50
43
|
haystack_experimental/utils/hallucination_risk_calculator/skeletonization.py,sha256=qNdBUoFiBjQsI3ovrhd4RyTFmIbv51Goai1Z_l9lG28,5488
|
|
51
|
-
haystack_experimental-0.
|
|
52
|
-
haystack_experimental-0.
|
|
53
|
-
haystack_experimental-0.
|
|
54
|
-
haystack_experimental-0.
|
|
55
|
-
haystack_experimental-0.
|
|
44
|
+
haystack_experimental-0.15.1.dist-info/METADATA,sha256=bjMAG3cFa5-gq7l8sVKD58gITsPKgvv0wJjb_Mq_TUM,17581
|
|
45
|
+
haystack_experimental-0.15.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
46
|
+
haystack_experimental-0.15.1.dist-info/licenses/LICENSE,sha256=93_5nS97uHxptHvK9E8BZgKxLGeIS-rBWT2swIv-X5Y,11368
|
|
47
|
+
haystack_experimental-0.15.1.dist-info/licenses/LICENSE-MIT.txt,sha256=knmLkIKj_6tTrTSVRg9Tq88Kww4UCPLt2I1RGXJv9sQ,1037
|
|
48
|
+
haystack_experimental-0.15.1.dist-info/RECORD,,
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import sys
|
|
6
|
-
from typing import TYPE_CHECKING
|
|
7
|
-
|
|
8
|
-
from lazy_imports import LazyImporter
|
|
9
|
-
|
|
10
|
-
_import_structure = {
|
|
11
|
-
"query_expander": ["QueryExpander"],
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from .query_expander import QueryExpander
|
|
16
|
-
|
|
17
|
-
else:
|
|
18
|
-
sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
|
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
from typing import Any, Optional
|
|
7
|
-
|
|
8
|
-
from haystack import default_from_dict, default_to_dict, logging
|
|
9
|
-
from haystack.components.builders.prompt_builder import PromptBuilder
|
|
10
|
-
from haystack.components.generators.chat.openai import OpenAIChatGenerator
|
|
11
|
-
from haystack.components.generators.chat.types import ChatGenerator
|
|
12
|
-
from haystack.core.component import component
|
|
13
|
-
from haystack.core.serialization import component_to_dict
|
|
14
|
-
from haystack.dataclasses.chat_message import ChatMessage
|
|
15
|
-
from haystack.utils.deserialization import deserialize_chatgenerator_inplace
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger(__name__)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
DEFAULT_PROMPT_TEMPLATE = """
|
|
21
|
-
You are part of an information system that processes user queries for retrieval.
|
|
22
|
-
You have to expand a given query into {{ n_expansions }} queries that are
|
|
23
|
-
semantically similar to improve retrieval recall.
|
|
24
|
-
|
|
25
|
-
Structure:
|
|
26
|
-
Follow the structure shown below in examples to generate expanded queries.
|
|
27
|
-
|
|
28
|
-
Examples:
|
|
29
|
-
1. Query: "climate change effects"
|
|
30
|
-
{"queries": ["impact of climate change", "consequences of global warming", "effects of environmental changes"]}
|
|
31
|
-
|
|
32
|
-
2. Query: "machine learning algorithms"
|
|
33
|
-
{"queries": ["neural networks", "clustering techniques", "supervised learning methods", "deep learning models"]}
|
|
34
|
-
|
|
35
|
-
3. Query: "open source NLP frameworks"
|
|
36
|
-
{"queries": ["natural language processing tools", "free nlp libraries", "open-source NLP platforms"]}
|
|
37
|
-
|
|
38
|
-
Guidelines:
|
|
39
|
-
- Generate queries that use different words and phrasings
|
|
40
|
-
- Include synonyms and related terms
|
|
41
|
-
- Maintain the same core meaning and intent
|
|
42
|
-
- Make queries that are likely to retrieve relevant information the original might miss
|
|
43
|
-
- Focus on variations that would work well with keyword-based search
|
|
44
|
-
- Respond in the same language as the input query
|
|
45
|
-
|
|
46
|
-
Your Task:
|
|
47
|
-
Query: "{{ query }}"
|
|
48
|
-
|
|
49
|
-
You *must* respond with a JSON object containing a "queries" array with the expanded queries.
|
|
50
|
-
Example: {"queries": ["query1", "query2", "query3"]}"""
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@component
|
|
54
|
-
class QueryExpander:
|
|
55
|
-
"""
|
|
56
|
-
A component that returns a list of semantically similar queries to improve retrieval recall in RAG systems.
|
|
57
|
-
|
|
58
|
-
The component uses a chat generator to expand queries. The chat generator is expected to return a JSON response
|
|
59
|
-
with the following structure:
|
|
60
|
-
```json
|
|
61
|
-
{"queries": ["expanded query 1", "expanded query 2", "expanded query 3"]}
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
### Usage example
|
|
65
|
-
|
|
66
|
-
```python
|
|
67
|
-
from haystack.components.generators.chat.openai import OpenAIChatGenerator
|
|
68
|
-
from haystack_experimental.components.query import QueryExpander
|
|
69
|
-
|
|
70
|
-
expander = QueryExpander(
|
|
71
|
-
chat_generator=OpenAIChatGenerator(model="gpt-4.1-mini"),
|
|
72
|
-
n_expansions=3
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
result = expander.run(query="green energy sources")
|
|
76
|
-
print(result["queries"])
|
|
77
|
-
# Output: ['alternative query 1', 'alternative query 2', 'alternative query 3', 'green energy sources']
|
|
78
|
-
# Note: Up to 3 additional queries + 1 original query (if include_original_query=True)
|
|
79
|
-
|
|
80
|
-
# To control total number of queries:
|
|
81
|
-
expander = QueryExpander(n_expansions=2, include_original_query=True) # Up to 3 total
|
|
82
|
-
# or
|
|
83
|
-
expander = QueryExpander(n_expansions=3, include_original_query=False) # Exactly 3 total
|
|
84
|
-
```
|
|
85
|
-
"""
|
|
86
|
-
|
|
87
|
-
def __init__(
|
|
88
|
-
self,
|
|
89
|
-
*,
|
|
90
|
-
chat_generator: Optional[ChatGenerator] = None,
|
|
91
|
-
prompt_template: Optional[str] = None,
|
|
92
|
-
n_expansions: int = 4,
|
|
93
|
-
include_original_query: bool = True,
|
|
94
|
-
) -> None:
|
|
95
|
-
"""
|
|
96
|
-
Initialize the QueryExpander component.
|
|
97
|
-
|
|
98
|
-
:param chat_generator: The chat generator component to use for query expansion.
|
|
99
|
-
If None, a default OpenAIChatGenerator with gpt-4.1-mini model is used.
|
|
100
|
-
:param prompt_template: Custom [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder)
|
|
101
|
-
template for query expansion. The template should instruct the LLM to return a JSON response with the
|
|
102
|
-
structure: `{"queries": ["query1", "query2", "query3"]}`. The template should include 'query' and
|
|
103
|
-
'n_expansions' variables.
|
|
104
|
-
:param n_expansions: Number of alternative queries to generate (default: 4).
|
|
105
|
-
:param include_original_query: Whether to include the original query in the output.
|
|
106
|
-
"""
|
|
107
|
-
if n_expansions <= 0:
|
|
108
|
-
raise ValueError("n_expansions must be positive")
|
|
109
|
-
|
|
110
|
-
self.n_expansions = n_expansions
|
|
111
|
-
self.include_original_query = include_original_query
|
|
112
|
-
|
|
113
|
-
if chat_generator is None:
|
|
114
|
-
self.chat_generator: ChatGenerator = OpenAIChatGenerator(
|
|
115
|
-
model="gpt-4.1-mini",
|
|
116
|
-
generation_kwargs={
|
|
117
|
-
"temperature": 0.7,
|
|
118
|
-
"response_format": {
|
|
119
|
-
"type": "json_schema",
|
|
120
|
-
"json_schema": {
|
|
121
|
-
"name": "query_expansion",
|
|
122
|
-
"schema": {
|
|
123
|
-
"type": "object",
|
|
124
|
-
"properties": {"queries": {"type": "array", "items": {"type": "string"}}},
|
|
125
|
-
"required": ["queries"],
|
|
126
|
-
"additionalProperties": False,
|
|
127
|
-
},
|
|
128
|
-
},
|
|
129
|
-
},
|
|
130
|
-
"seed": 42,
|
|
131
|
-
},
|
|
132
|
-
)
|
|
133
|
-
else:
|
|
134
|
-
self.chat_generator = chat_generator
|
|
135
|
-
|
|
136
|
-
self._is_warmed_up = False
|
|
137
|
-
self.prompt_template = prompt_template or DEFAULT_PROMPT_TEMPLATE
|
|
138
|
-
|
|
139
|
-
# Check if required variables are present in the template
|
|
140
|
-
if "query" not in self.prompt_template:
|
|
141
|
-
logger.warning(
|
|
142
|
-
"The prompt template does not contain the 'query' variable. This may cause issues during execution."
|
|
143
|
-
)
|
|
144
|
-
if "n_expansions" not in self.prompt_template:
|
|
145
|
-
logger.warning(
|
|
146
|
-
"The prompt template does not contain the 'n_expansions' variable. "
|
|
147
|
-
"This may cause issues during execution."
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
self._prompt_builder = PromptBuilder(
|
|
151
|
-
template=self.prompt_template,
|
|
152
|
-
required_variables=["n_expansions", "query"],
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
def to_dict(self) -> dict[str, Any]:
|
|
156
|
-
"""
|
|
157
|
-
Serializes the component to a dictionary.
|
|
158
|
-
|
|
159
|
-
:return: Dictionary with serialized data.
|
|
160
|
-
"""
|
|
161
|
-
return default_to_dict(
|
|
162
|
-
self,
|
|
163
|
-
chat_generator=component_to_dict(self.chat_generator, name="chat_generator"),
|
|
164
|
-
prompt_template=self.prompt_template,
|
|
165
|
-
n_expansions=self.n_expansions,
|
|
166
|
-
include_original_query=self.include_original_query,
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
@classmethod
|
|
170
|
-
def from_dict(cls, data: dict[str, Any]) -> "QueryExpander":
|
|
171
|
-
"""
|
|
172
|
-
Deserializes the component from a dictionary.
|
|
173
|
-
|
|
174
|
-
:param data: Dictionary with serialized data.
|
|
175
|
-
:return: Deserialized component.
|
|
176
|
-
"""
|
|
177
|
-
init_params = data.get("init_parameters", {})
|
|
178
|
-
|
|
179
|
-
deserialize_chatgenerator_inplace(init_params, key="chat_generator")
|
|
180
|
-
|
|
181
|
-
return default_from_dict(cls, data)
|
|
182
|
-
|
|
183
|
-
@component.output_types(queries=list[str])
|
|
184
|
-
def run(self, query: str, n_expansions: Optional[int] = None) -> dict[str, list[str]]:
|
|
185
|
-
"""
|
|
186
|
-
Expand the input query into multiple semantically similar queries.
|
|
187
|
-
|
|
188
|
-
The language of the original query is preserved in the expanded queries.
|
|
189
|
-
|
|
190
|
-
:param query: The original query to expand.
|
|
191
|
-
:param n_expansions: Number of additional queries to generate (not including the original).
|
|
192
|
-
If None, uses the value from initialization. Can be 0 to generate no additional queries.
|
|
193
|
-
:return: Dictionary with "queries" key containing the list of expanded queries.
|
|
194
|
-
If include_original_query=True, the original query will be included in addition
|
|
195
|
-
to the n_expansions alternative queries.
|
|
196
|
-
:raises ValueError: If n_expansions is not positive (less than or equal to 0).
|
|
197
|
-
"""
|
|
198
|
-
|
|
199
|
-
if not self._is_warmed_up:
|
|
200
|
-
self.warm_up()
|
|
201
|
-
|
|
202
|
-
response = {"queries": [query] if self.include_original_query else []}
|
|
203
|
-
|
|
204
|
-
if not query.strip():
|
|
205
|
-
logger.warning("Empty query provided to QueryExpander")
|
|
206
|
-
return response
|
|
207
|
-
|
|
208
|
-
expansion_count = n_expansions if n_expansions is not None else self.n_expansions
|
|
209
|
-
if expansion_count <= 0:
|
|
210
|
-
raise ValueError("n_expansions must be positive")
|
|
211
|
-
|
|
212
|
-
try:
|
|
213
|
-
prompt_result = self._prompt_builder.run(query=query.strip(), n_expansions=expansion_count)
|
|
214
|
-
generator_result = self.chat_generator.run(messages=[ChatMessage.from_user(prompt_result["prompt"])])
|
|
215
|
-
|
|
216
|
-
if not generator_result.get("replies") or len(generator_result["replies"]) == 0:
|
|
217
|
-
logger.warning("ChatGenerator returned no replies for query: {query}", query=query)
|
|
218
|
-
return response
|
|
219
|
-
|
|
220
|
-
expanded_text = generator_result["replies"][0].text.strip()
|
|
221
|
-
expanded_queries = self._parse_expanded_queries(expanded_text)
|
|
222
|
-
|
|
223
|
-
# Limit the number of expanded queries to the requested amount
|
|
224
|
-
if len(expanded_queries) > expansion_count:
|
|
225
|
-
logger.warning(
|
|
226
|
-
"Generated {generated_count} queries but only {requested_count} were requested. "
|
|
227
|
-
"Truncating to the first {requested_count} queries. ",
|
|
228
|
-
generated_count=len(expanded_queries),
|
|
229
|
-
requested_count=expansion_count,
|
|
230
|
-
)
|
|
231
|
-
expanded_queries = expanded_queries[:expansion_count]
|
|
232
|
-
|
|
233
|
-
# Add original query if requested and remove duplicates
|
|
234
|
-
if self.include_original_query:
|
|
235
|
-
expanded_queries_lower = [q.lower() for q in expanded_queries]
|
|
236
|
-
if query.lower() not in expanded_queries_lower:
|
|
237
|
-
expanded_queries.append(query)
|
|
238
|
-
|
|
239
|
-
response["queries"] = expanded_queries
|
|
240
|
-
return response
|
|
241
|
-
|
|
242
|
-
except Exception as e:
|
|
243
|
-
# Fallback: return original query to maintain pipeline functionality
|
|
244
|
-
logger.error("Failed to expand query {query}: {error}", query=query, error=str(e))
|
|
245
|
-
return response
|
|
246
|
-
|
|
247
|
-
def warm_up(self):
|
|
248
|
-
"""
|
|
249
|
-
Warm up the LLM provider component.
|
|
250
|
-
"""
|
|
251
|
-
if not self._is_warmed_up:
|
|
252
|
-
if hasattr(self.chat_generator, "warm_up"):
|
|
253
|
-
self.chat_generator.warm_up()
|
|
254
|
-
self._is_warmed_up = True
|
|
255
|
-
|
|
256
|
-
@staticmethod
|
|
257
|
-
def _parse_expanded_queries(generator_response: str) -> list[str]:
|
|
258
|
-
"""
|
|
259
|
-
Parse the generator response to extract individual expanded queries.
|
|
260
|
-
|
|
261
|
-
:param generator_response: The raw text response from the generator.
|
|
262
|
-
:return: List of parsed expanded queries.
|
|
263
|
-
"""
|
|
264
|
-
if not generator_response.strip():
|
|
265
|
-
return []
|
|
266
|
-
|
|
267
|
-
try:
|
|
268
|
-
parsed = json.loads(generator_response)
|
|
269
|
-
if not isinstance(parsed, dict) or "queries" not in parsed:
|
|
270
|
-
logger.warning(
|
|
271
|
-
"Generator response is not a JSON object containing a 'queries' array: {response}",
|
|
272
|
-
response=generator_response[:100],
|
|
273
|
-
)
|
|
274
|
-
return []
|
|
275
|
-
|
|
276
|
-
queries = []
|
|
277
|
-
for item in parsed["queries"]:
|
|
278
|
-
if isinstance(item, str) and item.strip():
|
|
279
|
-
queries.append(item.strip())
|
|
280
|
-
else:
|
|
281
|
-
logger.warning(
|
|
282
|
-
"Skipping non-string or empty query in response: {item}",
|
|
283
|
-
item=item,
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
return queries
|
|
287
|
-
|
|
288
|
-
except json.JSONDecodeError as e:
|
|
289
|
-
logger.warning(
|
|
290
|
-
"Failed to parse JSON response: {error}. Response: {response}",
|
|
291
|
-
error=str(e),
|
|
292
|
-
response=generator_response[:100],
|
|
293
|
-
)
|
|
294
|
-
return []
|
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
-
from typing import Any, Optional
|
|
7
|
-
|
|
8
|
-
from haystack import Document, component, default_from_dict, default_to_dict
|
|
9
|
-
from haystack.components.embedders.types.protocol import TextEmbedder
|
|
10
|
-
from haystack.core.serialization import component_to_dict
|
|
11
|
-
from haystack.utils.deserialization import deserialize_component_inplace
|
|
12
|
-
|
|
13
|
-
from haystack_experimental.components.retrievers.types import EmbeddingRetriever
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@component
|
|
17
|
-
class MultiQueryEmbeddingRetriever:
|
|
18
|
-
"""
|
|
19
|
-
A component that retrieves documents using multiple queries in parallel with an embedding-based retriever.
|
|
20
|
-
|
|
21
|
-
This component takes a list of text queries, converts them to embeddings using a query embedder,
|
|
22
|
-
and then uses an embedding-based retriever to find relevant documents for each query in parallel.
|
|
23
|
-
The results are combined and sorted by relevance score.
|
|
24
|
-
|
|
25
|
-
### Usage example
|
|
26
|
-
|
|
27
|
-
```python
|
|
28
|
-
from haystack import Document
|
|
29
|
-
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
|
30
|
-
from haystack.document_stores.types import DuplicatePolicy
|
|
31
|
-
from haystack.components.embedders import SentenceTransformersTextEmbedder
|
|
32
|
-
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
|
|
33
|
-
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
|
34
|
-
from haystack.components.writers import DocumentWriter
|
|
35
|
-
from haystack_experimental.components.retrievers import MultiQueryEmbeddingRetriever
|
|
36
|
-
|
|
37
|
-
documents = [
|
|
38
|
-
Document(content="Renewable energy is energy that is collected from renewable resources."),
|
|
39
|
-
Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
|
|
40
|
-
Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
|
|
41
|
-
Document(content="Geothermal energy is heat that comes from the sub-surface of the earth."),
|
|
42
|
-
Document(content="Biomass energy is produced from organic materials, such as plant and animal waste."),
|
|
43
|
-
Document(content="Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources."),
|
|
44
|
-
]
|
|
45
|
-
|
|
46
|
-
# Populate the document store
|
|
47
|
-
doc_store = InMemoryDocumentStore()
|
|
48
|
-
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
|
|
49
|
-
doc_embedder.warm_up()
|
|
50
|
-
doc_writer = DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP)
|
|
51
|
-
documents = doc_embedder.run(documents)["documents"]
|
|
52
|
-
doc_writer.run(documents=documents)
|
|
53
|
-
|
|
54
|
-
# Run the multi-query retriever
|
|
55
|
-
in_memory_retriever = InMemoryEmbeddingRetriever(document_store=doc_store, top_k=1)
|
|
56
|
-
query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
|
|
57
|
-
|
|
58
|
-
multi_query_retriever = MultiQueryEmbeddingRetriever(
|
|
59
|
-
retriever=in_memory_retriever,
|
|
60
|
-
query_embedder=query_embedder,
|
|
61
|
-
max_workers=3
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
queries = ["Geothermal energy", "natural gas", "turbines"]
|
|
65
|
-
result = multi_query_retriever.run(queries=queries)
|
|
66
|
-
for doc in result["documents"]:
|
|
67
|
-
print(f"Content: {doc.content}, Score: {doc.score}")
|
|
68
|
-
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
|
|
69
|
-
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
|
|
70
|
-
>> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
|
|
71
|
-
>> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680995796
|
|
72
|
-
>> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.3091423972562246
|
|
73
|
-
>> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243668087
|
|
74
|
-
```
|
|
75
|
-
""" # noqa E501
|
|
76
|
-
|
|
77
|
-
def __init__(self, *, retriever: EmbeddingRetriever, query_embedder: TextEmbedder, max_workers: int = 3) -> None:
|
|
78
|
-
"""
|
|
79
|
-
Initialize MultiQueryEmbeddingRetriever.
|
|
80
|
-
|
|
81
|
-
:param retriever: The embedding-based retriever to use for document retrieval.
|
|
82
|
-
:param query_embedder: The query embedder to convert text queries to embeddings.
|
|
83
|
-
:param max_workers: Maximum number of worker threads for parallel processing.
|
|
84
|
-
"""
|
|
85
|
-
self.retriever = retriever
|
|
86
|
-
self.query_embedder = query_embedder
|
|
87
|
-
self.max_workers = max_workers
|
|
88
|
-
self._is_warmed_up = False
|
|
89
|
-
|
|
90
|
-
def warm_up(self) -> None:
|
|
91
|
-
"""
|
|
92
|
-
Warm up the query embedder and the retriever if any has a warm_up method.
|
|
93
|
-
"""
|
|
94
|
-
if not self._is_warmed_up:
|
|
95
|
-
if hasattr(self.query_embedder, "warm_up") and callable(getattr(self.query_embedder, "warm_up")):
|
|
96
|
-
self.query_embedder.warm_up()
|
|
97
|
-
if hasattr(self.retriever, "warm_up") and callable(getattr(self.retriever, "warm_up")):
|
|
98
|
-
self.retriever.warm_up()
|
|
99
|
-
self._is_warmed_up = True
|
|
100
|
-
|
|
101
|
-
@component.output_types(documents=list[Document])
|
|
102
|
-
def run(self, queries: list[str], retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, list[Document]]:
|
|
103
|
-
"""
|
|
104
|
-
Retrieve documents using multiple queries in parallel.
|
|
105
|
-
|
|
106
|
-
:param queries: List of text queries to process.
|
|
107
|
-
:param retriever_kwargs: Optional dictionary of arguments to pass to the retriever's run method.
|
|
108
|
-
:returns:
|
|
109
|
-
A dictionary containing:
|
|
110
|
-
- `documents`: List of retrieved documents sorted by relevance score.
|
|
111
|
-
"""
|
|
112
|
-
docs: list[Document] = []
|
|
113
|
-
seen_contents = set()
|
|
114
|
-
retriever_kwargs = retriever_kwargs or {}
|
|
115
|
-
|
|
116
|
-
if not self._is_warmed_up:
|
|
117
|
-
self.warm_up()
|
|
118
|
-
|
|
119
|
-
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
|
120
|
-
queries_results = executor.map(lambda query: self._run_on_thread(query, retriever_kwargs), queries)
|
|
121
|
-
for result in queries_results:
|
|
122
|
-
if not result:
|
|
123
|
-
continue
|
|
124
|
-
for doc in result:
|
|
125
|
-
# deduplicate based on content
|
|
126
|
-
if doc.content not in seen_contents:
|
|
127
|
-
docs.append(doc)
|
|
128
|
-
seen_contents.add(doc.content)
|
|
129
|
-
|
|
130
|
-
docs.sort(key=lambda x: x.score or 0.0, reverse=True)
|
|
131
|
-
return {"documents": docs}
|
|
132
|
-
|
|
133
|
-
def _run_on_thread(self, query: str, retriever_kwargs: Optional[dict[str, Any]] = None) -> Optional[list[Document]]:
|
|
134
|
-
"""
|
|
135
|
-
Process a single query on a separate thread.
|
|
136
|
-
|
|
137
|
-
:param query: The text query to process.
|
|
138
|
-
:returns:
|
|
139
|
-
List of retrieved documents or None if no results.
|
|
140
|
-
"""
|
|
141
|
-
embedding_result = self.query_embedder.run(text=query)
|
|
142
|
-
query_embedding = embedding_result["embedding"]
|
|
143
|
-
result = self.retriever.run(query_embedding=query_embedding, **(retriever_kwargs or {}))
|
|
144
|
-
if result and "documents" in result:
|
|
145
|
-
return result["documents"]
|
|
146
|
-
return None
|
|
147
|
-
|
|
148
|
-
def to_dict(self) -> dict[str, Any]:
|
|
149
|
-
"""
|
|
150
|
-
Serializes the component to a dictionary.
|
|
151
|
-
|
|
152
|
-
:returns:
|
|
153
|
-
A dictionary representing the serialized component.
|
|
154
|
-
"""
|
|
155
|
-
return default_to_dict(
|
|
156
|
-
self,
|
|
157
|
-
retriever=component_to_dict(obj=self.retriever, name="retriever"),
|
|
158
|
-
query_embedder=component_to_dict(obj=self.query_embedder, name="query_embedder"),
|
|
159
|
-
max_workers=self.max_workers,
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
@classmethod
|
|
163
|
-
def from_dict(cls, data: dict[str, Any]) -> "MultiQueryEmbeddingRetriever":
|
|
164
|
-
"""
|
|
165
|
-
Deserializes the component from a dictionary.
|
|
166
|
-
|
|
167
|
-
:param data: The dictionary to deserialize from.
|
|
168
|
-
:returns:
|
|
169
|
-
The deserialized component.
|
|
170
|
-
"""
|
|
171
|
-
deserialize_component_inplace(data["init_parameters"], key="retriever")
|
|
172
|
-
deserialize_component_inplace(data["init_parameters"], key="query_embedder")
|
|
173
|
-
return default_from_dict(cls, data)
|