haystack-experimental 0.14.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. haystack_experimental/chat_message_stores/__init__.py +1 -1
  2. haystack_experimental/chat_message_stores/in_memory.py +176 -31
  3. haystack_experimental/chat_message_stores/types.py +33 -21
  4. haystack_experimental/components/agents/agent.py +184 -35
  5. haystack_experimental/components/agents/human_in_the_loop/strategies.py +220 -3
  6. haystack_experimental/components/agents/human_in_the_loop/types.py +36 -1
  7. haystack_experimental/components/embedders/types/protocol.py +2 -2
  8. haystack_experimental/components/preprocessors/__init__.py +2 -0
  9. haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +16 -16
  10. haystack_experimental/components/preprocessors/md_header_level_inferrer.py +2 -2
  11. haystack_experimental/components/retrievers/__init__.py +1 -3
  12. haystack_experimental/components/retrievers/chat_message_retriever.py +57 -26
  13. haystack_experimental/components/writers/__init__.py +1 -1
  14. haystack_experimental/components/writers/chat_message_writer.py +25 -22
  15. haystack_experimental/core/pipeline/breakpoint.py +5 -3
  16. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/METADATA +24 -31
  17. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/RECORD +20 -27
  18. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/WHEEL +1 -1
  19. haystack_experimental/components/query/__init__.py +0 -18
  20. haystack_experimental/components/query/query_expander.py +0 -299
  21. haystack_experimental/components/retrievers/multi_query_embedding_retriever.py +0 -180
  22. haystack_experimental/components/retrievers/multi_query_text_retriever.py +0 -158
  23. haystack_experimental/super_components/__init__.py +0 -3
  24. haystack_experimental/super_components/indexers/__init__.py +0 -11
  25. haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py +0 -199
  26. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/licenses/LICENSE +0 -0
  27. {haystack_experimental-0.14.2.dist-info → haystack_experimental-0.15.0.dist-info}/licenses/LICENSE-MIT.txt +0 -0
haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py DELETED
@@ -1,199 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from typing import Any, Dict, List, Literal, Optional
-
- from haystack import Pipeline, component, default_from_dict, default_to_dict
- from haystack.components.embedders import SentenceTransformersDocumentEmbedder
- from haystack.components.writers import DocumentWriter
- from haystack.core.super_component import SuperComponent
- from haystack.document_stores.types import DocumentStore, DuplicatePolicy
- from haystack.utils import (
-     ComponentDevice,
-     Secret,
-     deserialize_document_store_in_init_params_inplace,
-     deserialize_secrets_inplace,
- )
- from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
-
-
- @component
- class SentenceTransformersDocumentIndexer(SuperComponent):
-     """
-     A document indexer that takes a list of documents, embeds them using SentenceTransformers, and stores them.
-
-     Usage:
-
-     ```python
-     >>> from haystack import Document
-     >>> from haystack.document_stores.in_memory import InMemoryDocumentStore
-     >>> document_store = InMemoryDocumentStore()
-     >>> doc = Document(content="I love pizza!")
-     >>> indexer = SentenceTransformersDocumentIndexer(document_store=document_store)
-     >>> indexer.warm_up()
-     >>> result = indexer.run(documents=[doc])
-     >>> print(result)
-     {'documents_written': 1}
-     >>> document_store.count_documents()
-     1
-     ```
-     """
-
-     def __init__(  # pylint: disable=R0917
-         self,
-         document_store: DocumentStore,
-         model: str = "sentence-transformers/all-mpnet-base-v2",
-         device: Optional[ComponentDevice] = None,
-         token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-         prefix: str = "",
-         suffix: str = "",
-         batch_size: int = 32,
-         progress_bar: bool = True,
-         normalize_embeddings: bool = False,
-         meta_fields_to_embed: Optional[List[str]] = None,
-         embedding_separator: str = "\n",
-         trust_remote_code: bool = False,
-         truncate_dim: Optional[int] = None,
-         model_kwargs: Optional[Dict[str, Any]] = None,
-         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
-         config_kwargs: Optional[Dict[str, Any]] = None,
-         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
-         duplicate_policy: DuplicatePolicy = DuplicatePolicy.OVERWRITE,
-     ) -> None:
-         """
-         Initialize the SentenceTransformersDocumentIndexer component.
-
-         :param document_store: The document store where the documents should be stored.
-         :param model: The embedding model to use (local path or Hugging Face model ID).
-         :param device: The device to use for loading the model.
-         :param token: The API token to download private models from Hugging Face.
-         :param prefix: String to add at the beginning of each document text.
-         :param suffix: String to add at the end of each document text.
-         :param batch_size: Number of documents to embed at once.
-         :param progress_bar: If True, shows a progress bar when embedding documents.
-         :param normalize_embeddings: If True, embeddings are L2 normalized.
-         :param meta_fields_to_embed: List of metadata fields to embed along with the document text.
-         :param embedding_separator: Separator used to concatenate metadata fields to document text.
-         :param trust_remote_code: If True, allows custom models and scripts.
-         :param truncate_dim: Dimension to truncate sentence embeddings to.
-         :param model_kwargs: Additional keyword arguments for model initialization.
-         :param tokenizer_kwargs: Additional keyword arguments for tokenizer initialization.
-         :param config_kwargs: Additional keyword arguments for model configuration.
-         :param precision: The precision to use for the embeddings.
-         :param duplicate_policy: The duplicate policy to use when writing documents.
-         """
-         self.document_store = document_store
-         self.model = model
-         self.device = device
-         self.token = token
-         self.prefix = prefix
-         self.suffix = suffix
-         self.batch_size = batch_size
-         self.progress_bar = progress_bar
-         self.normalize_embeddings = normalize_embeddings
-         self.meta_fields_to_embed = meta_fields_to_embed
-         self.embedding_separator = embedding_separator
-         self.trust_remote_code = trust_remote_code
-         self.truncate_dim = truncate_dim
-         self.model_kwargs = model_kwargs
-         self.tokenizer_kwargs = tokenizer_kwargs
-         self.config_kwargs = config_kwargs
-         self.precision = precision
-         self.duplicate_policy = duplicate_policy
-
-         pipeline = Pipeline()
-
-         pipeline.add_component(
-             "embedder",
-             SentenceTransformersDocumentEmbedder(
-                 model=self.model,
-                 device=self.device,
-                 token=self.token,
-                 prefix=self.prefix,
-                 suffix=self.suffix,
-                 batch_size=self.batch_size,
-                 progress_bar=self.progress_bar,
-                 normalize_embeddings=self.normalize_embeddings,
-                 meta_fields_to_embed=self.meta_fields_to_embed,
-                 embedding_separator=self.embedding_separator,
-                 trust_remote_code=self.trust_remote_code,
-                 truncate_dim=self.truncate_dim,
-                 model_kwargs=self.model_kwargs,
-                 tokenizer_kwargs=self.tokenizer_kwargs,
-                 config_kwargs=self.config_kwargs,
-                 precision=self.precision,
-             ),
-         )
-         pipeline.add_component(
-             "writer",
-             DocumentWriter(
-                 document_store=self.document_store,
-                 policy=self.duplicate_policy,
-             ),
-         )
-
-         pipeline.connect("embedder.documents", "writer.documents")
-
-         super(SentenceTransformersDocumentIndexer, self).__init__(
-             pipeline=pipeline,
-             input_mapping={"documents": ["embedder.documents"]},
-             output_mapping={"writer.documents_written": "documents_written"},
-         )
-
-     def to_dict(self) -> Dict[str, Any]:
-         """
-         Serialize this instance to a dictionary.
-         """
-         serialization_dict = default_to_dict(
-             self,
-             document_store=self.document_store.to_dict(),
-             model=self.model,
-             device=self.device.to_dict() if self.device else None,
-             token=self.token.to_dict() if self.token else None,
-             prefix=self.prefix,
-             suffix=self.suffix,
-             batch_size=self.batch_size,
-             progress_bar=self.progress_bar,
-             normalize_embeddings=self.normalize_embeddings,
-             meta_fields_to_embed=self.meta_fields_to_embed,
-             embedding_separator=self.embedding_separator,
-             trust_remote_code=self.trust_remote_code,
-             truncate_dim=self.truncate_dim,
-             model_kwargs=self.model_kwargs,
-             tokenizer_kwargs=self.tokenizer_kwargs,
-             config_kwargs=self.config_kwargs,
-             precision=self.precision,
-             duplicate_policy=self.duplicate_policy.value,
-         )
-
-         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
-             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
-
-         return serialization_dict
-
-     @classmethod
-     def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentIndexer":
-         """
-         Load an instance of this component from a dictionary.
-         """
-         deserialize_document_store_in_init_params_inplace(data)
-         init_params = data.get("init_parameters", {})
-
-         # Handle device deserialization
-         if init_params.get("device") is not None:
-             init_params["device"] = ComponentDevice.from_dict(init_params["device"])
-
-         # Handle secrets deserialization
-         deserialize_secrets_inplace(init_params, keys=["token"])
-
-         # Handle model kwargs deserialization
-         if init_params.get("model_kwargs") is not None:
-             deserialize_hf_model_kwargs(init_params["model_kwargs"])
-
-         # Handle duplicate policy deserialization
-         if policy_value := init_params.get("duplicate_policy"):
-             init_params["duplicate_policy"] = DuplicatePolicy(policy_value)
-
-         data["init_parameters"] = init_params
-         return default_from_dict(cls, data)
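
The deleted code shows that `SentenceTransformersDocumentIndexer` was a thin `SuperComponent` wrapper around a two-step pipeline (embed, then write), with `input_mapping`/`output_mapping` exposing it as a single component. For anyone migrating off it, here is a minimal sketch of the equivalent setup built directly from the underlying Haystack components visible in the diff above — a sketch assuming `haystack-ai` and `sentence-transformers` are installed, not a drop-in replacement shipped by this package:

```python
# Minimal sketch: replicate the removed indexer with the two components it
# wrapped (SentenceTransformersDocumentEmbedder -> DocumentWriter).
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

document_store = InMemoryDocumentStore()

pipeline = Pipeline()
# Same model and duplicate policy the removed component used as defaults.
pipeline.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2"),
)
pipeline.add_component(
    "writer",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)
pipeline.connect("embedder.documents", "writer.documents")

result = pipeline.run({"embedder": {"documents": [Document(content="I love pizza!")]}})
print(result["writer"]["documents_written"])  # 1
```

Unlike the removed component's `indexer.run(documents=[doc])`, an explicit pipeline takes per-component inputs and returns per-component outputs; the `input_mapping={"documents": ["embedder.documents"]}` and `output_mapping={"writer.documents_written": "documents_written"}` in the deleted code performed exactly that translation.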