haystack-experimental 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. haystack_experimental/chat_message_stores/in_memory.py +3 -3
  2. haystack_experimental/chat_message_stores/types.py +2 -2
  3. haystack_experimental/components/agents/agent.py +174 -119
  4. haystack_experimental/components/agents/human_in_the_loop/breakpoint.py +3 -1
  5. haystack_experimental/components/agents/human_in_the_loop/dataclasses.py +6 -6
  6. haystack_experimental/components/agents/human_in_the_loop/errors.py +1 -5
  7. haystack_experimental/components/agents/human_in_the_loop/strategies.py +10 -10
  8. haystack_experimental/components/agents/human_in_the_loop/types.py +5 -5
  9. haystack_experimental/components/agents/human_in_the_loop/user_interfaces.py +2 -2
  10. haystack_experimental/components/generators/chat/openai.py +11 -11
  11. haystack_experimental/components/preprocessors/__init__.py +1 -3
  12. haystack_experimental/components/retrievers/chat_message_retriever.py +4 -4
  13. haystack_experimental/components/retrievers/types/protocol.py +3 -3
  14. haystack_experimental/components/summarizers/llm_summarizer.py +7 -7
  15. haystack_experimental/core/pipeline/breakpoint.py +6 -6
  16. haystack_experimental/dataclasses/breakpoints.py +2 -2
  17. haystack_experimental/utils/hallucination_risk_calculator/dataclasses.py +9 -9
  18. haystack_experimental/utils/hallucination_risk_calculator/openai_planner.py +4 -4
  19. haystack_experimental/utils/hallucination_risk_calculator/skeletonization.py +5 -5
  20. {haystack_experimental-0.15.1.dist-info → haystack_experimental-0.16.0.dist-info}/METADATA +6 -10
  21. {haystack_experimental-0.15.1.dist-info → haystack_experimental-0.16.0.dist-info}/RECORD +24 -25
  22. haystack_experimental/components/preprocessors/embedding_based_document_splitter.py +0 -430
  23. {haystack_experimental-0.15.1.dist-info → haystack_experimental-0.16.0.dist-info}/WHEEL +0 -0
  24. {haystack_experimental-0.15.1.dist-info → haystack_experimental-0.16.0.dist-info}/licenses/LICENSE +0 -0
  25. {haystack_experimental-0.15.1.dist-info → haystack_experimental-0.16.0.dist-info}/licenses/LICENSE-MIT.txt +0 -0
haystack_experimental/components/preprocessors/embedding_based_document_splitter.py DELETED
@@ -1,430 +0,0 @@
- # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
- #
- # SPDX-License-Identifier: Apache-2.0
-
- from copy import deepcopy
- from typing import Any, Optional
-
- import numpy as np
- from haystack import Document, component, logging
- from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter
- from haystack.core.serialization import component_to_dict, default_from_dict, default_to_dict
- from haystack.utils.deserialization import deserialize_component_inplace
-
- from haystack_experimental.components.embedders.types import DocumentEmbedder
-
- logger = logging.getLogger(__name__)
-
-
- @component
- class EmbeddingBasedDocumentSplitter:
-     """
-     Splits documents based on embedding similarity using cosine distances between sequential sentence groups.
-
-     This component first splits text into sentences, optionally groups them, calculates embeddings for each group,
-     and then uses cosine distance between sequential embeddings to determine split points. Any distance above
-     the specified percentile is treated as a break point. The component also tracks page numbers based on form feed
-     characters (`\f`) in the original document.
-
-     This component is inspired by [5 Levels of Text Splitting](
-     https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
-     ) by Greg Kamradt.
-
-     ### Usage example
-
-     ```python
-     from haystack import Document
-     from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-     from haystack_experimental.components.preprocessors import EmbeddingBasedDocumentSplitter
-
-     # Create a document with content that has a clear topic shift
-     doc = Document(
-         content="This is a first sentence. This is a second sentence. This is a third sentence. "
-         "Completely different topic. The same completely different topic."
-     )
-
-     # Initialize the embedder to calculate semantic similarities
-     embedder = SentenceTransformersDocumentEmbedder()
-
-     # Configure the splitter with parameters that control splitting behavior
-     splitter = EmbeddingBasedDocumentSplitter(
-         document_embedder=embedder,
-         sentences_per_group=2,  # Group 2 sentences before calculating embeddings
-         percentile=0.95,  # Split when cosine distance exceeds the 95th percentile
-         min_length=50,  # Merge splits shorter than 50 characters
-         max_length=1000,  # Further split chunks longer than 1000 characters
-     )
-     splitter.warm_up()
-     result = splitter.run(documents=[doc])
-
-     # The result contains a list of Document objects, each representing a semantic chunk
-     # Each split document includes metadata: source_id, split_id, and page_number
-     print(f"Original document split into {len(result['documents'])} chunks")
-     for i, split_doc in enumerate(result['documents']):
-         print(f"Chunk {i}: {split_doc.content[:50]}...")
-     ```
-     """
-
68
- def __init__(
69
- self,
70
- *,
71
- document_embedder: DocumentEmbedder,
72
- sentences_per_group: int = 3,
73
- percentile: float = 0.95,
74
- min_length: int = 50,
75
- max_length: int = 1000,
76
- language: Language = "en",
77
- use_split_rules: bool = True,
78
- extend_abbreviations: bool = True,
79
- ):
80
- """
81
- Initialize EmbeddingBasedDocumentSplitter.
82
-
83
- :param document_embedder: The DocumentEmbedder to use for calculating embeddings.
84
- :param sentences_per_group: Number of sentences to group together before embedding.
85
- :param percentile: Percentile threshold for cosine distance. Distances above this percentile
86
- are treated as break points.
87
- :param min_length: Minimum length of splits in characters. Splits below this length will be merged.
88
- :param max_length: Maximum length of splits in characters. Splits above this length will be recursively split.
89
- :param language: Language for sentence tokenization.
90
- :param use_split_rules: Whether to use additional split rules for sentence tokenization. Applies additional
91
- split rules from SentenceSplitter to the sentence spans.
92
- :param extend_abbreviations: If True, the abbreviations used by NLTK's PunktTokenizer are extended by a list
93
- of curated abbreviations. Currently supported languages are: en, de.
94
- If False, the default abbreviations are used.
95
- """
96
- self.document_embedder = document_embedder
97
- self.sentences_per_group = sentences_per_group
98
- self.percentile = percentile
99
- self.min_length = min_length
100
- self.max_length = max_length
101
- self.language = language
102
- self.use_split_rules = use_split_rules
103
- self.extend_abbreviations = extend_abbreviations
104
-
105
- self._init_validation()
106
- self.sentence_splitter: Optional[SentenceSplitter] = None
107
- self._is_warmed_up = False
108
-
109
- def _init_validation(self) -> None:
110
- """
111
- Validates initialization parameters.
112
- """
113
- if self.sentences_per_group <= 0:
114
- raise ValueError("sentences_per_group must be greater than 0.")
115
-
116
- if not 0.0 <= self.percentile <= 1.0:
117
- raise ValueError("percentile must be between 0.0 and 1.0.")
118
-
119
- if self.min_length < 0:
120
- raise ValueError("min_length must be greater than or equal to 0.")
121
-
122
- if self.max_length <= self.min_length:
123
- raise ValueError("max_length must be greater than min_length.")
124
-
125
- def warm_up(self) -> None:
126
- """
127
- Warm up the component by initializing the sentence splitter.
128
- """
129
- self.sentence_splitter = SentenceSplitter(
130
- language=self.language,
131
- use_split_rules=self.use_split_rules,
132
- extend_abbreviations=self.extend_abbreviations,
133
- keep_white_spaces=True,
134
- )
135
- if hasattr(self.document_embedder, "warm_up"):
136
- self.document_embedder.warm_up()
137
- self._is_warmed_up = True
138
-
-     @component.output_types(documents=list[Document])
-     def run(self, documents: list[Document]) -> dict[str, list[Document]]:
-         """
-         Split documents based on embedding similarity.
-
-         :param documents: The documents to split.
-         :returns: A dictionary with the following key:
-             - `documents`: List of documents with the split texts. Each document includes:
-                 - A metadata field `source_id` to track the original document.
-                 - A metadata field `split_id` to track the split number.
-                 - A metadata field `page_number` to track the original page number.
-                 - All other metadata copied from the original document.
-
-         :raises:
-             - `RuntimeError`: If the component wasn't warmed up.
-             - `TypeError`: If the input is not a list of Documents.
-             - `ValueError`: If the document content is None or empty.
-         """
-         if not self._is_warmed_up:
-             raise RuntimeError(
-                 "The component EmbeddingBasedDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
-             )
-
-         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
-             raise TypeError("EmbeddingBasedDocumentSplitter expects a List of Documents as input.")
-
-         split_docs: list[Document] = []
-         for doc in documents:
-             if doc.content is None:
-                 raise ValueError(
-                     f"EmbeddingBasedDocumentSplitter only works with text documents but content for "
-                     f"document ID {doc.id} is None."
-                 )
-             if doc.content == "":
-                 logger.warning("Document ID {doc_id} has empty content. Skipping this document.", doc_id=doc.id)
-                 continue
-
-             doc_splits = self._split_document(doc=doc)
-             split_docs.extend(doc_splits)
-
-         return {"documents": split_docs}
-
-     def _split_document(self, doc: Document) -> list[Document]:
-         """
-         Split a single document based on embedding similarity.
-         """
-         # Create an initial split of the document content into smaller chunks
-         splits = self._split_text(text=doc.content)  # type: ignore[union-attr, arg-type]
-
-         # Merge splits smaller than min_length
-         merged_splits = self._merge_small_splits(splits=splits)
-
-         # Recursively split splits larger than max_length
-         final_splits = self._split_large_splits(splits=merged_splits)
-
-         # Create Document objects from the final splits
-         return EmbeddingBasedDocumentSplitter._create_documents_from_splits(splits=final_splits, original_doc=doc)
-
-     def _split_text(self, text: str) -> list[str]:
-         """
-         Split a text into smaller chunks based on embedding similarity.
-         """
-
-         # NOTE: `self.sentence_splitter.split_sentences` strips all white space types (e.g. new lines, page breaks,
-         # etc.) at the end of the provided text. To avoid losing them, we need to keep track of them and add them
-         # back to the last sentence.
-         rstripped_text = text.rstrip()
-         trailing_whitespaces = text[len(rstripped_text) :]
-
-         # Split the text into sentences
-         sentences_result = self.sentence_splitter.split_sentences(rstripped_text)  # type: ignore[union-attr]
-
-         # Add back the stripped white spaces to the last sentence
-         if sentences_result and trailing_whitespaces:
-             sentences_result[-1]["sentence"] += trailing_whitespaces
-             sentences_result[-1]["end"] += len(trailing_whitespaces)
-
-         sentences = [sentence["sentence"] for sentence in sentences_result]
-         sentence_groups = self._group_sentences(sentences=sentences)
-         embeddings = self._calculate_embeddings(sentence_groups=sentence_groups)
-         split_points = self._find_split_points(embeddings=embeddings)
-         sub_splits = self._create_splits_from_points(sentence_groups=sentence_groups, split_points=split_points)
-
-         return sub_splits
-
-     def _group_sentences(self, sentences: list[str]) -> list[str]:
-         """
-         Group sentences into groups of sentences_per_group.
-         """
-         if self.sentences_per_group == 1:
-             return sentences
-
-         groups = []
-         for i in range(0, len(sentences), self.sentences_per_group):
-             group = sentences[i : i + self.sentences_per_group]
-             groups.append("".join(group))
-
-         return groups
-
-     def _calculate_embeddings(self, sentence_groups: list[str]) -> list[list[float]]:
-         """
-         Calculate embeddings for each sentence group using the DocumentEmbedder.
-         """
-         # Create Document objects for each group
-         group_docs = [Document(content=group) for group in sentence_groups]
-         result = self.document_embedder.run(group_docs)
-         embedded_docs = result["documents"]
-         embeddings = [doc.embedding for doc in embedded_docs]
-         return embeddings
-
-     def _find_split_points(self, embeddings: list[list[float]]) -> list[int]:
-         """
-         Find split points based on cosine distances between sequential embeddings.
-         """
-         if len(embeddings) <= 1:
-             return []
-
-         # Calculate cosine distances between sequential pairs
-         distances = []
-         for i in range(len(embeddings) - 1):
-             distance = EmbeddingBasedDocumentSplitter._cosine_distance(
-                 embedding1=embeddings[i], embedding2=embeddings[i + 1]
-             )
-             distances.append(distance)
-
-         # Calculate threshold based on percentile
-         threshold = np.percentile(distances, self.percentile * 100)
-
-         # Find indices where distance exceeds threshold
-         split_points = []
-         for i, distance in enumerate(distances):
-             if distance > threshold:
-                 split_points.append(i + 1)  # +1 because we want to split after this point
-
-         return split_points
-
-     @staticmethod
-     def _cosine_distance(embedding1: list[float], embedding2: list[float]) -> float:
-         """
-         Calculate cosine distance between two embeddings.
-         """
-         vec1 = np.array(embedding1)
-         vec2 = np.array(embedding2)
-
-         norm1 = float(np.linalg.norm(vec1))
-         norm2 = float(np.linalg.norm(vec2))
-
-         if norm1 == 0 or norm2 == 0:
-             return 1.0
-
-         cosine_sim = float(np.dot(vec1, vec2) / (norm1 * norm2))
-
-         return 1.0 - cosine_sim
-
-     @staticmethod
-     def _create_splits_from_points(sentence_groups: list[str], split_points: list[int]) -> list[str]:
-         """
-         Create splits based on split points.
-         """
-         if not split_points:
-             return ["".join(sentence_groups)]
-
-         splits = []
-         start = 0
-
-         for point in split_points:
-             split_text = "".join(sentence_groups[start:point])
-             if split_text:
-                 splits.append(split_text)
-             start = point
-
-         # Add the last split
-         if start < len(sentence_groups):
-             split_text = "".join(sentence_groups[start:])
-             if split_text:
-                 splits.append(split_text)
-
-         return splits
-
-     def _merge_small_splits(self, splits: list[str]) -> list[str]:
-         """
-         Merge splits that are below min_length.
-         """
-         if not splits:
-             return splits
-
-         merged = []
-         current_split = splits[0]
-
-         for split in splits[1:]:
-             # We merge splits that are smaller than min_length but only if the newly merged split is still below
-             # max_length.
-             if len(current_split) < self.min_length and len(current_split) + len(split) < self.max_length:
-                 # Merge with the next split
-                 current_split += split
-             else:
-                 # The current split is long enough (or merging would exceed max_length); save it and start a new one
-                 merged.append(current_split)
-                 current_split = split
-
-         # Don't forget the last split
-         merged.append(current_split)
-
-         return merged
-
-     def _split_large_splits(self, splits: list[str]) -> list[str]:
-         """
-         Recursively split splits that are above max_length.
-
-         This method checks each split and, if it exceeds max_length, attempts to split it further using the same
-         embedding-based approach. This is done recursively until all splits are within the max_length limit or no
-         further splitting is possible.
-
-         This works because the threshold for splits is calculated dynamically from the embeddings of the text
-         being split.
-         """
-         final_splits = []
-
-         for split in splits:
-             if len(split) <= self.max_length:
-                 final_splits.append(split)
-             else:
-                 # Recursively split large splits
-                 # We can reuse the same _split_text method to split the text into smaller chunks because the
-                 # threshold for splits is calculated dynamically based on embeddings from `split`.
-                 sub_splits = self._split_text(text=split)
-
-                 # Stop splitting if no further split is possible or continue with recursion
-                 if len(sub_splits) == 1:
-                     logger.warning(
-                         f"Could not split a chunk further below max_length={self.max_length}. "
-                         f"Returning chunk of length {len(split)}."
-                     )
-                     final_splits.append(split)
-                 else:
-                     final_splits.extend(self._split_large_splits(splits=sub_splits))
-
-         return final_splits
-
377
- @staticmethod
378
- def _create_documents_from_splits(splits: list[str], original_doc: Document) -> list[Document]:
379
- """
380
- Create Document objects from splits.
381
- """
382
- documents = []
383
- metadata = deepcopy(original_doc.meta)
384
- metadata["source_id"] = original_doc.id
385
-
386
- # Calculate page numbers for each split
387
- current_page = 1
388
-
389
- for i, split_text in enumerate(splits):
390
- split_meta = deepcopy(metadata)
391
- split_meta["split_id"] = i
392
-
393
- # Calculate page number for this split
394
- # Count page breaks in the split itself
395
- page_breaks_in_split = split_text.count("\f")
396
-
397
- # Calculate the page number for this split
398
- split_meta["page_number"] = current_page
399
-
400
- doc = Document(content=split_text, meta=split_meta)
401
- documents.append(doc)
402
-
403
- # Update page counter for next split
404
- current_page += page_breaks_in_split
405
-
406
- return documents
407
-
-     def to_dict(self) -> dict[str, Any]:
-         """
-         Serializes the component to a dictionary.
-         """
-         return default_to_dict(
-             self,
-             document_embedder=component_to_dict(obj=self.document_embedder, name="document_embedder"),
-             sentences_per_group=self.sentences_per_group,
-             percentile=self.percentile,
-             min_length=self.min_length,
-             max_length=self.max_length,
-             language=self.language,
-             use_split_rules=self.use_split_rules,
-             extend_abbreviations=self.extend_abbreviations,
-         )
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "EmbeddingBasedDocumentSplitter":
-         """
-         Deserializes the component from a dictionary.
-         """
-         deserialize_component_inplace(data["init_parameters"], key="document_embedder")
-         return default_from_dict(cls, data)
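
For readers skimming the removed file above, its core logic is the percentile-thresholded cosine-distance test in `_find_split_points` and `_cosine_distance`. The following is a minimal, self-contained sketch of that split-point selection, using made-up 2-dimensional embeddings for illustration; the real component obtained its embeddings from a configurable `DocumentEmbedder`, and this snippet is not part of the package.

```python
import numpy as np


def cosine_distance(v1: np.ndarray, v2: np.ndarray) -> float:
    # Cosine distance = 1 - cosine similarity; zero vectors get the maximum distance of 1.0
    norm1, norm2 = float(np.linalg.norm(v1)), float(np.linalg.norm(v2))
    if norm1 == 0 or norm2 == 0:
        return 1.0
    return 1.0 - float(np.dot(v1, v2)) / (norm1 * norm2)


def find_split_points(embeddings: list[np.ndarray], percentile: float = 0.95) -> list[int]:
    # Distances between each pair of neighboring sentence-group embeddings
    if len(embeddings) <= 1:
        return []
    distances = [cosine_distance(embeddings[i], embeddings[i + 1]) for i in range(len(embeddings) - 1)]
    # Any neighbor distance above the chosen percentile marks a break point;
    # i + 1 converts a distance index into "split before group i + 1"
    threshold = np.percentile(distances, percentile * 100)
    return [i + 1 for i, d in enumerate(distances) if d > threshold]


# Three near-identical groups followed by a sharp topic shift
groups = [np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.95, 0.05]), np.array([0.0, 1.0])]
print(find_split_points(groups))  # [3] -> split before the fourth group
```

Because the threshold is a percentile of the distances within whatever text is being split, the same routine works both for the initial pass and for the recursive re-splitting of over-long chunks in `_split_large_splits`.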