ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

Files changed (46)
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
ragbits/document_search/ingestion/strategies/batched.py
@@ -0,0 +1,261 @@
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from ragbits.core.sources.base import Source
+from ragbits.core.utils.helpers import batched
+from ragbits.core.vector_stores.base import VectorStore
+from ragbits.document_search.documents.document import Document, DocumentMeta
+from ragbits.document_search.documents.element import Element
+from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
+from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
+from ragbits.document_search.ingestion.strategies.base import (
+    IngestDocumentResult,
+    IngestError,
+    IngestExecutionResult,
+    IngestStrategy,
+)
+
+
+@dataclass
+class IngestTaskResult:
+    """
+    Represents the result of a document batch ingest task.
+    """
+
+    document_uri: str
+    elements: list[Element]
+
+
+class BatchedIngestStrategy(IngestStrategy):
+    """
+    Ingest strategy that processes documents in batches.
+    """
+
+    def __init__(
+        self,
+        batch_size: int | None = None,
+        enrich_batch_size: int | None = None,
+        index_batch_size: int | None = None,
+        num_retries: int = 3,
+        backoff_multiplier: int = 1,
+        backoff_max: int = 60,
+    ) -> None:
+        """
+        Initialize the BatchedIngestStrategy instance.
+
+        Args:
+            batch_size: The batch size for parsing documents.
+                Describes the maximum number of documents to parse at once. If None, all documents are parsed at once.
+            enrich_batch_size: The batch size for enriching elements.
+                Describes the maximum number of document elements to enrich at once.
+                If None, all elements are enriched at once.
+            index_batch_size: The batch size for indexing elements.
+                Describes the maximum number of document elements to index at once.
+                If None, all elements are indexed at once.
+            num_retries: The number of retries per document ingest task error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
+        """
+        super().__init__(num_retries=num_retries, backoff_multiplier=backoff_multiplier, backoff_max=backoff_max)
+        self.batch_size = batch_size
+        self.enrich_batch_size = enrich_batch_size
+        self.index_batch_size = index_batch_size
+
+    async def __call__(
+        self,
+        documents: Iterable[DocumentMeta | Document | Source],
+        vector_store: VectorStore,
+        parser_router: DocumentParserRouter,
+        enricher_router: ElementEnricherRouter,
+    ) -> IngestExecutionResult:
+        """
+        Ingest documents sequentially in batches.
+
+        Args:
+            documents: The documents to ingest.
+            vector_store: The vector store to store document chunks.
+            parser_router: The document parser router to use.
+            enricher_router: The intermediate element enricher router to use.
+
+        Returns:
+            The ingest execution result.
+        """
+        results = IngestExecutionResult()
+
+        for documents_batch in batched(documents, self.batch_size):
+            # Parse documents
+            parse_results = await self._parse_batch(documents_batch, parser_router)
+
+            # Split documents into successful and failed
+            successfully_parsed = [result for result in parse_results if isinstance(result, IngestTaskResult)]
+            failed_parsed = [result for result in parse_results if isinstance(result, IngestDocumentResult)]
+
+            # Further split successful documents into to-enrich and ready
+            to_enrich = [
+                result
+                for result in successfully_parsed
+                if any(type(element) in enricher_router for element in result.elements)
+            ]
+            ready_parsed = [
+                result
+                for result in successfully_parsed
+                if not any(type(element) in enricher_router for element in result.elements)
+            ]
+
+            # Enrich documents
+            enrich_results = await self._enrich_batch(to_enrich, enricher_router)
+
+            # Split enriched documents into successful and failed
+            successfully_enriched = [result for result in enrich_results if isinstance(result, IngestTaskResult)]
+            failed_enriched = [result for result in enrich_results if isinstance(result, IngestDocumentResult)]
+
+            # Combine ready documents with successfully enriched documents for indexing
+            to_index = ready_parsed + successfully_enriched
+
+            # Index the combined documents
+            index_results = await self._index_batch(to_index, vector_store)
+
+            # Split indexed documents into successful and failed
+            successfully_indexed = [result for result in index_results if not result.error]
+            failed_indexed = [result for result in index_results if result.error]
+
+            # Combine all failed documents
+            all_failed = failed_parsed + failed_enriched + failed_indexed
+
+            # Update the final result
+            results.successful.extend(successfully_indexed)
+            results.failed.extend(all_failed)
+
+        return results
+
+    async def _parse_batch(
+        self,
+        batch: list[DocumentMeta | Document | Source],
+        parser_router: DocumentParserRouter,
+    ) -> list[IngestTaskResult | IngestDocumentResult]:
+        """
+        Parse a batch of documents.
+
+        Args:
+            batch: The documents to parse.
+            parser_router: The document parser router to use.
+
+        Returns:
+            The task results.
+        """
+        uris = [document.metadata.id if isinstance(document, Document) else document.id for document in batch]
+        responses = await asyncio.gather(
+            *[
+                self._call_with_error_handling(
+                    self._parse_document,
+                    document=document,
+                    parser_router=parser_router,
+                )
+                for document in batch
+            ],
+            return_exceptions=True,
+        )
+
+        results: list[IngestTaskResult | IngestDocumentResult] = []
+        for uri, response in zip(uris, responses, strict=True):
+            if isinstance(response, BaseException):
+                # Handle only standard exceptions, not BaseExceptions like SystemExit, KeyboardInterrupt, etc.
+                if isinstance(response, Exception):
+                    results.append(
+                        IngestDocumentResult(
+                            document_uri=uri,
+                            error=IngestError.from_exception(response),
+                        )
+                    )
+                else:
+                    raise response
+            else:
+                results.append(
+                    IngestTaskResult(
+                        document_uri=uri,
+                        elements=response,
+                    )
+                )
+
+        return results
+
+    async def _enrich_batch(
+        self,
+        batch: list[IngestTaskResult],
+        enricher_router: ElementEnricherRouter,
+    ) -> list[IngestTaskResult | IngestDocumentResult]:
+        """
+        Enrich a batch of documents.
+
+        Args:
+            batch: The documents to enrich.
+            enricher_router: The intermediate element enricher router to use.
+
+        Returns:
+            The task results.
+        """
+
+        async def _enrich_document(result: IngestTaskResult) -> IngestTaskResult | IngestDocumentResult:
+            try:
+                enriched_elements = [
+                    element
+                    for elements_batch in batched(result.elements, self.enrich_batch_size)
+                    for element in await self._call_with_error_handling(
+                        self._enrich_elements,
+                        elements=elements_batch,
+                        enricher_router=enricher_router,
+                    )
+                ]
+                return IngestTaskResult(
+                    document_uri=result.document_uri,
+                    elements=enriched_elements,
+                )
+            except Exception as exc:
+                return IngestDocumentResult(
+                    document_uri=result.document_uri,
+                    error=IngestError.from_exception(exc),
+                )
+
+        return await asyncio.gather(*[_enrich_document(result) for result in batch])
+
+    async def _index_batch(
+        self,
+        batch: list[IngestTaskResult],
+        vector_store: VectorStore,
+    ) -> list[IngestDocumentResult]:
+        """
+        Index a batch of documents.
+
+        Args:
+            batch: The documents to index.
+            vector_store: The vector store to store document chunks.
+
+        Returns:
+            The task results.
+        """
+
+        async def _index_document(result: IngestTaskResult) -> IngestDocumentResult:
+            try:
+                await self._call_with_error_handling(
+                    self._remove_elements,
+                    document_ids=[result.document_uri],
+                    vector_store=vector_store,
+                )
+                for elements_batch in batched(result.elements, self.index_batch_size):
+                    await self._call_with_error_handling(
+                        self._insert_elements,
+                        elements=elements_batch,
+                        vector_store=vector_store,
+                    )
+                return IngestDocumentResult(
+                    document_uri=result.document_uri,
+                    num_elements=len(result.elements),
+                )
+            except Exception as exc:
+                return IngestDocumentResult(
+                    document_uri=result.document_uri,
+                    error=IngestError.from_exception(exc),
+                )
+
+        return await asyncio.gather(*[_index_document(result) for result in batch])
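For orientation, here is a minimal usage sketch of the batched strategy, following the `__call__` signature above. The vector store construction is elided because it depends on the chosen backend; `DocumentMeta.from_local_path` and the no-argument router constructors are assumptions about the companion modules, not confirmed by this diff.

    import asyncio

    from ragbits.document_search.documents.document import DocumentMeta
    from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
    from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
    from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy

    async def main() -> None:
        strategy = BatchedIngestStrategy(batch_size=10, num_retries=3)
        vector_store = ...  # any configured VectorStore instance (backend-specific, elided here)
        result = await strategy(
            documents=[DocumentMeta.from_local_path("docs/report.pdf")],  # assumed helper
            parser_router=DocumentParserRouter(),      # assumed to have usable defaults
            enricher_router=ElementEnricherRouter(),   # assumed to have usable defaults
            vector_store=vector_store,
        )
        # Each entry is an IngestDocumentResult; failures carry an IngestError.
        print(f"ingested: {len(result.successful)}, failed: {len(result.failed)}")

    asyncio.run(main())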
ragbits/document_search/ingestion/strategies/ray.py
@@ -0,0 +1,138 @@
+import asyncio
+from collections.abc import Iterable
+
+from ragbits.core.sources.base import Source
+from ragbits.core.utils.decorators import requires_dependencies
+from ragbits.core.vector_stores.base import VectorStore
+from ragbits.document_search.documents.document import Document, DocumentMeta
+from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
+from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
+from ragbits.document_search.ingestion.strategies.base import (
+    IngestDocumentResult,
+    IngestExecutionResult,
+)
+from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy, IngestTaskResult
+
+
+class RayDistributedIngestStrategy(BatchedIngestStrategy):
+    """
+    Ingest strategy that processes documents on a cluster, using Ray.
+    """
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        enrich_batch_size: int | None = None,
+        index_batch_size: int | None = None,
+        parse_memory: float | None = None,
+        processing_memory: float | None = None,
+        num_retries: int = 3,
+        backoff_multiplier: int = 1,
+        backoff_max: int = 60,
+    ) -> None:
+        """
+        Initialize the RayDistributedIngestStrategy instance.
+
+        Args:
+            batch_size: The batch size for parsing documents.
+            enrich_batch_size: The batch size for enriching elements.
+                Describes the maximum number of document elements to enrich at once.
+                If None, all elements are enriched at once.
+            index_batch_size: The batch size for indexing elements.
+                Describes the maximum number of document elements to index at once.
+                If None, all elements are indexed at once.
+            parse_memory: The heap memory in bytes to reserve for each parallel parsing task.
+            processing_memory: The heap memory in bytes to reserve for each parallel element-processing task.
+            num_retries: The number of retries per document ingest task error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
+        """
+        super().__init__(
+            batch_size=batch_size,
+            enrich_batch_size=enrich_batch_size,
+            index_batch_size=index_batch_size,
+            num_retries=num_retries,
+            backoff_multiplier=backoff_multiplier,
+            backoff_max=backoff_max,
+        )
+        self.parse_memory = parse_memory
+        self.processing_memory = processing_memory
+
+    @requires_dependencies(["ray.data"], "ray")
+    async def __call__(
+        self,
+        documents: Iterable[DocumentMeta | Document | Source],
+        vector_store: VectorStore,
+        parser_router: DocumentParserRouter,
+        enricher_router: ElementEnricherRouter,
+    ) -> IngestExecutionResult:
+        """
+        Ingest documents in parallel in batches.
+
+        Args:
+            documents: The documents to ingest.
+            vector_store: The vector store to store document chunks.
+            parser_router: The document parser router to use.
+            enricher_router: The intermediate element enricher router to use.
+
+        Returns:
+            The ingest execution result.
+        """
+        import ray
+
+        # Parse documents
+        parse_results = ray.data.from_items(list(documents)).map_batches(
+            fn=lambda batch: {"results": asyncio.run(self._parse_batch(batch["item"], parser_router))},
+            batch_size=self.batch_size,
+            num_cpus=1,
+            memory=self.parse_memory,
+            zero_copy_batch=True,
+        )
+
+        # Split documents into successful and failed
+        successfully_parsed = parse_results.filter(lambda data: isinstance(data["results"], IngestTaskResult))
+        failed_parsed = parse_results.filter(lambda data: isinstance(data["results"], IngestDocumentResult))
+
+        # Further split valid documents into to-enrich and ready
+        to_enrich = successfully_parsed.filter(
+            lambda data: any(type(element) in enricher_router for element in data["results"].elements)
+        )
+        ready_parsed = successfully_parsed.filter(
+            lambda data: not any(type(element) in enricher_router for element in data["results"].elements)
+        )
+
+        # Enrich documents
+        enrich_results = to_enrich.map_batches(
+            fn=lambda batch: {"results": asyncio.run(self._enrich_batch(batch["results"], enricher_router))},
+            batch_size=self.batch_size,
+            num_cpus=0,
+            memory=self.processing_memory,
+        )
+
+        # Split enriched documents into successful and failed
+        successfully_enriched = enrich_results.filter(lambda data: isinstance(data["results"], IngestTaskResult))
+        failed_enriched = enrich_results.filter(lambda data: isinstance(data["results"], IngestDocumentResult))
+
+        # Combine ready documents with successfully enriched documents for indexing
+        to_index = ready_parsed.union(successfully_enriched)
+
+        # Index the combined documents
+        index_results = to_index.map_batches(
+            fn=lambda batch: {"results": asyncio.run(self._index_batch(batch["results"], vector_store))},
+            batch_size=self.batch_size,
+            num_cpus=0,
+            memory=self.processing_memory,
+        )
+
+        # Split indexed documents into successful and failed
+        successfully_indexed = index_results.filter(lambda data: not data["results"].error)
+        failed_indexed = index_results.filter(lambda data: data["results"].error)
+
+        # Combine all failed documents
+        all_failed = failed_parsed.union(failed_enriched, failed_indexed)
+
+        # Return the final result
+        return IngestExecutionResult(
+            successful=[data["results"] for data in successfully_indexed.take_all()],
+            failed=[data["results"] for data in all_failed.take_all()],
+        )
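The distributed variant is call-compatible with the batched one; only construction differs, since parsing, enriching, and indexing are pushed into Ray Dataset `map_batches` tasks. A hedged construction sketch (the `ray` extra must be installed; the memory figures below are placeholder assumptions, not recommendations):

    from ragbits.document_search.ingestion.strategies.ray import RayDistributedIngestStrategy

    strategy = RayDistributedIngestStrategy(
        batch_size=4,                   # documents handled per Ray task
        parse_memory=2 * 1024**3,       # reserve ~2 GiB of heap per parsing task (assumed figure)
        processing_memory=1024**3,      # reserve ~1 GiB per enrich/index task (assumed figure)
    )
    # Invocation is identical to BatchedIngestStrategy:
    # await strategy(documents=..., vector_store=..., parser_router=..., enricher_router=...)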
ragbits/document_search/ingestion/strategies/sequential.py
@@ -0,0 +1,23 @@
+from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy
+
+
+class SequentialIngestStrategy(BatchedIngestStrategy):
+    """
+    Ingest strategy that processes documents in sequence, one at a time.
+    """
+
+    def __init__(self, num_retries: int = 3, backoff_multiplier: int = 1, backoff_max: int = 60) -> None:
+        """
+        Initialize the SequentialIngestStrategy instance.
+
+        Args:
+            num_retries: The number of retries per document ingest task error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
+        """
+        super().__init__(
+            batch_size=1,
+            num_retries=num_retries,
+            backoff_multiplier=backoff_multiplier,
+            backoff_max=backoff_max,
+        )
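Since the sequential strategy is just the batched strategy pinned to batch_size=1, switching between the three strategies is a one-line change. A sketch of the trade-off, assuming the three classes are imported from the modules shown above:

    # One document at a time: simplest behavior, easiest failure isolation.
    strategy = SequentialIngestStrategy(num_retries=5)
    # Concurrent batches within a single process.
    strategy = BatchedIngestStrategy(batch_size=16)
    # Batches distributed across a Ray cluster.
    strategy = RayDistributedIngestStrategy(batch_size=4)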
ragbits/document_search/py.typed — file without changes
ragbits/document_search/retrieval/__init__.py — file without changes
ragbits/document_search/retrieval/rephrasers/__init__.py
@@ -0,0 +1,18 @@
+from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions
+from ragbits.document_search.retrieval.rephrasers.llm import (
+    LLMQueryRephraser,
+    LLMQueryRephraserOptions,
+    LLMQueryRephraserPrompt,
+    LLMQueryRephraserPromptInput,
+)
+from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
+
+__all__ = [
+    "LLMQueryRephraser",
+    "LLMQueryRephraserOptions",
+    "LLMQueryRephraserPrompt",
+    "LLMQueryRephraserPromptInput",
+    "NoopQueryRephraser",
+    "QueryRephraser",
+    "QueryRephraserOptions",
+]
ragbits/document_search/retrieval/rephrasers/base.py
@@ -0,0 +1,39 @@
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from typing import ClassVar, TypeVar
+
+from ragbits.core.options import Options
+from ragbits.core.utils.config_handling import ConfigurableComponent
+from ragbits.document_search.retrieval import rephrasers
+
+
+class QueryRephraserOptions(Options):
+    """
+    Object representing the options for the rephraser.
+    """
+
+
+QueryRephraserOptionsT = TypeVar("QueryRephraserOptionsT", bound=QueryRephraserOptions)
+
+
+class QueryRephraser(ConfigurableComponent[QueryRephraserOptionsT], ABC):
+    """
+    Rephrases a query. Can provide multiple rephrased queries from one sentence / question.
+    """
+
+    options_cls: type[QueryRephraserOptionsT]
+    default_module: ClassVar = rephrasers
+    configuration_key: ClassVar = "rephraser"
+
+    @abstractmethod
+    async def rephrase(self, query: str, options: QueryRephraserOptionsT | None = None) -> Iterable[str]:
+        """
+        Rephrase a query.
+
+        Args:
+            query: The query to rephrase.
+            options: The options for the rephraser.
+
+        Returns:
+            The rephrased queries.
+        """
ragbits/document_search/retrieval/rephrasers/llm.py
@@ -0,0 +1,141 @@
+from collections.abc import Iterable
+from typing import Generic
+
+from pydantic import BaseModel
+from typing_extensions import Self
+
+from ragbits.core.audit.traces import traceable
+from ragbits.core.llms.base import LLM, LLMClientOptionsT
+from ragbits.core.prompt import Prompt
+from ragbits.core.types import NOT_GIVEN, NotGiven
+from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
+from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions
+
+
+class LLMQueryRephraserPromptInput(BaseModel):
+    """
+    Input data for the query rephraser prompt.
+    """
+
+    query: str
+    n: int | None = None
+
+
+class LLMQueryRephraserPrompt(Prompt[LLMQueryRephraserPromptInput, list]):
+    """
+    Prompt for generating a rephrased user query.
+    """
+
+    system_prompt = """
+        You are an expert in query rephrasing and clarity improvement.
+        {%- if n and n > 1 %}
+        Your task is to generate {{ n }} different versions of the given user query to retrieve relevant documents
+        from a vector database. They can be phrased as statements, as they will be used as a search query.
+        By generating multiple perspectives on the user query, your goal is to help the user overcome some of the
+        limitations of the distance-based similarity search.
+        Alternative queries should only contain information present in the original query. Do not include anything
+        in the alternative queries that you have not seen in the original version.
+        It is VERY important that you DO NOT ADD any comments or notes. Return ONLY the alternative queries.
+        Provide these alternative queries separated by newlines. DO NOT ADD any enumeration.
+        {%- else %}
+        Your task is to return a single paraphrased version of a user's query,
+        correcting any typos, handling abbreviations and improving clarity.
+        Focus on making the query more precise and readable while keeping its original intent.
+        Just return the rephrased query. No additional explanations are needed.
+        {%- endif %}
+    """
+    user_prompt = "Query: {{ query }}"
+
+    @staticmethod
+    def _response_parser(value: str) -> list[str]:
+        return [stripped_line for line in value.strip().split("\n") if (stripped_line := line.strip())]
+
+    response_parser = _response_parser
+
+
+class LLMQueryRephraserOptions(QueryRephraserOptions, Generic[LLMClientOptionsT]):
+    """
+    Object representing the options for the LLM query rephraser.
+
+    Attributes:
+        n: The number of rephrasings to generate. Any number below 2 will generate only one rephrasing.
+        llm_options: The options for the LLM.
+    """
+
+    n: int | None | NotGiven = NOT_GIVEN
+    llm_options: LLMClientOptionsT | None | NotGiven = NOT_GIVEN
+
+
+class LLMQueryRephraser(QueryRephraser[LLMQueryRephraserOptions[LLMClientOptionsT]]):
+    """
+    A rephraser class that uses an LLM to rephrase queries.
+    """
+
+    options_cls: type[LLMQueryRephraserOptions] = LLMQueryRephraserOptions
+
+    def __init__(
+        self,
+        llm: LLM[LLMClientOptionsT],
+        prompt: type[Prompt[LLMQueryRephraserPromptInput, list[str]]] | None = None,
+        default_options: LLMQueryRephraserOptions[LLMClientOptionsT] | None = None,
+    ) -> None:
+        """
+        Initialize the LLMQueryRephraser with an LLM.
+
+        Args:
+            llm: An LLM instance to handle query rephrasing.
+            prompt: The prompt to use for rephrasing queries.
+            default_options: The default options for the rephraser.
+        """
+        super().__init__(default_options=default_options)
+        self._llm = llm
+        self._prompt = prompt or LLMQueryRephraserPrompt
+
+    @traceable
+    async def rephrase(
+        self,
+        query: str,
+        options: LLMQueryRephraserOptions[LLMClientOptionsT] | None = None,
+    ) -> Iterable[str]:
+        """
+        Rephrase a given query using the LLM.
+
+        Args:
+            query: The query to be rephrased.
+            options: The options for the rephraser.
+
+        Returns:
+            A list containing the rephrased queries.
+
+        Raises:
+            LLMConnectionError: If there is a connection error with the LLM API.
+            LLMStatusError: If the LLM API returns an error status code.
+            LLMResponseError: If the LLM API response is invalid.
+        """
+        merged_options = (self.default_options | options) if options else self.default_options
+        llm_options = merged_options.llm_options or None
+        prompt = self._prompt(LLMQueryRephraserPromptInput(query=query, n=merged_options.n or None))
+        return await self._llm.generate(prompt, options=llm_options)
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `LLMQueryRephraser` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the rephraser.
+
+        Returns:
+            An instance of the rephraser class initialized with the provided configuration.
+
+        Raises:
+            ValidationError: If the LLM or prompt configuration doesn't follow the expected format.
+            InvalidConfigError: If an LLM or prompt class can't be found or is not the correct type.
+        """
+        config["llm"] = LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
+        config["prompt"] = (
+            import_by_path(ObjectConstructionConfig.model_validate(config["prompt"]).type)
+            if "prompt" in config
+            else None
+        )
+        return super().from_config(config)
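A hedged usage sketch of the LLM rephraser (the LiteLLM import path and the model identifier are assumptions about the companion ragbits-core package, not confirmed by this diff):

    import asyncio

    from ragbits.core.llms.litellm import LiteLLM  # assumed import path
    from ragbits.document_search.retrieval.rephrasers.llm import LLMQueryRephraser, LLMQueryRephraserOptions

    async def main() -> None:
        rephraser = LLMQueryRephraser(
            llm=LiteLLM(model_name="gpt-4o-mini"),  # assumed model identifier
            default_options=LLMQueryRephraserOptions(n=3),  # ask for three alternative phrasings
        )
        # With n >= 2 the prompt takes its multi-query branch, and the response
        # parser splits the newline-separated completion into a list of queries.
        for variant in await rephraser.rephrase("whats the retrun policy for onlin orders?"):
            print(variant)

    asyncio.run(main())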
ragbits/document_search/retrieval/rephrasers/noop.py
@@ -0,0 +1,26 @@
+from collections.abc import Iterable
+
+from ragbits.core.audit.traces import traceable
+from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions
+
+
+class NoopQueryRephraser(QueryRephraser[QueryRephraserOptions]):
+    """
+    A no-op query rephraser that does not change the query.
+    """
+
+    options_cls: type[QueryRephraserOptions] = QueryRephraserOptions
+
+    @traceable
+    async def rephrase(self, query: str, options: QueryRephraserOptions | None = None) -> Iterable[str]:  # noqa: PLR6301
+        """
+        No-op implementation which returns the input query unchanged.
+
+        Args:
+            query: The query to rephrase.
+            options: The options for the rephraser.
+
+        Returns:
+            A list containing the original, non-transformed query.
+        """
+        return [query]
ragbits/document_search/retrieval/rerankers/__init__.py
@@ -0,0 +1,4 @@
+from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
+from ragbits.document_search.retrieval.rerankers.noop import NoopReranker
+
+__all__ = ["NoopReranker", "Reranker", "RerankerOptions"]
+ __all__ = ["NoopReranker", "Reranker", "RerankerOptions"]