ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/document_search/__init__.py +3 -0
- ragbits/document_search/_main.py +273 -0
- ragbits/document_search/cli.py +109 -0
- ragbits/document_search/documents/__init__.py +0 -0
- ragbits/document_search/documents/document.py +203 -0
- ragbits/document_search/documents/element.py +208 -0
- ragbits/document_search/ingestion/__init__.py +0 -0
- ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
- ragbits/document_search/ingestion/enrichers/base.py +64 -0
- ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
- ragbits/document_search/ingestion/enrichers/image.py +107 -0
- ragbits/document_search/ingestion/enrichers/router.py +86 -0
- ragbits/document_search/ingestion/parsers/__init__.py +9 -0
- ragbits/document_search/ingestion/parsers/base.py +97 -0
- ragbits/document_search/ingestion/parsers/docling.py +178 -0
- ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
- ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
- ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
- ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
- ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
- ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
- ragbits/document_search/ingestion/parsers/router.py +90 -0
- ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
- ragbits/document_search/ingestion/strategies/__init__.py +6 -0
- ragbits/document_search/ingestion/strategies/base.py +290 -0
- ragbits/document_search/ingestion/strategies/batched.py +261 -0
- ragbits/document_search/ingestion/strategies/ray.py +138 -0
- ragbits/document_search/ingestion/strategies/sequential.py +23 -0
- ragbits/document_search/py.typed +0 -0
- ragbits/document_search/retrieval/__init__.py +0 -0
- ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
- ragbits/document_search/retrieval/rephrasers/base.py +39 -0
- ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
- ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
- ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
- ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
- ragbits/document_search/retrieval/rerankers/base.py +56 -0
- ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
- ragbits/document_search/retrieval/rerankers/llm.py +177 -0
- ragbits/document_search/retrieval/rerankers/noop.py +34 -0
- ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from itertools import chain
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from rerankers import Reranker as AnswerReranker
|
|
6
|
+
|
|
7
|
+
from ragbits.core.audit.traces import trace
|
|
8
|
+
from ragbits.document_search.documents.element import Element
|
|
9
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AnswerAIReranker(Reranker[RerankerOptions]):
    """
    Re-ranker backed by the [rerankers](https://github.com/AnswerDotAI/rerankers)
    library, which wraps most popular re-ranking methods behind one interface.
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    def __init__(
        self,
        model: str,
        default_options: RerankerOptions | None = None,
        **rerankers_kwargs: Any,  # noqa: ANN401
    ) -> None:
        """
        Initialize the AnswerAIReranker instance.

        Args:
            model: The reranker model to use.
            default_options: The default options for reranking.
            **rerankers_kwargs: Additional keyword arguments passed straight to the rerankers lib.
        """
        super().__init__(default_options=default_options)
        self.model = model
        self.ranker = AnswerReranker(self.model, **rerankers_kwargs)

    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.

        Raises:
            ValueError: Raised if the input query is empty or if the list of candidate documents is empty.
            TypeError: Raised if the input types are incorrect, such as if the query is not a string, or List[str].
            IndexError: Raised if docs is an empty List.
        """
        opts = (self.default_options | options) if options else self.default_options
        candidates = list(chain.from_iterable(elements))
        # Elements without a text representation are ranked as empty strings.
        texts = [candidate.text_representation or "" for candidate in candidates]

        with trace(
            query=query, documents=texts, elements=elements, model=self.model, options=opts
        ) as outputs:
            # NOTE: rerankers' rank() is a synchronous call.
            ranking = self.ranker.rank(
                query=query,
                docs=texts,
            )
            if opts.top_n:
                ranking = ranking.top_k(opts.top_n)

            kept = []
            for ranked in ranking:
                if opts.score_threshold and ranked.score < opts.score_threshold:
                    continue
                # doc_id indexes back into the flattened candidate list.
                matched = candidates[ranked.document.doc_id]
                if opts.override_score:
                    matched.score = ranked.score
                kept.append(matched)

            outputs.results = kept

        return outputs.results
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from typing import ClassVar, TypeVar
|
|
4
|
+
|
|
5
|
+
from ragbits.core.options import Options
|
|
6
|
+
from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
7
|
+
from ragbits.core.utils.config_handling import ConfigurableComponent
|
|
8
|
+
from ragbits.document_search.documents.element import Element
|
|
9
|
+
from ragbits.document_search.retrieval import rerankers
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RerankerOptions(Options):
    """
    Object representing the options for the reranker.

    Attributes:
        top_n: The number of entries to return.
        score_threshold: The minimum relevance score for an entry to be returned.
        override_score: If True reranking will override element score.
    """

    # How many of the highest-ranked entries to return; NOT_GIVEN leaves the limit unset.
    top_n: int | None | NotGiven = NOT_GIVEN
    # Entries scoring below this are dropped; NOT_GIVEN disables the filter.
    score_threshold: float | None | NotGiven = NOT_GIVEN
    # When True, the reranker writes its relevance score onto each returned element.
    override_score: bool = True
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Type variable bound to RerankerOptions so each Reranker subclass can declare
# its own, more specific options class while sharing the generic base.
RerankerOptionsT = TypeVar("RerankerOptionsT", bound=RerankerOptions)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Reranker(ConfigurableComponent[RerankerOptionsT], ABC):
    """
    Reranks elements retrieved from vector store.

    Subclasses implement `rerank` and set `options_cls` to their concrete
    options class.
    """

    # Concrete options class used by the subclass; must be assigned by subclasses.
    options_cls: type[RerankerOptionsT]
    # Module searched when resolving a reranker by name from configuration.
    default_module: ClassVar = rerankers
    # Key under which reranker settings live in the project configuration.
    configuration_key: ClassVar = "reranker"

    @abstractmethod
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptionsT | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements.

        Args:
            elements: The elements to rerank, one inner sequence per ranked result set.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from itertools import chain
|
|
3
|
+
|
|
4
|
+
import litellm
|
|
5
|
+
|
|
6
|
+
from ragbits.core.audit.traces import traceable
|
|
7
|
+
from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
8
|
+
from ragbits.document_search.documents.element import Element
|
|
9
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LiteLLMRerankerOptions(RerankerOptions):
    """
    Object representing the options for the litellm reranker.

    Attributes:
        top_n: The number of entries to return.
        score_threshold: The minimum relevance score for an entry to be returned.
        override_score: If True reranking will override element score.
        max_chunks_per_doc: The maximum number of chunks a document may be split
            into before truncation (passed through to the provider's rerank API).
    """

    # Forwarded verbatim to litellm.arerank; NOT_GIVEN leaves it unset.
    max_chunks_per_doc: int | None | NotGiven = NOT_GIVEN
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LiteLLMReranker(Reranker[LiteLLMRerankerOptions]):
    """
    A [LiteLLM](https://docs.litellm.ai/docs/rerank) reranker for providers such as Cohere, Together AI, Azure AI.
    """

    options_cls: type[LiteLLMRerankerOptions] = LiteLLMRerankerOptions

    def __init__(
        self,
        model: str,
        default_options: LiteLLMRerankerOptions | None = None,
    ) -> None:
        """
        Initialize the LiteLLMReranker instance.

        Args:
            model: The reranker model to use.
            default_options: The default options for reranking.
        """
        super().__init__(default_options=default_options)
        self.model = model

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: LiteLLMRerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements with LiteLLM API.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        opts = (self.default_options | options) if options else self.default_options
        candidates = list(chain.from_iterable(elements))
        # Elements without a text representation are submitted as empty strings.
        texts = [candidate.text_representation or "" for candidate in candidates]

        # `or None` collapses NOT_GIVEN (falsy) into an unset API parameter.
        response = await litellm.arerank(
            model=self.model,
            query=query,
            documents=texts,
            top_n=opts.top_n or None,
            max_chunks_per_doc=opts.max_chunks_per_doc or None,
        )

        reranked = []
        for entry in response.results:
            relevance = entry["relevance_score"]
            if opts.score_threshold and relevance < opts.score_threshold:
                continue
            # "index" points back into the flattened candidate list.
            matched = candidates[entry["index"]]
            if opts.override_score:
                matched.score = relevance
            reranked.append(matched)

        return reranked
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from itertools import chain
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from typing_extensions import Self
|
|
7
|
+
|
|
8
|
+
from ragbits.core.audit.traces import traceable
|
|
9
|
+
from ragbits.core.llms.base import LLM
|
|
10
|
+
from ragbits.core.llms.litellm import LiteLLM, LiteLLMOptions
|
|
11
|
+
from ragbits.core.prompt.prompt import Prompt
|
|
12
|
+
from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
13
|
+
from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
|
|
14
|
+
from ragbits.document_search.documents.element import Element
|
|
15
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RerankerInput(BaseModel):
    """
    Input data for the document reranker.
    """

    # The user query the document is judged against.
    query: str
    # Text of a single candidate document.
    document: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RerankerPrompt(Prompt[RerankerInput, str]):
    """
    Prompt for reranking documents.

    Asks the model to emit a single "Yes"/"No" token indicating whether the
    document is relevant to the query.
    """

    system_prompt = """
    You are an Assistant responsible for helping detect whether the retrieved document is relevant to the query.
    For a given input, you need to output a single token: "Yes" or "No" indicating the retrieved document is relevant to the query.
    """  # noqa: E501
    user_prompt = """
    Query: {{query}}
    Document: {{document}}
    Relevant:
    """
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class LLMRerankerOptions(RerankerOptions):
    """
    Object representing the options for the llm reranker.

    Attributes:
        top_n: The number of entries to return.
        score_threshold: The minimum relevance score for an entry to be returned.
        override_score: If True reranking will override element score.
        llm_options: The options for the LLM.
    """

    # Extra LLM options merged on top of the reranker's built-in generation settings.
    llm_options: LiteLLMOptions | None | NotGiven = NOT_GIVEN
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class LLMReranker(Reranker[LLMRerankerOptions]):
    """
    Reranker based on LLM.

    Scores each element by asking the model a yes/no relevance question and
    reading the probability of its answer from the returned logprobs.
    """

    options_cls: type[LLMRerankerOptions] = LLMRerankerOptions

    def __init__(
        self,
        llm: LiteLLM,
        *,
        prompt: type[Prompt[RerankerInput, str]] | None = None,
        default_options: LLMRerankerOptions | None = None,
    ) -> None:
        """
        Initialize the LLMReranker instance.

        Args:
            llm: The LLM instance to handle reranking.
            prompt: The prompt to use for reranking elements.
            default_options: The default options for reranking.
        """
        super().__init__(default_options=default_options)
        self._llm = llm
        self._prompt = prompt or RerankerPrompt
        # Constrain generation to exactly one token and bias it toward the
        # " Yes"/" No" tokens; logprobs let us recover the answer probability.
        self._llm_options = LiteLLMOptions(
            temperature=0.0,
            logprobs=True,
            max_tokens=1,
            logit_bias={
                self._llm.get_token_id(" Yes"): 1,
                self._llm.get_token_id(" No"): 1,
            },
        )

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """
        Initialize the class with the provided configuration.

        Args:
            config: A dictionary containing configuration details for the class.

        Returns:
            The initialized instance of LLMReranker.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If llm or prompt can't be found or are not the correct type.
        """
        config["llm"] = LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
        config["prompt"] = import_by_path(config["prompt"]) if "prompt" in config else None
        return super().from_config(config)

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: LLMRerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements with LLM.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        opts = (self.default_options | options) if options else self.default_options
        generation_options = (
            self._llm_options | opts.llm_options if opts.llm_options else self._llm_options
        )

        candidates = list(chain.from_iterable(elements))
        relevance = await self._score_elements(candidates, query, generation_options)

        ranked = sorted(
            zip(candidates, relevance, strict=True),
            key=lambda pair: pair[1],
            reverse=True,
        )

        reranked = []
        for candidate, candidate_score in ranked[: opts.top_n or None]:
            if opts.score_threshold and candidate_score < opts.score_threshold:
                continue
            if opts.override_score:
                candidate.score = candidate_score
            reranked.append(candidate)
        return reranked

    async def _score_elements(
        self,
        elements: Sequence[Element],
        query: str,
        llm_options: LiteLLMOptions,
    ) -> Sequence[float]:
        """
        Score the elements according to their relevance to the query using LLM.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            llm_options: The LLM options to use for scoring.

        Returns:
            The elements scores.
        """
        relevance_scores: list[float] = []
        for element in elements:
            text = element.text_representation
            if not text:
                # No text to judge — treat as completely irrelevant.
                relevance_scores.append(0.0)
                continue
            prompt = self._prompt(RerankerInput(query=query, document=text))
            response = await self._llm.generate_with_metadata(prompt=prompt, options=llm_options)
            # Probability of the single generated token.
            probability = math.exp(response.metadata["logprobs"][0]["logprob"])
            # NOTE(review): compares against "Yes" while the logit bias targets
            # " Yes" (leading space) — presumably the content is stripped; confirm.
            relevance_scores.append(probability if response.content == "Yes" else 1 - probability)

        return relevance_scores
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from itertools import chain
|
|
3
|
+
|
|
4
|
+
from ragbits.core.audit.traces import traceable
|
|
5
|
+
from ragbits.document_search.documents.element import Element
|
|
6
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NoopReranker(Reranker[RerankerOptions]):
    """
    A no-op reranker that does not change the order of the elements.

    Duplicate elements (same id) across the input result sets are collapsed,
    keeping the first-seen position.
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    @traceable
    async def rerank(  # noqa: PLR6301
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        No reranking, returning the elements in the same order.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        # Dict insertion order preserves first occurrence; later duplicates
        # overwrite the stored element without moving its position.
        deduplicated = {}
        for element in chain.from_iterable(elements):
            deduplicated[element.id] = element
        return list(deduplicated.values())
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
|
|
4
|
+
from ragbits.core.audit.traces import traceable
|
|
5
|
+
from ragbits.document_search.documents.element import Element
|
|
6
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ReciprocalRankFusionReranker(Reranker[RerankerOptions]):
    """
    A reranker that implements the Reciprocal Rank Fusion (RRF) algorithm to
    combine multiple ranked result sets into a single reranked list.

    RRF is a method that assigns scores to documents based on their positions
    in multiple ranked lists, allowing for fusion of diverse ranking sources
    without the need for tuning.

    The score for each document is calculated using the formula:

        score = sum(1.0 / (k + rank(q, d)))

    where:
        - k is a ranking constant (1 is used here)
        - q is a query in the set of queries
        - d is a document in the result set
        - rank(q, d) is the position of d in the ranking list for q (starting from 1)
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Reranks elements using the Reciprocal Rank Fusion (RRF) algorithm.

        Args:
            elements: A list of ranked lists of elements to be fused.
            query: The query string for reranking.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        if len(elements) == 1:
            # A single result set has nothing to fuse. NOTE(review): options
            # (top_n / score_threshold) are not applied on this path — confirm intended.
            return elements[0]

        opts = (self.default_options | options) if options else self.default_options

        fused_scores: dict[str, float] = defaultdict(float)
        by_key: dict[str, Element] = {}

        for ranking in elements:
            for position, element in enumerate(ranking):
                # Elements without a key cannot be fused across lists.
                if not element.key:
                    continue
                # k = 1 with 1-based rank => contribution 1 / (position + 2).
                fused_scores[element.key] += 1 / (position + 1 + 1)
                by_key[element.key] = element

        ordered = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

        fused = []
        for key, fused_score in ordered[: opts.top_n or None]:
            if opts.score_threshold and fused_score < opts.score_threshold:
                continue
            if opts.override_score:
                by_key[key].score = fused_score
            fused.append(by_key[key])

        return fused
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragbits-document-search
|
|
3
|
+
Version: 1.4.0.dev202601310254
|
|
4
|
+
Summary: Document Search module for Ragbits
|
|
5
|
+
Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
|
|
6
|
+
Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
|
|
7
|
+
Project-URL: Documentation, https://ragbits.deepsense.ai/
|
|
8
|
+
Project-URL: Source, https://github.com/deepsense-ai/ragbits
|
|
9
|
+
Author-email: "deepsense.ai" <ragbits@deepsense.ai>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
Keywords: Document Search,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: docling[easyocr]<3.0.0,>=2.15.1
|
|
26
|
+
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
27
|
+
Requires-Dist: opencv-python<5.0.0.0,>=4.11.0.86
|
|
28
|
+
Requires-Dist: python-pptx<2.0.0,>=1.0.0
|
|
29
|
+
Requires-Dist: ragbits-core==1.4.0.dev202601310254
|
|
30
|
+
Requires-Dist: rerankers<1.0.0,>=0.6.1
|
|
31
|
+
Provides-Extra: ray
|
|
32
|
+
Requires-Dist: ray[data]<3.0.0,>=2.43.0; extra == 'ray'
|
|
33
|
+
Provides-Extra: unstructured
|
|
34
|
+
Requires-Dist: unstructured-client<1.0.0,>=0.26.0; extra == 'unstructured'
|
|
35
|
+
Requires-Dist: unstructured<1.0.0,>=0.16.9; extra == 'unstructured'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# Ragbits Document Search
|
|
39
|
+
|
|
40
|
+
Ragbits Document Search is a Python package that provides tools for building RAG applications. It helps ingest, index, and search documents to retrieve relevant information for your prompts.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
You can install the latest version of Ragbits Document Search using pip:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install ragbits-document-search
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Quickstart
|
|
51
|
+
```python
|
|
52
|
+
import asyncio
|
|
53
|
+
|
|
54
|
+
from ragbits.core.embeddings import LiteLLMEmbedder
|
|
55
|
+
from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
|
|
56
|
+
from ragbits.document_search import DocumentSearch
|
|
57
|
+
|
|
58
|
+
async def main() -> None:
|
|
59
|
+
"""
|
|
60
|
+
Run the example.
|
|
61
|
+
"""
|
|
62
|
+
embedder = LiteLLMEmbedder(
|
|
63
|
+
model_name="text-embedding-3-small",
|
|
64
|
+
)
|
|
65
|
+
vector_store = InMemoryVectorStore(embedder=embedder)
|
|
66
|
+
document_search = DocumentSearch(
|
|
67
|
+
vector_store=vector_store,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Ingest all .txt files from the "biographies" directory
|
|
71
|
+
await document_search.ingest("local://biographies/*.txt")
|
|
72
|
+
|
|
73
|
+
# Search the documents for the query
|
|
74
|
+
results = await document_search.search("When was Marie Curie-Sklodowska born?")
|
|
75
|
+
print(results)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
if __name__ == "__main__":
|
|
79
|
+
asyncio.run(main())
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Documentation
|
|
83
|
+
* [Quickstart 2: Adding RAG Capabilities](https://ragbits.deepsense.ai/quickstart/quickstart2_rag/)
|
|
84
|
+
* [How-To Guides - Document Search](https://ragbits.deepsense.ai/how-to/document_search/async_processing/)
|
|
85
|
+
* [API Reference - Document Search](https://ragbits.deepsense.ai/api_reference/document_search/)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
ragbits/document_search/__init__.py,sha256=uYbt6kSsfKpL3IY07CCpiRhsG77oZkZjV1pEuwBX1n0,135
|
|
2
|
+
ragbits/document_search/_main.py,sha256=25oeZQ9mWX2tewHi67Wh35wbW3F1LlFthay7HuyL2rI,12331
|
|
3
|
+
ragbits/document_search/cli.py,sha256=GYcm-_tMgW96kBf8Oj88PodjXnWKaXV_OalloK1cU8s,3020
|
|
4
|
+
ragbits/document_search/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
ragbits/document_search/documents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
ragbits/document_search/documents/document.py,sha256=yxi-07fduOeqM0ay9E_BbDm11nGfqtCxuM13cSAfd74,5158
|
|
7
|
+
ragbits/document_search/documents/element.py,sha256=hQCMyGn4C6bfR7lh1vcKZepQpSySLpaA7Zz3D33GHjA,5898
|
|
8
|
+
ragbits/document_search/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
ragbits/document_search/ingestion/enrichers/__init__.py,sha256=owoQ-Qw76Yow8cNNfzYtgpjzw3SySrLXvuZ4Wv0Iap0,325
|
|
10
|
+
ragbits/document_search/ingestion/enrichers/base.py,sha256=_vPNgJeZvrV3A8HVAZjCi18MGtDtVMaat8rUN9Vq1Do,2307
|
|
11
|
+
ragbits/document_search/ingestion/enrichers/exceptions.py,sha256=f3dFENXD8rBM6yDdxQWJM4-GzJf-RBrJqArKnTS6GwA,1007
|
|
12
|
+
ragbits/document_search/ingestion/enrichers/image.py,sha256=81lC6gO-6xGrdArDsQ6doDVQgZcsHuqQrw3DQ7G4sXw,3512
|
|
13
|
+
ragbits/document_search/ingestion/enrichers/router.py,sha256=pEFO3sRb3eY8DZhWEGOsPv0iSS0lGAkcXuk1VLtjwiI,2913
|
|
14
|
+
ragbits/document_search/ingestion/parsers/__init__.py,sha256=WMkY-JEi_FxbegnhArrLQC2Dh4Mtn5O7Fu_fsVCEngQ,315
|
|
15
|
+
ragbits/document_search/ingestion/parsers/base.py,sha256=b9zAlDnXi81_WkCJSUpsRl7hVRQkmpxcqBnmQ89GCUc,3174
|
|
16
|
+
ragbits/document_search/ingestion/parsers/docling.py,sha256=nAIQEzoBEyYr9IAlovz6FIRDjUIlnkeKgckIvj2R8C8,6785
|
|
17
|
+
ragbits/document_search/ingestion/parsers/exceptions.py,sha256=ymCw6XhBhtF2gqJ9n-QzLCjzuaaS__J_VKllgwyBFO4,1005
|
|
18
|
+
ragbits/document_search/ingestion/parsers/router.py,sha256=C2dQT4s5I-CDuS6DE6chQyLyLXfhPoRDYlINRVM0ph8,3168
|
|
19
|
+
ragbits/document_search/ingestion/parsers/unstructured.py,sha256=yfze3TQYPzMnV3SwhqUg04bTBURfhOZsK-zJ3JOxl1k,9453
|
|
20
|
+
ragbits/document_search/ingestion/parsers/pptx/__init__.py,sha256=X9vfcWO8Ght3U2vOlNZFhE1_PYM0TZdSoTLS8pj98jo,627
|
|
21
|
+
ragbits/document_search/ingestion/parsers/pptx/callbacks.py,sha256=G6-4fg11c_xCXcEmcnplEcxwJm6mRj3e_NxHz45nTKQ,799
|
|
22
|
+
ragbits/document_search/ingestion/parsers/pptx/exceptions.py,sha256=fIdt9osO2p1QFq0yDPDNLLwhRLq9bW-wJg41lgUz7iY,1868
|
|
23
|
+
ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py,sha256=Peg5jYX8ePUS4yDsc1sD-xMHqtPnzGuQ517wtkASccg,3287
|
|
24
|
+
ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py,sha256=raxrMJTFd2S--V6eAosbrhbislAoVZx2LdM8EE-7gww,3038
|
|
25
|
+
ragbits/document_search/ingestion/parsers/pptx/parser.py,sha256=h07vIPFpWLFcKiAY9IxIj-775xHs0QUDb1lTxNpO9u4,3243
|
|
26
|
+
ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py,sha256=xKET2hq_OYnuYS4B8QIln1Pc2iVCr--psfahXtISdA4,2851
|
|
27
|
+
ragbits/document_search/ingestion/strategies/__init__.py,sha256=R7Tx1HKrDE38NqxewCrbvsJ9BTWpvWrjv6RlKUQuJgQ,462
|
|
28
|
+
ragbits/document_search/ingestion/strategies/base.py,sha256=DVrzSZXjiiDqX0ZEy9UO7m96zRu0zOgLpOraFGA6spo,10573
|
|
29
|
+
ragbits/document_search/ingestion/strategies/batched.py,sha256=N1SA4Z5sVOB50CX5i7fpsHlsVX63hf4ApascL3LO4Og,10031
|
|
30
|
+
ragbits/document_search/ingestion/strategies/ray.py,sha256=sNBGBuRDBTfuVSCJXiaPFXjvQkXLfPuD9kOBQnQi8-I,6018
|
|
31
|
+
ragbits/document_search/ingestion/strategies/sequential.py,sha256=H81rULKg4A_di5bxkir4mslbrVWm1TxjdI7dSlaPYE0,880
|
|
32
|
+
ragbits/document_search/retrieval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
ragbits/document_search/retrieval/rephrasers/__init__.py,sha256=RHW1TjSuXwEE6qkUXF2iVnbpcK0iJExOZqihHS_njIQ,578
|
|
34
|
+
ragbits/document_search/retrieval/rephrasers/base.py,sha256=N5egUeRkhz5198LYLZk6zx9KVEvhrj5WjB-7m7etST4,1137
|
|
35
|
+
ragbits/document_search/retrieval/rephrasers/llm.py,sha256=DLAXiFPNExdPGd4Yrg1FxEqVurN99L7qpgrUR_faTjw,5673
|
|
36
|
+
ragbits/document_search/retrieval/rephrasers/noop.py,sha256=4aFtxWCpueVg-uOFc247v3zS_cEB6Bsex37hhSdyxpk,832
|
|
37
|
+
ragbits/document_search/retrieval/rerankers/__init__.py,sha256=QFgBmjkHFjsVS89EpnUoUw36IpjXB45OcLUIcRjCZtE,220
|
|
38
|
+
ragbits/document_search/retrieval/rerankers/answerai.py,sha256=3bx0ao19pXa3vcgptfCbLxmmQL4KimV8AxVtj8kaYqk,2987
|
|
39
|
+
ragbits/document_search/retrieval/rerankers/base.py,sha256=oTHFI4hBNJvPL9s0flZYhwG5xZ9FIflThSuuy5JtgpA,1649
|
|
40
|
+
ragbits/document_search/retrieval/rerankers/litellm.py,sha256=C5pum0YGrQPT3raUD7hYVmAOM5bNHa8MsgWZLoEKTp4,2898
|
|
41
|
+
ragbits/document_search/retrieval/rerankers/llm.py,sha256=YcKu4OqPhChfL1BUygim0xBLtiHOOJqI4oNF6ilYtPc,5902
|
|
42
|
+
ragbits/document_search/retrieval/rerankers/noop.py,sha256=9v381IrfLbNYHQaecTNkQHN6rLZx-WB5QqqA5XKrF20,1072
|
|
43
|
+
ragbits/document_search/retrieval/rerankers/rrf.py,sha256=y281Rucv3fuyMNLjJrikqD5--wLu0KQcwbeI6WxmPao,2628
|
|
44
|
+
ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA,sha256=joisMG3fGB6M4XETPZyxUeFH8Ts0hMjUg5VzWw85enc,3261
|
|
45
|
+
ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
46
|
+
ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD,,
|