ragbandit-core 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbandit/__init__.py +26 -0
- ragbandit/config/__init__.py +3 -0
- ragbandit/config/llms.py +34 -0
- ragbandit/config/pricing.py +38 -0
- ragbandit/documents/__init__.py +66 -0
- ragbandit/documents/chunkers/__init__.py +18 -0
- ragbandit/documents/chunkers/base_chunker.py +201 -0
- ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
- ragbandit/documents/chunkers/semantic_chunker.py +205 -0
- ragbandit/documents/document_pipeline.py +350 -0
- ragbandit/documents/embedders/__init__.py +14 -0
- ragbandit/documents/embedders/base_embedder.py +82 -0
- ragbandit/documents/embedders/mistral_embedder.py +129 -0
- ragbandit/documents/ocr/__init__.py +13 -0
- ragbandit/documents/ocr/base_ocr.py +136 -0
- ragbandit/documents/ocr/mistral_ocr.py +147 -0
- ragbandit/documents/processors/__init__.py +16 -0
- ragbandit/documents/processors/base_processor.py +88 -0
- ragbandit/documents/processors/footnotes_processor.py +353 -0
- ragbandit/documents/processors/references_processor.py +408 -0
- ragbandit/documents/utils/__init__.py +11 -0
- ragbandit/documents/utils/secure_file_handler.py +95 -0
- ragbandit/prompt_tools/__init__.py +27 -0
- ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
- ragbandit/prompt_tools/prompt_tool.py +118 -0
- ragbandit/prompt_tools/references_processor_tools.py +31 -0
- ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
- ragbandit/schema.py +206 -0
- ragbandit/utils/__init__.py +19 -0
- ragbandit/utils/in_memory_log_handler.py +33 -0
- ragbandit/utils/llm_utils.py +188 -0
- ragbandit/utils/mistral_client.py +76 -0
- ragbandit/utils/token_usage_tracker.py +220 -0
- ragbandit_core-0.1.1.dist-info/METADATA +145 -0
- ragbandit_core-0.1.1.dist-info/RECORD +38 -0
- ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
- ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
- ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Footnote processor for detecting, processing,
|
|
3
|
+
and handling footnotes in documents.
|
|
4
|
+
|
|
5
|
+
This processor identifies footnotes in a document, categorizes them as either
|
|
6
|
+
references or explanations, and processes them accordingly:
|
|
7
|
+
- Explanation footnotes are inlined where they are referenced
|
|
8
|
+
- Citation/reference footnotes are collected and returned separately
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from difflib import SequenceMatcher
|
|
12
|
+
|
|
13
|
+
from ragbandit.documents.processors.base_processor import BaseProcessor
|
|
14
|
+
from ragbandit.utils.token_usage_tracker import TokenUsageTracker
|
|
15
|
+
|
|
16
|
+
from ragbandit.prompt_tools.footnotes_processor_tools import (
|
|
17
|
+
detect_footnote_section_tool,
|
|
18
|
+
FootnoteStart,
|
|
19
|
+
detect_footnote_start_tool,
|
|
20
|
+
classify_footnote_tool,
|
|
21
|
+
replace_footnote_inline_operation
|
|
22
|
+
)
|
|
23
|
+
from ragbandit.schema import (
|
|
24
|
+
OCRResult,
|
|
25
|
+
ProcessingResult,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class FootnoteProcessor(BaseProcessor):
|
|
30
|
+
"""Processor for detecting and handling footnotes in documents.
|
|
31
|
+
|
|
32
|
+
This processor:
|
|
33
|
+
1. Detects footnote sections at the bottom of each page
|
|
34
|
+
2. Processes each footnote to determine if it's a citation or explanation
|
|
35
|
+
3. Inlines explanation footnotes where they are referenced
|
|
36
|
+
4. Collects citation footnotes for inclusion in references
|
|
37
|
+
5. Returns the modified document and the extracted footnote references
|
|
38
|
+
"""
|
|
39
|
+
def __init__(self, name: str | None = None, api_key: str | None = None):
    """Initialize the footnote processor.

    Args:
        name: Optional name for the processor
        api_key: API key for LLM services
    """
    # Both arguments are forwarded unchanged to the base processor.
    super().__init__(name, api_key)
|
|
47
|
+
|
|
48
|
+
def process(
    self,
    document: OCRResult | ProcessingResult,
    usage_tracker: TokenUsageTracker | None = None,
) -> ProcessingResult:
    """Detect and handle the footnotes of a document.

    Args:
        document: OCR response or ProcessingResult to process
        usage_tracker: Token usage tracker for LLM calls

    Returns:
        The ProcessingResult produced by ``process_footnotes``. When
        footnote references were collected, they are stored under
        ``extracted_data["footnote_refs"]``.
    """
    # Normalise the input once so the rest of the pipeline only ever
    # deals with a ProcessingResult.
    normalized = self.ensure_processing_result(
        document, processor_name=str(self)
    )

    result, refs_by_page = self.process_footnotes(normalized, usage_tracker)

    # Nothing collected: hand the result back without touching
    # extracted_data.
    if not refs_by_page:
        return result

    # Expose the references to downstream consumers.
    if result.extracted_data is None:
        result.extracted_data = {}
    result.extracted_data["footnote_refs"] = refs_by_page

    return result
|
|
81
|
+
|
|
82
|
+
def process_footnotes(
    self,
    proc_result: ProcessingResult,
    usage_tracker: TokenUsageTracker | None = None,
) -> tuple[ProcessingResult, dict]:
    """Run the full footnote pipeline over every page of a document.

    The stages are: detect the footnote section of each page, clean the
    sections, split them into individual footnotes, categorize each
    footnote, and finally apply the category-specific handling to the
    page markdown.

    Args:
        proc_result: The document to process (already a ProcessingResult)
        usage_tracker: Optional tracker for token usage in LLM calls

    Returns:
        Tuple containing:
            - Modified ProcessingResult with footnotes processed
            - Dictionary of footnote references by page
    """
    # Stage 1: one detection call per page, keyed by the page's index.
    sections_by_page: dict[int, str] = {}
    for page in proc_result.pages:
        detection = detect_footnote_section_tool(
            api_key=self.api_key,
            ocr_response_page=page.markdown,
            usage_tracker=usage_tracker,
        )
        sections_by_page[page.index] = detection.footnote_section

    # Stage 2: strip junk and drop pages without footnotes.
    sections_by_page = self._clean_footnote_sections(sections_by_page)

    # Stage 3: one list of footnote lines per page.
    lines_by_page = self._split_footnote_sections(sections_by_page)

    # Stage 4: symbol / text / category for every footnote.
    categorized_by_page = self._categorize_footnotes(
        lines_by_page, usage_tracker
    )

    # Stage 5: inline or collect footnotes and update the page markdown.
    updated_result, refs_by_page = self._process_footnotes_by_category(
        proc_result, categorized_by_page, lines_by_page, usage_tracker
    )
    return updated_result, refs_by_page
|
|
129
|
+
|
|
130
|
+
def _clean_footnote_sections(self, footnote_sections: dict) -> None:
|
|
131
|
+
"""Clean up footnote sections by removing common junk characters.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
footnote_sections: Dictionary mapping page index
|
|
135
|
+
to footnote section text
|
|
136
|
+
"""
|
|
137
|
+
# Remove commonly occurring junk char [^0]
|
|
138
|
+
remove_char = "[^0]:"
|
|
139
|
+
remove_char_len = len(remove_char)
|
|
140
|
+
|
|
141
|
+
for page_index, footnote in footnote_sections.items():
|
|
142
|
+
if footnote:
|
|
143
|
+
remove_char_index = footnote.find(remove_char)
|
|
144
|
+
if remove_char_index >= 0:
|
|
145
|
+
footnote = (
|
|
146
|
+
footnote[0:remove_char_index]
|
|
147
|
+
+ footnote[(remove_char_index + remove_char_len):]
|
|
148
|
+
)
|
|
149
|
+
footnote_sections[page_index] = footnote
|
|
150
|
+
|
|
151
|
+
# Delete footnote sections without actual footnotes
|
|
152
|
+
page_index_no_footnotes = [
|
|
153
|
+
page_index
|
|
154
|
+
for page_index in footnote_sections
|
|
155
|
+
if len(footnote_sections[page_index]) == 0
|
|
156
|
+
]
|
|
157
|
+
for page_index in page_index_no_footnotes:
|
|
158
|
+
del footnote_sections[page_index]
|
|
159
|
+
|
|
160
|
+
return footnote_sections
|
|
161
|
+
|
|
162
|
+
def _split_footnote_sections(self, footnote_sections: dict) -> dict:
|
|
163
|
+
"""Split footnote sections into individual footnotes.
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
footnote_sections: Dictionary mapping page index
|
|
167
|
+
to footnote section text
|
|
168
|
+
|
|
169
|
+
Returns:
|
|
170
|
+
Dictionary mapping page index to list of individual footnotes
|
|
171
|
+
"""
|
|
172
|
+
footnotes_listed = {}
|
|
173
|
+
for page_index in footnote_sections:
|
|
174
|
+
# Split footnote section into list
|
|
175
|
+
footnotes_list = footnote_sections[page_index].split("\n")
|
|
176
|
+
clean_footnote_list = []
|
|
177
|
+
for footnote in footnotes_list:
|
|
178
|
+
# Remove extra spaces and newlines
|
|
179
|
+
stripped_footnote = footnote.strip()
|
|
180
|
+
# If footnote less than 5 characters,
|
|
181
|
+
# assume it's formatting junk
|
|
182
|
+
# And only include footnotes longer than 5 characters
|
|
183
|
+
if len(stripped_footnote) > 5:
|
|
184
|
+
clean_footnote_list.append(stripped_footnote)
|
|
185
|
+
|
|
186
|
+
if clean_footnote_list:
|
|
187
|
+
# Use clean footnote list
|
|
188
|
+
footnotes_listed[page_index] = clean_footnote_list
|
|
189
|
+
else:
|
|
190
|
+
# No clean footnote list available
|
|
191
|
+
# Assume cleaning process possibly removed vital information
|
|
192
|
+
# If footnotes are longer than 5 chars, include them
|
|
193
|
+
safe_footnote_list = []
|
|
194
|
+
for footnote in footnotes_list:
|
|
195
|
+
if len(footnote) > 5:
|
|
196
|
+
safe_footnote_list.append(footnote)
|
|
197
|
+
if safe_footnote_list:
|
|
198
|
+
footnotes_listed[page_index] = safe_footnote_list
|
|
199
|
+
|
|
200
|
+
return footnotes_listed
|
|
201
|
+
|
|
202
|
+
def _get_footnote_symbol(
|
|
203
|
+
self,
|
|
204
|
+
footnote_start: FootnoteStart,
|
|
205
|
+
footnote: str,
|
|
206
|
+
) -> tuple[str, str]:
|
|
207
|
+
fn_start = footnote_start.footnote_start
|
|
208
|
+
|
|
209
|
+
footnote_start_index = footnote.find(fn_start)
|
|
210
|
+
footnote_symbol = footnote[0:footnote_start_index].strip()
|
|
211
|
+
footnote_text = footnote[footnote_start_index:].strip()
|
|
212
|
+
|
|
213
|
+
return footnote_symbol, footnote_text
|
|
214
|
+
|
|
215
|
+
def _categorize_footnotes(
    self,
    footnotes_listed: dict,
    usage_tracker: TokenUsageTracker | None = None,
) -> dict:
    """Attach a symbol, text and category to every footnote.

    Args:
        footnotes_listed: Dictionary mapping page index
                          to list of footnotes
        usage_tracker: Optional tracker for token usage in LLM calls

    Returns:
        Dictionary mapping page index to a list of dictionaries with the
        keys ``footnote_symbol``, ``footnote_text`` and ``category``
    """
    categorized_by_page = {}

    for page_index, page_footnotes in footnotes_listed.items():
        entries = []
        categorized_by_page[page_index] = entries

        for raw_footnote in page_footnotes:
            # Locate where the footnote text begins, then separate the
            # leading symbol from the text.
            start = detect_footnote_start_tool(
                api_key=self.api_key,
                footnote=raw_footnote,
                usage_tracker=usage_tracker,
            )
            symbol, text = self._get_footnote_symbol(start, raw_footnote)

            # The text decides the category: a footnote is either
            # another reference or an explanation. Explanations are
            # later inlined, references are added to the references.
            classification = classify_footnote_tool(
                api_key=self.api_key,
                footnote_text=text,
                usage_tracker=usage_tracker,
            )

            entries.append(
                {
                    "footnote_symbol": symbol,
                    "footnote_text": text,
                    "category": classification.category.value,
                }
            )

    return categorized_by_page
|
|
263
|
+
|
|
264
|
+
def _process_footnotes_by_category(
|
|
265
|
+
self,
|
|
266
|
+
proc_result: ProcessingResult,
|
|
267
|
+
footnotes_explained: dict,
|
|
268
|
+
footnotes_listed: dict,
|
|
269
|
+
usage_tracker: TokenUsageTracker | None = None,
|
|
270
|
+
) -> tuple[ProcessingResult, dict]:
|
|
271
|
+
"""Process footnotes based on their category and update document.
|
|
272
|
+
|
|
273
|
+
Args:
|
|
274
|
+
proc_result: ProcessingResult containing document pages
|
|
275
|
+
footnotes_explained: Dictionary mapping page index to
|
|
276
|
+
categorized footnotes
|
|
277
|
+
footnotes_listed: Dictionary mapping page index to
|
|
278
|
+
original footnote text
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
Tuple containing:
|
|
282
|
+
- Modified ProcessingResult with footnotes processed
|
|
283
|
+
- Dictionary of footnote references by page
|
|
284
|
+
"""
|
|
285
|
+
footnote_refs: dict[int, list[dict]] = {}
|
|
286
|
+
for page_index in footnotes_explained:
|
|
287
|
+
page_markdown = proc_result.pages[page_index].markdown
|
|
288
|
+
for footnote in footnotes_explained[page_index]:
|
|
289
|
+
footnote_category = footnote.get("category", "")
|
|
290
|
+
# If footnote is a citation, add it to footnote refs
|
|
291
|
+
if (
|
|
292
|
+
footnote_category == "link"
|
|
293
|
+
or footnote_category == "citation"
|
|
294
|
+
):
|
|
295
|
+
footnote_ref = {
|
|
296
|
+
"symbol": footnote.get("footnote_symbol", ""),
|
|
297
|
+
"text": footnote.get("footnote_text", ""),
|
|
298
|
+
}
|
|
299
|
+
if page_index not in footnote_refs.keys():
|
|
300
|
+
footnote_refs[page_index] = []
|
|
301
|
+
footnote_refs[page_index].append(footnote_ref)
|
|
302
|
+
else:
|
|
303
|
+
# If footnote is an explanation,
|
|
304
|
+
# inline it where the footnote is called
|
|
305
|
+
page_markdown = replace_footnote_inline_operation(
|
|
306
|
+
self.api_key,
|
|
307
|
+
footnote,
|
|
308
|
+
page_markdown,
|
|
309
|
+
usage_tracker
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
# Delete footnote sections
|
|
313
|
+
# Use footnotes as line for closer match,
|
|
314
|
+
# instead of processed footnote
|
|
315
|
+
for footnote_as_line in footnotes_listed[page_index]:
|
|
316
|
+
page_markdown = self._remove_footnotes_by_line(
|
|
317
|
+
page_markdown, footnote_as_line
|
|
318
|
+
)
|
|
319
|
+
proc_result.pages[page_index].markdown = page_markdown
|
|
320
|
+
|
|
321
|
+
return proc_result, footnote_refs
|
|
322
|
+
|
|
323
|
+
def _remove_footnotes_by_line(
|
|
324
|
+
self, markdown: str, target_header: str, threshold=0.95
|
|
325
|
+
) -> str:
|
|
326
|
+
"""Remove footnote lines from markdown text.
|
|
327
|
+
|
|
328
|
+
Args:
|
|
329
|
+
markdown: The markdown text to process
|
|
330
|
+
target_header: The footnote line to remove
|
|
331
|
+
threshold: Similarity threshold for matching lines
|
|
332
|
+
|
|
333
|
+
Returns:
|
|
334
|
+
Updated markdown with footnote lines removed
|
|
335
|
+
"""
|
|
336
|
+
lines = markdown.splitlines()
|
|
337
|
+
# Normalize the target header line.
|
|
338
|
+
target_line = target_header.strip()
|
|
339
|
+
|
|
340
|
+
for i, line in enumerate(lines):
|
|
341
|
+
# Compare each line after stripping extra whitespace.
|
|
342
|
+
clean_line = line.replace("[^0]:", "").replace("[^0]", "").strip()
|
|
343
|
+
if (
|
|
344
|
+
SequenceMatcher(None, clean_line, target_line).ratio()
|
|
345
|
+
>= threshold
|
|
346
|
+
):
|
|
347
|
+
# Remove the matched line from the markdown
|
|
348
|
+
line_start_index = markdown.find(line)
|
|
349
|
+
markdown = (
|
|
350
|
+
markdown[0:line_start_index]
|
|
351
|
+
+ markdown[(line_start_index + len(line)):]
|
|
352
|
+
)
|
|
353
|
+
return markdown
|