ragbandit-core 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,353 @@
1
+ """
2
+ Footnote processor for detecting, processing,
3
+ and handling footnotes in documents.
4
+
5
+ This processor identifies footnotes in a document, categorizes them as either
6
+ references or explanations, and processes them accordingly:
7
+ - Explanation footnotes are inlined where they are referenced
8
+ - Citation/reference footnotes are collected and returned separately
9
+ """
10
+
11
+ from difflib import SequenceMatcher
12
+
13
+ from ragbandit.documents.processors.base_processor import BaseProcessor
14
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
15
+
16
+ from ragbandit.prompt_tools.footnotes_processor_tools import (
17
+ detect_footnote_section_tool,
18
+ FootnoteStart,
19
+ detect_footnote_start_tool,
20
+ classify_footnote_tool,
21
+ replace_footnote_inline_operation
22
+ )
23
+ from ragbandit.schema import (
24
+ OCRResult,
25
+ ProcessingResult,
26
+ )
27
+
28
+
29
class FootnoteProcessor(BaseProcessor):
    """Processor for detecting and handling footnotes in documents.

    This processor:
    1. Detects footnote sections at the bottom of each page
    2. Processes each footnote to determine if it's a citation or explanation
    3. Inlines explanation footnotes where they are referenced
    4. Collects citation footnotes for inclusion in references
    5. Returns the modified document, with the extracted footnote
       references stored in its ``extracted_data``
    """

    def __init__(self, name: str | None = None, api_key: str | None = None):
        """Initialize the footnote processor.

        Args:
            name: Optional name for the processor
            api_key: API key for LLM services
        """
        super().__init__(name, api_key)

    def process(
        self,
        document: OCRResult | ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ProcessingResult:
        """Process OCR pages to detect and handle footnotes.

        Args:
            document: OCR response or ProcessingResult to process
            usage_tracker: Token usage tracker for LLM calls

        Returns:
            The modified ProcessingResult. When citation/link footnotes
            were found, they are stored under
            ``extracted_data["footnote_refs"]`` as a dictionary mapping
            page index to a list of ``{"symbol", "text"}`` dictionaries.
        """
        # Normalise input to ProcessingResult once, then delegate
        proc_input = self.ensure_processing_result(
            document, processor_name=str(self)
        )

        proc_result, footnote_refs = self.process_footnotes(
            proc_input, usage_tracker
        )

        # Embed footnote references into extracted_data for downstream use
        if footnote_refs:
            if proc_result.extracted_data is None:
                proc_result.extracted_data = {}
            proc_result.extracted_data["footnote_refs"] = footnote_refs

        return proc_result

    def process_footnotes(
        self,
        proc_result: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[ProcessingResult, dict]:
        """Process footnotes in document pages.

        This method identifies footnote sections in each page, processes them,
        and handles them based on their category (explanation or citation).

        Args:
            proc_result: The document to process (already a ProcessingResult)
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing:
                - Modified ProcessingResult with footnotes processed
                - Dictionary of footnote references by page
        """
        # One LLM call per page; results are keyed by the page's own index
        footnote_sections: dict[int, str] = {}
        for page in proc_result.pages:
            page_footnote_section = detect_footnote_section_tool(
                api_key=self.api_key,
                ocr_response_page=page.markdown,
                usage_tracker=usage_tracker,
            )
            footnote_sections[page.index] = (
                page_footnote_section.footnote_section
            )

        # Clean up footnote sections
        footnote_sections = self._clean_footnote_sections(footnote_sections)

        # Split footnote sections into individual footnotes
        footnotes_listed = self._split_footnote_sections(footnote_sections)

        # Process and categorize each footnote
        footnotes_explained = self._categorize_footnotes(
            footnotes_listed, usage_tracker
        )

        # Process footnotes based on their category and update document
        proc_result, footnote_refs = self._process_footnotes_by_category(
            proc_result, footnotes_explained, footnotes_listed, usage_tracker
        )

        return proc_result, footnote_refs

    def _clean_footnote_sections(self, footnote_sections: dict) -> dict:
        """Clean up footnote sections by removing common junk markers.

        Every ``[^0]:`` marker is stripped from each section (the same
        marker is stripped from every page line in
        ``_remove_footnotes_by_line``, so both sides of the later
        similarity comparison are normalised the same way). Sections that
        are empty or ``None`` after cleaning are dropped.

        Args:
            footnote_sections: Dictionary mapping page index
                to footnote section text. It is updated in place.

        Returns:
            The same dictionary, containing only non-empty cleaned sections
        """
        junk_marker = "[^0]:"

        # Iterate over a snapshot so entries can be deleted while looping
        for page_index, section in list(footnote_sections.items()):
            cleaned = section.replace(junk_marker, "") if section else section
            if cleaned:
                footnote_sections[page_index] = cleaned
            else:
                # No actual footnotes on this page
                del footnote_sections[page_index]

        return footnote_sections

    def _split_footnote_sections(self, footnote_sections: dict) -> dict:
        """Split footnote sections into individual footnotes.

        Each section is split on newlines. Lines of 5 characters or fewer
        (after stripping whitespace) are treated as formatting junk and
        discarded. Pages left without any footnote are omitted.

        Args:
            footnote_sections: Dictionary mapping page index
                to footnote section text

        Returns:
            Dictionary mapping page index to list of individual footnotes
        """
        min_length = 5
        footnotes_listed = {}
        for page_index, section in footnote_sections.items():
            raw_lines = section.split("\n")

            # Preferred: whitespace-stripped lines that are long enough
            stripped_lines = [line.strip() for line in raw_lines]
            footnotes = [
                line for line in stripped_lines if len(line) > min_length
            ]

            if not footnotes:
                # Stripping may have removed vital information, so fall
                # back to the unstripped lines that are long enough
                footnotes = [
                    line for line in raw_lines if len(line) > min_length
                ]

            if footnotes:
                footnotes_listed[page_index] = footnotes

        return footnotes_listed

    def _get_footnote_symbol(
        self,
        footnote_start: FootnoteStart,
        footnote: str,
    ) -> tuple[str, str]:
        """Split a footnote line into its symbol and its text.

        The symbol is whatever precedes the detected start of the footnote
        text; the text is everything from that start onwards.

        Args:
            footnote_start: Detection result whose ``footnote_start``
                attribute holds the first characters of the footnote text
            footnote: The full footnote line

        Returns:
            Tuple of (footnote symbol, footnote text), both stripped. If
            the detected start cannot be located in ``footnote``, the
            symbol is empty and the whole footnote is returned as text.
        """
        fn_start = footnote_start.footnote_start

        start_index = footnote.find(fn_start) if fn_start else -1
        if start_index < 0:
            # str.find returns -1 on a miss; slicing with -1 would put all
            # but the last character into the symbol. Treat the whole line
            # as text instead.
            return "", footnote.strip()

        footnote_symbol = footnote[:start_index].strip()
        footnote_text = footnote[start_index:].strip()

        return footnote_symbol, footnote_text

    def _categorize_footnotes(
        self,
        footnotes_listed: dict,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> dict:
        """Categorize footnotes as citations or explanations.

        Args:
            footnotes_listed: Dictionary mapping page index
                to list of footnotes
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Dictionary mapping page index to a list of dictionaries with
            keys ``footnote_symbol``, ``footnote_text`` and ``category``
        """
        footnotes_explained = {}
        for page_index, page_footnotes in footnotes_listed.items():
            categorized = []
            for footnote in page_footnotes:
                # Get footnote symbol and text
                footnote_start = detect_footnote_start_tool(
                    api_key=self.api_key,
                    footnote=footnote,
                    usage_tracker=usage_tracker,
                )
                footnote_symbol, footnote_text = self._get_footnote_symbol(
                    footnote_start, footnote
                )

                # Based on the text, classify the footnote.
                # It can either be another reference OR an explanation.
                # Explanations are later inlined, and references are
                # added to the references.
                footnote_classification = classify_footnote_tool(
                    api_key=self.api_key,
                    footnote_text=footnote_text,
                    usage_tracker=usage_tracker,
                )
                categorized.append(
                    {
                        "footnote_symbol": footnote_symbol,
                        "footnote_text": footnote_text,
                        "category": footnote_classification.category.value,
                    }
                )
            footnotes_explained[page_index] = categorized

        return footnotes_explained

    def _process_footnotes_by_category(
        self,
        proc_result: ProcessingResult,
        footnotes_explained: dict,
        footnotes_listed: dict,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[ProcessingResult, dict]:
        """Process footnotes based on their category and update document.

        Footnotes categorized as ``"link"`` or ``"citation"`` are collected
        as references; all others are inlined where they are called. The
        original footnote lines are then removed from the page markdown.

        Args:
            proc_result: ProcessingResult containing document pages
            footnotes_explained: Dictionary mapping page index to
                categorized footnotes
            footnotes_listed: Dictionary mapping page index to
                original footnote text
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing:
                - Modified ProcessingResult with footnotes processed
                - Dictionary of footnote references by page
        """
        # The dictionaries are keyed by page.index, which is not guaranteed
        # to equal the page's position in proc_result.pages, so resolve
        # pages through their own index.
        pages_by_index = {page.index: page for page in proc_result.pages}

        footnote_refs: dict[int, list[dict]] = {}
        for page_index, page_footnotes in footnotes_explained.items():
            page = pages_by_index.get(page_index)
            if page is None:
                # Keep supporting callers that key by list position
                page = proc_result.pages[page_index]
            page_markdown = page.markdown

            for footnote in page_footnotes:
                footnote_category = footnote.get("category", "")
                if footnote_category in ("link", "citation"):
                    # Citation: add it to the footnote refs
                    footnote_refs.setdefault(page_index, []).append(
                        {
                            "symbol": footnote.get("footnote_symbol", ""),
                            "text": footnote.get("footnote_text", ""),
                        }
                    )
                else:
                    # Explanation: inline it where the footnote is called
                    page_markdown = replace_footnote_inline_operation(
                        self.api_key,
                        footnote,
                        page_markdown,
                        usage_tracker,
                    )

            # Delete footnote sections.
            # Use the footnotes as originally listed for a closer match,
            # instead of the processed footnote.
            for footnote_as_line in footnotes_listed.get(page_index, []):
                page_markdown = self._remove_footnotes_by_line(
                    page_markdown, footnote_as_line
                )
            page.markdown = page_markdown

        return proc_result, footnote_refs

    def _remove_footnotes_by_line(
        self, markdown: str, target_header: str, threshold: float = 0.95
    ) -> str:
        """Remove footnote lines from markdown text.

        Every line whose cleaned content is at least ``threshold`` similar
        to the target is blanked out. The line's terminator is kept, so
        the surrounding line structure of the markdown is unchanged.

        Args:
            markdown: The markdown text to process
            target_header: The footnote line to remove
            threshold: Similarity threshold for matching lines

        Returns:
            Updated markdown with footnote lines removed
        """
        # Normalize the target header line.
        target_line = target_header.strip()

        # SequenceMatcher caches its analysis of the second sequence, so
        # set the target once and only swap the first sequence per line.
        matcher = SequenceMatcher(None, "", target_line)

        pieces = []
        line_contents = markdown.splitlines()
        lines_with_ends = markdown.splitlines(keepends=True)
        for content, raw in zip(line_contents, lines_with_ends):
            # Compare each line after removing markers and extra whitespace.
            clean_line = (
                content.replace("[^0]:", "").replace("[^0]", "").strip()
            )
            matcher.set_seq1(clean_line)
            if matcher.ratio() >= threshold:
                # Drop exactly this line's content (not the first textual
                # occurrence elsewhere) and keep only its line ending.
                pieces.append(raw[len(content):])
            else:
                pieces.append(raw)

        return "".join(pieces)