ragbandit-core 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,408 @@
1
+ """
2
+ References processor for detecting and removing reference
3
+ sections from documents.
4
+
5
+ This processor identifies the references section header in a document and
6
+ extracts the references content, removing it from the main document text.
7
+ """
8
+
9
+ import re
10
+ from difflib import SequenceMatcher
11
+
12
+ from ragbandit.documents.processors.base_processor import BaseProcessor
13
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
14
+ from ragbandit.prompt_tools.references_processor_tools import (
15
+ detect_references_header_tool,
16
+ )
17
+ from ragbandit.schema import OCRResult, ProcessingResult
18
+
19
+
20
class ReferencesProcessor(BaseProcessor):
    """Processor for detecting and removing references sections from documents.

    This processor:
    1. Extracts markdown headers from the OCR pages
    2. Identifies the references section header using an LLM
    3. Removes the references section from the document
    4. Stores the extracted references markdown in the result's
       ``extracted_data``
    """

    def __init__(self, name: str | None = None, api_key: str | None = None):
        """Initialize the references processor.

        Args:
            name: Optional name for the processor
            api_key: API key for LLM services
        """
        super().__init__(name, api_key)

    def process(
        self,
        document: OCRResult | ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ProcessingResult:
        """Process OCR pages to detect and remove references.

        Args:
            document: OCRResult or ProcessingResult to process
            usage_tracker: Token usage tracker for LLM calls

        Returns:
            Modified ProcessingResult with the references section removed.
            If a references section was found, its markdown is stored under
            ``extracted_data["references_markdown"]``.
        """
        # Normalize input once so downstream helpers always see a
        # ProcessingResult.
        proc_input = self.ensure_processing_result(
            document, processor_name=str(self)
        )

        proc_result, references_markdown = self.remove_refs(
            proc_input, usage_tracker
        )

        # Save extracted references into processing result metadata.
        if references_markdown:
            if proc_result.extracted_data is None:
                proc_result.extracted_data = {}
            proc_result.extracted_data["references_markdown"] = (
                references_markdown
            )

        return proc_result

    def find_best_match(
        self, target: str, string_list: list[str]
    ) -> tuple[str, int]:
        """Find the string in string_list most similar to the target string.

        Similarity is computed case-insensitively with
        ``difflib.SequenceMatcher``.

        Args:
            target: The string to search for
            string_list: List of strings to search through

        Returns:
            A tuple containing (best matching string, index of best match).
            If either input is empty, returns ("", -1).
        """
        if not string_list or not target:
            return "", -1

        def similarity_ratio(s1: str, s2: str) -> float:
            return SequenceMatcher(None, s1.lower(), s2.lower()).ratio()

        best_idx = max(
            range(len(string_list)),
            key=lambda i: similarity_ratio(target, string_list[i]),
        )
        return string_list[best_idx], best_idx

    def remove_refs(
        self,
        proc_result: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[ProcessingResult, str]:
        """Remove references section from document and extract as markdown.

        This method identifies the references section in a document,
        extracts it, and removes it from the original document.

        Args:
            proc_result: The document to process (ProcessingResult)
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing:
            - Modified ProcessingResult with references removed
            - Extracted references as markdown ("" when nothing was removed)
        """
        # Extract headers and identify references section.
        headers = self._extract_headers(proc_result)
        refs_header, refs_header_index = self._identify_references_header(
            headers, usage_tracker
        )

        # If no references header found, return original document unchanged.
        if not refs_header:
            return proc_result, ""

        # Find next header (if any) after references.
        next_header = self._find_next_header(headers, refs_header_index)

        # Find page boundaries of references section.
        boundaries = self._find_reference_boundaries(
            proc_result, refs_header, next_header
        )

        # If boundaries couldn't be determined, return original document.
        if not boundaries:
            return proc_result, ""

        # Extract references and modify document.
        return self._extract_references(proc_result, boundaries)

    def _extract_headers(self, proc_result: ProcessingResult) -> list[str]:
        """Extract all markdown headers from the document.

        Args:
            proc_result: ProcessingResult containing document pages

        Returns:
            List of headers (including surrounding whitespace captured by
            the regex) in order of appearance.
        """
        # ATX headers (# to ######); the negative lookahead rejects lines
        # containing "|" so markdown table rows with "#" cells are skipped.
        header_regex = re.compile(
            r"(?im)(\s*#{1,6}\s*(?![^\n]*\|)[^\n]+(?:\n|$))"
        )

        # Search the concatenated markdown of all pages. NOTE(review):
        # pages are joined without a separator — if a page does not end
        # with a newline, text can fuse across the page boundary. Kept
        # as-is because later containment checks compare captured headers
        # against individual pages' markdown.
        full_markdown = "".join(page.markdown for page in proc_result.pages)
        return header_regex.findall(full_markdown)

    def _identify_references_header(
        self,
        headers: list[str],
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[str, int]:
        """Identify the references header from a list of headers.

        Args:
            headers: List of headers to search through
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing the references header and its index, or
            ("", -1) when there are no headers.
        """
        if not headers:
            return "", -1

        # Use LLM to identify the most likely references header.
        refs = detect_references_header_tool(
            api_key=self.api_key,
            usage_tracker=usage_tracker,
            headers_list=headers
        )

        # Map the LLM's (possibly paraphrased) answer back onto the
        # closest actual header from the document.
        return self.find_best_match(refs.references_header, headers)

    def _find_next_header(
        self, headers: list[str], refs_header_index: int
    ) -> str | None:
        """Find the next header after the references header.

        Args:
            headers: List of all headers
            refs_header_index: Index of the references header

        Returns:
            Next header if it exists, None otherwise
        """
        if refs_header_index < 0 or (refs_header_index + 1) >= len(headers):
            return None
        return headers[refs_header_index + 1]

    def _find_reference_boundaries(
        self,
        proc_result: ProcessingResult,
        refs_header: str,
        next_header: str | None,
    ) -> dict | None:
        """Find the boundaries of the references section.

        Args:
            proc_result: ProcessingResult containing document pages
            refs_header: The identified references header
            next_header: The next header after references (if any)

        Returns:
            Dictionary with keys "start", "end", "refs_header",
            "next_header" — "start"/"end" are (page_index, char_index)
            tuples, "end" may be None — or None if the references header
            was not found on any page.
        """
        refs_page = -1
        next_header_page = -1

        # Find the pages where references start and end. NOTE(review):
        # page.index is assumed to equal the page's position in
        # proc_result.pages, since it is used below to index the list —
        # TODO confirm against the OCR schema.
        for page in proc_result.pages:
            if refs_header in page.markdown:
                refs_page = page.index
            if next_header is not None and next_header in page.markdown:
                next_header_page = page.index

        # If references header wasn't found in any page, return None.
        if refs_page == -1:
            return None

        # Get the location (page, index) where references start.
        refs_page_markdown = proc_result.pages[refs_page].markdown
        references_start_index = refs_page_markdown.find(refs_header)
        references_start = (refs_page, references_start_index)

        # Determine where references end.
        references_end = None
        if next_header is not None and next_header_page != -1:
            next_header_page_markdown = proc_result.pages[
                next_header_page
            ].markdown
            references_end_index = next_header_page_markdown.find(next_header)
            # BUG FIX: str.find returns -1 (never None) when the needle is
            # missing; the original `is not None` check was always true and
            # would have produced a bogus (page, -1) end boundary.
            if references_end_index != -1:
                references_end = (next_header_page, references_end_index)

        return {
            "start": references_start,
            "end": references_end,
            "refs_header": refs_header,
            "next_header": next_header,
        }

    def _extract_references(
        self, proc_result: ProcessingResult, boundaries: dict
    ) -> tuple[ProcessingResult, str]:
        """Extract references from document based on boundaries.

        Dispatches to the appropriate extraction strategy depending on
        whether the references run to the end of the document, fit on a
        single page, or span multiple pages.

        Args:
            proc_result: ProcessingResult containing document pages
            boundaries: Dictionary with reference section boundaries

        Returns:
            Tuple containing modified document and extracted references
        """
        references_start = boundaries["start"]
        references_end = boundaries["end"]

        # If references end at the end of document.
        if references_end is None:
            return self._extract_references_at_end(
                proc_result, references_start
            )

        # If references are contained within a single page.
        if references_end[0] == references_start[0]:
            return self._extract_references_same_page(
                proc_result, references_start, references_end
            )

        # If references span multiple pages.
        return self._extract_references_multi_page(
            proc_result, references_start, references_end
        )

    def _extract_references_at_end(
        self, proc_result: ProcessingResult, references_start: tuple[int, int]
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they are the last section in the document.

        Mutates the pages of ``proc_result`` in place.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start

        Returns:
            Tuple containing modified document and extracted references
        """
        references_markdown = ""
        start_page = True

        for page_index in range(references_start[0], len(proc_result.pages)):
            if start_page:
                # First page: keep the text before the references header,
                # move everything from the header onward into the output.
                references_markdown += proc_result.pages[page_index].markdown[
                    references_start[1]:
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[0:references_start[1]]
                start_page = False
                continue

            # Subsequent pages are assumed to contain only references.
            references_markdown += proc_result.pages[page_index].markdown
            proc_result.pages[page_index].markdown = ""

        return proc_result, references_markdown

    def _extract_references_same_page(
        self,
        proc_result: ProcessingResult,
        references_start: tuple[int, int],
        references_end: tuple[int, int],
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they start and end on the same page.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start
            references_end: Tuple (page_index, char_index) where references end

        Returns:
            Tuple containing modified document and extracted references
        """
        page_idx = references_start[0]

        # Extract the references section.
        references_markdown = proc_result.pages[page_idx].markdown[
            references_start[1]:references_end[1]
        ]

        # Remove references section from the page, stitching together the
        # text before and after it.
        proc_result.pages[page_idx].markdown = (
            proc_result.pages[page_idx].markdown[0:references_start[1]]
            + proc_result.pages[page_idx].markdown[references_end[1]:]
        )

        return proc_result, references_markdown

    def _extract_references_multi_page(
        self,
        proc_result: ProcessingResult,
        references_start: tuple[int, int],
        references_end: tuple[int, int],
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they span multiple pages.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start
            references_end: Tuple (page_index, char_index) where references end

        Returns:
            Tuple containing modified document and extracted references
        """
        references_markdown = ""

        # Process each page in the range.
        for page_index in range(references_start[0], references_end[0] + 1):
            # First page with references: split at the references header.
            if page_index == references_start[0]:
                references_markdown += proc_result.pages[page_index].markdown[
                    references_start[1]:
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[0:references_start[1]]
                continue

            # Last page with references: split at the next header.
            if page_index == references_end[0]:
                references_markdown += proc_result.pages[page_index].markdown[
                    0:references_end[1]
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[references_end[1]:]
                continue

            # Middle pages contain only references.
            references_markdown += proc_result.pages[page_index].markdown
            proc_result.pages[page_index].markdown = ""

        return proc_result, references_markdown
@@ -0,0 +1,11 @@
1
"""
Utility functions for document processing.

This module provides helper utilities for document handling and processing.
"""

from ragbandit.documents.utils.secure_file_handler import SecureFileHandler

# Public API of this subpackage.
__all__ = ["SecureFileHandler"]
@@ -0,0 +1,95 @@
1
+ """Utilities for secure file handling with encryption."""
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+ from cryptography.fernet import Fernet, InvalidToken
7
+ import shutil
8
+
9
+
10
class SecureFileHandler:
    """Handles secure file operations with encryption at rest.

    Files are written to a private temporary directory with their content
    encrypted via Fernet (symmetric authenticated encryption).
    """

    def __init__(self, encryption_key: str):
        """Initialize the secure file handler with an encryption key.

        Args:
            encryption_key: Fernet key (url-safe base64-encoded 32 bytes)
                used for all file operations

        Raises:
            ValueError: If encryption_key is empty or not a valid Fernet key
        """
        if not encryption_key:
            raise ValueError("Encryption key cannot be empty")
        try:
            # Validate the key by constructing the cipher.
            self._cipher = Fernet(encryption_key.encode())
        except (ValueError, TypeError) as err:
            # BUG FIX: Fernet raises ValueError/TypeError for a malformed
            # key at construction time; InvalidToken (caught originally)
            # is only raised on decrypt, so bad keys were never translated.
            raise ValueError("Invalid encryption key format") from err

    def save_encrypted_file(
        self, content: bytes, prefix: str = "doc", original_file_name=""
    ) -> Path:
        """Save file content with encryption.

        Args:
            content: Raw bytes to encrypt and save
            prefix: Prefix for the temporary file name
            original_file_name: Optional original file name; its suffix
                (e.g. ".pdf") is preserved on the encrypted file

        Returns:
            Path to the encrypted file (inside a fresh private temp dir)
        """
        # Create a temporary directory that only this process can access.
        temp_dir = Path(tempfile.mkdtemp(prefix="secure_"))
        try:
            # Keep the original extension so downstream consumers can
            # still infer the file type from the path.
            suffix = ""
            if original_file_name:
                suffix = Path(original_file_name).suffix

            # Random component avoids predictable file names.
            file_path = temp_dir / f"{prefix}_{os.urandom(8).hex()}{suffix}"

            # Encrypt and save.
            encrypted_content = self._cipher.encrypt(content)
            file_path.write_bytes(encrypted_content)

            return file_path

        except Exception:
            # Clean up the temp dir on any failure, then re-raise with
            # the original traceback intact.
            shutil.rmtree(temp_dir)
            raise

    def read_encrypted_file(self, file_path: Path) -> bytes:
        """Read and decrypt file content.

        Args:
            file_path: Path to the encrypted file

        Returns:
            Decrypted content as bytes

        Raises:
            cryptography.fernet.InvalidToken: If the file was not encrypted
                with this handler's key or has been tampered with
        """
        encrypted_content = file_path.read_bytes()
        return self._cipher.decrypt(encrypted_content)

    def secure_delete(self, file_path: Path):
        """Securely delete a file and its parent directory.

        The file is overwritten with random data three times before the
        entire parent directory is removed, to make recovery harder.
        NOTE(review): overwrite effectiveness depends on the filesystem
        (journaling/COW filesystems may keep old blocks) — best effort.

        Args:
            file_path: Path to the file to delete
        """
        if file_path.exists():
            try:
                # Overwrite the file contents before deletion.
                file_size = file_path.stat().st_size
                for _ in range(3):
                    with open(file_path, "wb") as f:
                        f.write(os.urandom(file_size))
                        f.flush()
                        # Force the overwrite to hit the disk.
                        os.fsync(f.fileno())

                # Now delete the parent directory and all its contents.
                shutil.rmtree(file_path.parent)
            except FileNotFoundError:
                pass  # Already deleted
@@ -0,0 +1,27 @@
1
"""
Prompt tools for structured LLM interactions.

This module provides tools for creating and using structured prompts with LLMs.
"""

from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
from ragbandit.prompt_tools.footnotes_processor_tools import (
    detect_footnote_section_tool,
    detect_footnote_start_tool,
    classify_footnote_tool,
    footnote_insertion_instruction_tool,
    replace_footnote_inline_operation,
)
from ragbandit.prompt_tools.references_processor_tools import (
    detect_references_header_tool,
)

# Public API re-exported at package level.
__all__ = [
    "create_prompt_tool",
    "detect_footnote_section_tool",
    "detect_footnote_start_tool",
    "classify_footnote_tool",
    "footnote_insertion_instruction_tool",
    "replace_footnote_inline_operation",
    "detect_references_header_tool",
]