ragbandit-core 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,408 @@
1
+ """
2
+ References processor for detecting and removing reference
3
+ sections from documents.
4
+
5
+ This processor identifies the references section header in a document and
6
+ extracts the references content, removing it from the main document text.
7
+ """
8
+
9
+ import re
10
+ from difflib import SequenceMatcher
11
+
12
+ from ragbandit.documents.processors.base_processor import BaseProcessor
13
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
14
+ from ragbandit.prompt_tools.references_processor_tools import (
15
+ detect_references_header_tool,
16
+ )
17
+ from ragbandit.schema import OCRResult, ProcessingResult
18
+
19
+
20
class ReferencesProcessor(BaseProcessor):
    """Processor for detecting and removing references sections from documents.

    This processor:
    1. Extracts markdown headers from the OCR pages
    2. Identifies the references section header using an LLM
    3. Removes the references section from the document
    4. Stores the extracted references markdown in the result's
       ``extracted_data``
    """

    def __init__(self, name: str | None = None, api_key: str | None = None):
        """Initialize the references processor.

        Args:
            name: Optional name for the processor
            api_key: API key for LLM services
        """
        super().__init__(name, api_key)

    def process(
        self,
        document: OCRResult | ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ProcessingResult:
        """Process OCR pages to detect and remove references.

        Args:
            document: OCRResult or ProcessingResult to process
            usage_tracker: Token usage tracker for LLM calls

        Returns:
            Modified ProcessingResult with the references section removed.
            If a references section was found, its markdown is stored under
            ``extracted_data["references_markdown"]``.
        """
        # Normalize input once so downstream helpers always see a
        # ProcessingResult.
        proc_input = self.ensure_processing_result(
            document, processor_name=str(self)
        )

        proc_result, references_markdown = self.remove_refs(
            proc_input, usage_tracker
        )

        # Save extracted references into processing result metadata.
        if references_markdown:
            if proc_result.extracted_data is None:
                proc_result.extracted_data = {}
            proc_result.extracted_data["references_markdown"] = (
                references_markdown
            )

        return proc_result

    def find_best_match(
        self, target: str, string_list: list[str]
    ) -> tuple[str, int]:
        """Find the string in string_list most similar to the target string.

        Similarity is computed case-insensitively with
        ``difflib.SequenceMatcher``.

        Args:
            target: The string to search for
            string_list: List of strings to search through

        Returns:
            A tuple containing (best matching string, index of best match).
            If either input is empty, returns ("", -1).
        """
        if not string_list or not target:
            return "", -1

        def similarity_ratio(s1: str, s2: str) -> float:
            return SequenceMatcher(None, s1.lower(), s2.lower()).ratio()

        best_idx = max(
            range(len(string_list)),
            key=lambda i: similarity_ratio(target, string_list[i]),
        )
        return string_list[best_idx], best_idx

    def remove_refs(
        self,
        proc_result: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[ProcessingResult, str]:
        """Remove references section from document and extract as markdown.

        This method identifies the references section in a document,
        extracts it, and removes it from the original document.

        Args:
            proc_result: The document to process (ProcessingResult)
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing:
            - Modified ProcessingResult with references removed
            - Extracted references as markdown ("" when nothing was removed)
        """
        # Extract headers and identify references section.
        headers = self._extract_headers(proc_result)
        refs_header, refs_header_index = self._identify_references_header(
            headers, usage_tracker
        )

        # If no references header found, return original document unchanged.
        if not refs_header:
            return proc_result, ""

        # Find next header (if any) after references.
        next_header = self._find_next_header(headers, refs_header_index)

        # Find page boundaries of references section.
        boundaries = self._find_reference_boundaries(
            proc_result, refs_header, next_header
        )

        # If boundaries couldn't be determined, return original document.
        if not boundaries:
            return proc_result, ""

        # Extract references and modify document.
        return self._extract_references(proc_result, boundaries)

    def _extract_headers(self, proc_result: ProcessingResult) -> list[str]:
        """Extract all markdown headers from the document.

        Args:
            proc_result: ProcessingResult containing document pages

        Returns:
            List of headers (including surrounding whitespace captured by
            the regex) in order of appearance.
        """
        # ATX headers (# to ######); the negative lookahead rejects lines
        # containing "|" so markdown table rows with "#" cells are skipped.
        header_regex = re.compile(
            r"(?im)(\s*#{1,6}\s*(?![^\n]*\|)[^\n]+(?:\n|$))"
        )

        # Search the concatenated markdown of all pages. NOTE(review):
        # pages are joined without a separator — if a page does not end
        # with a newline, text can fuse across the page boundary. Kept
        # as-is because later containment checks compare captured headers
        # against individual pages' markdown.
        full_markdown = "".join(page.markdown for page in proc_result.pages)
        return header_regex.findall(full_markdown)

    def _identify_references_header(
        self,
        headers: list[str],
        usage_tracker: TokenUsageTracker | None = None,
    ) -> tuple[str, int]:
        """Identify the references header from a list of headers.

        Args:
            headers: List of headers to search through
            usage_tracker: Optional tracker for token usage in LLM calls

        Returns:
            Tuple containing the references header and its index, or
            ("", -1) when there are no headers.
        """
        if not headers:
            return "", -1

        # Use LLM to identify the most likely references header.
        refs = detect_references_header_tool(
            api_key=self.api_key,
            usage_tracker=usage_tracker,
            headers_list=headers
        )

        # Map the LLM's (possibly paraphrased) answer back onto the
        # closest actual header from the document.
        return self.find_best_match(refs.references_header, headers)

    def _find_next_header(
        self, headers: list[str], refs_header_index: int
    ) -> str | None:
        """Find the next header after the references header.

        Args:
            headers: List of all headers
            refs_header_index: Index of the references header

        Returns:
            Next header if it exists, None otherwise
        """
        if refs_header_index < 0 or (refs_header_index + 1) >= len(headers):
            return None
        return headers[refs_header_index + 1]

    def _find_reference_boundaries(
        self,
        proc_result: ProcessingResult,
        refs_header: str,
        next_header: str | None,
    ) -> dict | None:
        """Find the boundaries of the references section.

        Args:
            proc_result: ProcessingResult containing document pages
            refs_header: The identified references header
            next_header: The next header after references (if any)

        Returns:
            Dictionary with keys "start", "end", "refs_header",
            "next_header" — "start"/"end" are (page_index, char_index)
            tuples, "end" may be None — or None if the references header
            was not found on any page.
        """
        refs_page = -1
        next_header_page = -1

        # Find the pages where references start and end. NOTE(review):
        # page.index is assumed to equal the page's position in
        # proc_result.pages, since it is used below to index the list —
        # TODO confirm against the OCR schema.
        for page in proc_result.pages:
            if refs_header in page.markdown:
                refs_page = page.index
            if next_header is not None and next_header in page.markdown:
                next_header_page = page.index

        # If references header wasn't found in any page, return None.
        if refs_page == -1:
            return None

        # Get the location (page, index) where references start.
        refs_page_markdown = proc_result.pages[refs_page].markdown
        references_start_index = refs_page_markdown.find(refs_header)
        references_start = (refs_page, references_start_index)

        # Determine where references end.
        references_end = None
        if next_header is not None and next_header_page != -1:
            next_header_page_markdown = proc_result.pages[
                next_header_page
            ].markdown
            references_end_index = next_header_page_markdown.find(next_header)
            # BUG FIX: str.find returns -1 (never None) when the needle is
            # missing; the original `is not None` check was always true and
            # would have produced a bogus (page, -1) end boundary.
            if references_end_index != -1:
                references_end = (next_header_page, references_end_index)

        return {
            "start": references_start,
            "end": references_end,
            "refs_header": refs_header,
            "next_header": next_header,
        }

    def _extract_references(
        self, proc_result: ProcessingResult, boundaries: dict
    ) -> tuple[ProcessingResult, str]:
        """Extract references from document based on boundaries.

        Dispatches to the appropriate extraction strategy depending on
        whether the references run to the end of the document, fit on a
        single page, or span multiple pages.

        Args:
            proc_result: ProcessingResult containing document pages
            boundaries: Dictionary with reference section boundaries

        Returns:
            Tuple containing modified document and extracted references
        """
        references_start = boundaries["start"]
        references_end = boundaries["end"]

        # If references end at the end of document.
        if references_end is None:
            return self._extract_references_at_end(
                proc_result, references_start
            )

        # If references are contained within a single page.
        if references_end[0] == references_start[0]:
            return self._extract_references_same_page(
                proc_result, references_start, references_end
            )

        # If references span multiple pages.
        return self._extract_references_multi_page(
            proc_result, references_start, references_end
        )

    def _extract_references_at_end(
        self, proc_result: ProcessingResult, references_start: tuple[int, int]
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they are the last section in the document.

        Mutates the pages of ``proc_result`` in place.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start

        Returns:
            Tuple containing modified document and extracted references
        """
        references_markdown = ""
        start_page = True

        for page_index in range(references_start[0], len(proc_result.pages)):
            if start_page:
                # First page: keep the text before the references header,
                # move everything from the header onward into the output.
                references_markdown += proc_result.pages[page_index].markdown[
                    references_start[1]:
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[0:references_start[1]]
                start_page = False
                continue

            # Subsequent pages are assumed to contain only references.
            references_markdown += proc_result.pages[page_index].markdown
            proc_result.pages[page_index].markdown = ""

        return proc_result, references_markdown

    def _extract_references_same_page(
        self,
        proc_result: ProcessingResult,
        references_start: tuple[int, int],
        references_end: tuple[int, int],
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they start and end on the same page.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start
            references_end: Tuple (page_index, char_index) where references end

        Returns:
            Tuple containing modified document and extracted references
        """
        page_idx = references_start[0]

        # Extract the references section.
        references_markdown = proc_result.pages[page_idx].markdown[
            references_start[1]:references_end[1]
        ]

        # Remove references section from the page, stitching together the
        # text before and after it.
        proc_result.pages[page_idx].markdown = (
            proc_result.pages[page_idx].markdown[0:references_start[1]]
            + proc_result.pages[page_idx].markdown[references_end[1]:]
        )

        return proc_result, references_markdown

    def _extract_references_multi_page(
        self,
        proc_result: ProcessingResult,
        references_start: tuple[int, int],
        references_end: tuple[int, int],
    ) -> tuple[ProcessingResult, str]:
        """Extract references when they span multiple pages.

        Args:
            proc_result: ProcessingResult containing document pages
            references_start: Tuple (page_index, char_index) where
                references start
            references_end: Tuple (page_index, char_index) where references end

        Returns:
            Tuple containing modified document and extracted references
        """
        references_markdown = ""

        # Process each page in the range.
        for page_index in range(references_start[0], references_end[0] + 1):
            # First page with references: split at the references header.
            if page_index == references_start[0]:
                references_markdown += proc_result.pages[page_index].markdown[
                    references_start[1]:
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[0:references_start[1]]
                continue

            # Last page with references: split at the next header.
            if page_index == references_end[0]:
                references_markdown += proc_result.pages[page_index].markdown[
                    0:references_end[1]
                ]
                proc_result.pages[page_index].markdown = proc_result.pages[
                    page_index
                ].markdown[references_end[1]:]
                continue

            # Middle pages contain only references.
            references_markdown += proc_result.pages[page_index].markdown
            proc_result.pages[page_index].markdown = ""

        return proc_result, references_markdown
@@ -0,0 +1,11 @@
1
"""
Utility functions for document processing.

This module provides helper utilities for document handling and processing.
"""

from ragbandit.documents.utils.secure_file_handler import SecureFileHandler

# Public API of this subpackage.
__all__ = ["SecureFileHandler"]
@@ -0,0 +1,95 @@
1
+ """Utilities for secure file handling with encryption."""
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+ from cryptography.fernet import Fernet, InvalidToken
7
+ import shutil
8
+
9
+
10
class SecureFileHandler:
    """Handles secure file operations with encryption at rest.

    Files are written to a private temporary directory with their content
    encrypted via Fernet (symmetric authenticated encryption).
    """

    def __init__(self, encryption_key: str):
        """Initialize the secure file handler with an encryption key.

        Args:
            encryption_key: Fernet key (url-safe base64-encoded 32 bytes)
                used for all file operations

        Raises:
            ValueError: If encryption_key is empty or not a valid Fernet key
        """
        if not encryption_key:
            raise ValueError("Encryption key cannot be empty")
        try:
            # Validate the key by constructing the cipher.
            self._cipher = Fernet(encryption_key.encode())
        except (ValueError, TypeError) as err:
            # BUG FIX: Fernet raises ValueError/TypeError for a malformed
            # key at construction time; InvalidToken (caught originally)
            # is only raised on decrypt, so bad keys were never translated.
            raise ValueError("Invalid encryption key format") from err

    def save_encrypted_file(
        self, content: bytes, prefix: str = "doc", original_file_name=""
    ) -> Path:
        """Save file content with encryption.

        Args:
            content: Raw bytes to encrypt and save
            prefix: Prefix for the temporary file name
            original_file_name: Optional original file name; its suffix
                (e.g. ".pdf") is preserved on the encrypted file

        Returns:
            Path to the encrypted file (inside a fresh private temp dir)
        """
        # Create a temporary directory that only this process can access.
        temp_dir = Path(tempfile.mkdtemp(prefix="secure_"))
        try:
            # Keep the original extension so downstream consumers can
            # still infer the file type from the path.
            suffix = ""
            if original_file_name:
                suffix = Path(original_file_name).suffix

            # Random component avoids predictable file names.
            file_path = temp_dir / f"{prefix}_{os.urandom(8).hex()}{suffix}"

            # Encrypt and save.
            encrypted_content = self._cipher.encrypt(content)
            file_path.write_bytes(encrypted_content)

            return file_path

        except Exception:
            # Clean up the temp dir on any failure, then re-raise with
            # the original traceback intact.
            shutil.rmtree(temp_dir)
            raise

    def read_encrypted_file(self, file_path: Path) -> bytes:
        """Read and decrypt file content.

        Args:
            file_path: Path to the encrypted file

        Returns:
            Decrypted content as bytes

        Raises:
            cryptography.fernet.InvalidToken: If the file was not encrypted
                with this handler's key or has been tampered with
        """
        encrypted_content = file_path.read_bytes()
        return self._cipher.decrypt(encrypted_content)

    def secure_delete(self, file_path: Path):
        """Securely delete a file and its parent directory.

        The file is overwritten with random data three times before the
        entire parent directory is removed, to make recovery harder.
        NOTE(review): overwrite effectiveness depends on the filesystem
        (journaling/COW filesystems may keep old blocks) — best effort.

        Args:
            file_path: Path to the file to delete
        """
        if file_path.exists():
            try:
                # Overwrite the file contents before deletion.
                file_size = file_path.stat().st_size
                for _ in range(3):
                    with open(file_path, "wb") as f:
                        f.write(os.urandom(file_size))
                        f.flush()
                        # Force the overwrite to hit the disk.
                        os.fsync(f.fileno())

                # Now delete the parent directory and all its contents.
                shutil.rmtree(file_path.parent)
            except FileNotFoundError:
                pass  # Already deleted
@@ -0,0 +1,27 @@
1
"""
Prompt tools for structured LLM interactions.

This module provides tools for creating and using structured prompts with LLMs.
"""

from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
from ragbandit.prompt_tools.footnotes_processor_tools import (
    detect_footnote_section_tool,
    detect_footnote_start_tool,
    classify_footnote_tool,
    footnote_insertion_instruction_tool,
    replace_footnote_inline_operation,
)
from ragbandit.prompt_tools.references_processor_tools import (
    detect_references_header_tool,
)

# Public API re-exported at package level.
__all__ = [
    "create_prompt_tool",
    "detect_footnote_section_tool",
    "detect_footnote_start_tool",
    "classify_footnote_tool",
    "footnote_insertion_instruction_tool",
    "replace_footnote_inline_operation",
    "detect_references_header_tool",
]