ragbandit-core 0.1.1__py3-none-any.whl
- ragbandit/__init__.py +26 -0
- ragbandit/config/__init__.py +3 -0
- ragbandit/config/llms.py +34 -0
- ragbandit/config/pricing.py +38 -0
- ragbandit/documents/__init__.py +66 -0
- ragbandit/documents/chunkers/__init__.py +18 -0
- ragbandit/documents/chunkers/base_chunker.py +201 -0
- ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
- ragbandit/documents/chunkers/semantic_chunker.py +205 -0
- ragbandit/documents/document_pipeline.py +350 -0
- ragbandit/documents/embedders/__init__.py +14 -0
- ragbandit/documents/embedders/base_embedder.py +82 -0
- ragbandit/documents/embedders/mistral_embedder.py +129 -0
- ragbandit/documents/ocr/__init__.py +13 -0
- ragbandit/documents/ocr/base_ocr.py +136 -0
- ragbandit/documents/ocr/mistral_ocr.py +147 -0
- ragbandit/documents/processors/__init__.py +16 -0
- ragbandit/documents/processors/base_processor.py +88 -0
- ragbandit/documents/processors/footnotes_processor.py +353 -0
- ragbandit/documents/processors/references_processor.py +408 -0
- ragbandit/documents/utils/__init__.py +11 -0
- ragbandit/documents/utils/secure_file_handler.py +95 -0
- ragbandit/prompt_tools/__init__.py +27 -0
- ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
- ragbandit/prompt_tools/prompt_tool.py +118 -0
- ragbandit/prompt_tools/references_processor_tools.py +31 -0
- ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
- ragbandit/schema.py +206 -0
- ragbandit/utils/__init__.py +19 -0
- ragbandit/utils/in_memory_log_handler.py +33 -0
- ragbandit/utils/llm_utils.py +188 -0
- ragbandit/utils/mistral_client.py +76 -0
- ragbandit/utils/token_usage_tracker.py +220 -0
- ragbandit_core-0.1.1.dist-info/METADATA +145 -0
- ragbandit_core-0.1.1.dist-info/RECORD +38 -0
- ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
- ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
- ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
ragbandit/prompt_tools/footnotes_processor_tools.py
ADDED
@@ -0,0 +1,195 @@
from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
from pydantic import BaseModel
from enum import Enum
from ragbandit.utils.token_usage_tracker import TokenUsageTracker


# Detect Footnote Section Tool
class FootnoteSection(BaseModel):  # noqa
    footnote_section: str


footnote_section_tool_prompt = (
    "You are an expert at identifying the footnotes section of a page. "
    "The footnotes section of a page appears at the bottom of the page "
    "and contains text with notes or references. "
    "Identify the footnote section of the following page of markdown. "
    "Return a JSON object with a 'footnote_section' key containing "
    "a string with the footnotes section. "
    "If there's no footnote section, then return an empty string. "
    "Include all of the text in the footnotes section.\n"
    "Page (enclosed in <<<>>>):\n"
    "<<<\n"
    "{{ocr_response_page}}\n"
    ">>>"
)
detect_footnote_section_tool = create_prompt_tool(
    template=footnote_section_tool_prompt,
    output_schema=FootnoteSection,
    model="mistral-medium-latest",
    temperature=0,
)


# Detect Footnote Start Tool
class FootnoteStart(BaseModel):
    footnote_start: str


footnote_start_tool_prompt = (
    "You will be given a footnote of a page. "
    "Your task is to extract the first word of the footnote text. "
    "Return a JSON object with a single key: 'footnote_start'. "
    "The value should be the first word of the footnote. "
    "Example:\n"
    "<<<\n"
    r"[{\/12}] This study explores the effects of climate change on marine biodiversity."  # noqa
    "\n>>>\n\n"
    "Output:\n"
    "{'footnote_start': 'This'}\n"
    "Footnote (enclosed in <<<>>>):\n"
    "<<<\n"
    "{{footnote}}\n"
    ">>>\n\n"
)
detect_footnote_start_tool = create_prompt_tool(
    template=footnote_start_tool_prompt,
    output_schema=FootnoteStart,
    model="mistral-medium-latest",
    temperature=0,
)


# Classify Footnote Tool
class Label(Enum):
    CITATION    = "citation"     # noqa:E221
    EXPLANATION = "explanation"
    LINK        = "link"         # noqa:E221
    EDITORIAL   = "editorial"    # noqa:E221
    OTHER       = "other"        # noqa:E221


class FootnoteLabel(BaseModel):
    category: Label
    reason: str


classify_footnote_tool_prompt = (
    "Classify the following Footnote.\n"
    "Use the following categories:\n"
    "- citation: Contains bibliographic information.\n"
    "- explanation: Provides additional context.\n"
    "- link: Includes URLs or references to online resources.\n"
    "- editorial: Contains subjective remarks or corrections.\n"
    "- other: The footnote does not fit into any of the above.\n"
    "Provide a 'reason' for the chosen 'category'.\n"
    "Here's the expected schema, in JSON format:\n"
    "{'category': [category], 'reason': [reason]}\n"
    "Here's the footnote:\n"
    "<<<\n"
    "{{footnote_text}}\n"
    ">>>\n"
)
classify_footnote_tool = create_prompt_tool(
    template=classify_footnote_tool_prompt,
    output_schema=FootnoteLabel,
    model="mistral-small-latest",
    temperature=0,
)


# Footnote Replacement Tool
class SingleFootnoteChange(BaseModel):
    text_to_replace: str
    replacement_text: str


footnote_insertion_instruction_prompt = (
    "You are a text-cleaning assistant. "
    "We will provide you a markdown and details about a footnote. "
    "You must generate minimal edits to inline that footnote. "
    "Only output a single JSON instruction in the form:\n"
    "{'text_to_replace': str, 'replacement_text': str}\n"
    "Do NOT encapsulate the JSON in a list. "
    "Do NOT rewrite lines that do not contain the footnote. "
    "Do NOT provide any other text or commentary.\n\n"
    "Rules:\n"
    "1. Inline the footnote right after the usage text, "
    "replacing the footnote symbol.\n"
    "2. Keep everything else exactly as is.\n"
    "Example:\n"
    "Input:\n"
    "Footnote:\n"
    "- Footnote text: *Hej means hello in Swedish\n"
    "Text:\n"
    "<<<\n"
    "Hej*, said the nice old lady. She was wearing an apron.\n"
    "*Hej means hello in Swedish\n"
    ">>>\n"
    "Output: "
    "{'text_to_replace': 'Hej*', "
    "'replacement_text': 'Hej (Hej means hello in Swedish)'}\n"
    "Now process this text:\n"
    "<<<\n"
    "Footnote:\n"
    "- Footnote text: {{footnote_text}}\n"
    "Text:\n"
    "<<<\n"
    "{{markdown}}\n"
    ">>>"
)
footnote_insertion_instruction_tool = create_prompt_tool(
    template=footnote_insertion_instruction_prompt,
    output_schema=SingleFootnoteChange,
    model="mistral-small-latest",
    temperature=0,
)


def replace_footnote_inline_operation(
    api_key: str,
    footnote: dict,
    markdown: str,
    usage_tracker: TokenUsageTracker | None = None,
) -> str:
    """
    Given a footnote and the page's markdown text,
    perform an inline replacement using a 'diff/instructions' approach
    to ensure no unintended text changes occur.

    Steps:
        1) Prompt the LLM to output structured edit instructions
           (text_to_replace, replacement_text).
        2) Apply those instructions to the original text.

    Args:
        api_key: Mistral API key.
        footnote (dict): {
            'footnote_symbol': '*',
            'footnote_text': 'Corresponding author',
            'usage_text': 'James Andrews*',
            'category': 'other',
            'details': 'Footnote indicating that James Andrews
                        is a corresponding author.'
        }
        markdown (str): OCRed page.
        usage_tracker: Optional token usage tracker.

    Returns:
        str: The updated text, with the footnote properly inlined
        (and footnote lines removed) without altering other content.
    """
    footnote_symbol = footnote.get("footnote_symbol", "")
    footnote_text = f"{footnote_symbol}{footnote.get('footnote_text', '')}"

    replace_instruction = footnote_insertion_instruction_tool(
        api_key=api_key,
        footnote_text=footnote_text,
        markdown=markdown,
        usage_tracker=usage_tracker,
    )
    markdown = markdown.replace(
        replace_instruction.text_to_replace,
        replace_instruction.replacement_text,
    )

    return markdown
ragbandit/prompt_tools/prompt_tool.py
ADDED
@@ -0,0 +1,118 @@
"""
Utilities for creating LLM-powered tools based on prompt templates.
"""

from typing import Generic, TypeVar, Callable

from pydantic import BaseModel
from ragbandit.utils.llm_utils import query_llm
from ragbandit.utils.token_usage_tracker import TokenUsageTracker

T = TypeVar("T", bound=BaseModel)


class PromptTool(Generic[T]):
    """A tool that uses a prompt template to query an
    LLM and return structured data."""

    def __init__(
        self,
        template: str,
        output_schema: type[T],
        model: str = "mistral-small-latest",
        temperature: float = 0,
        preprocess_fn: Callable[
            [dict[str, object]], dict[str, object]
        ] | None = None,
        postprocess_fn: Callable[[T], object] | None = None,
    ):
        """Initialize a new prompt-based tool.

        Args:
            template: String template with {{variable}} placeholders
            output_schema: Pydantic model for response validation
            model: LLM model to use
            temperature: Sampling temperature
            preprocess_fn: Optional function to preprocess variables
                before formatting
            postprocess_fn: Optional function to process the result
                after LLM response
        """
        self.template = template
        self.output_schema = output_schema
        self.model = model
        self.temperature = temperature
        self.preprocess_fn = preprocess_fn or (lambda x: x)
        self.postprocess_fn = postprocess_fn or (lambda x: x)

    def format_prompt(self, **kwargs) -> str:
        """Format the template with the provided variables.

        This method handles variable substitution in the template.
        """
        processed_kwargs = self.preprocess_fn(kwargs)

        # Simple string replacement approach - more reliable than format()
        result = self.template
        for key, value in processed_kwargs.items():
            placeholder = "{{" + key + "}}"
            result = result.replace(placeholder, str(value))

        return result

    def __call__(
        self,
        api_key: str,
        usage_tracker: TokenUsageTracker | None = None,
        **kwargs
    ) -> object:
        """Execute the tool with the given variables.

        Args:
            api_key: Mistral API key for authentication
            usage_tracker: Optional token usage tracker
            **kwargs: Variables to substitute in the prompt template

        Returns:
            Processed result from the LLM

        This makes the tool callable like a function, e.g.:
            result = my_tool(api_key="your_api_key", var1="value", var2="value2")
        """
        # Format the prompt with variables
        prompt = self.format_prompt(**kwargs)

        # Query the LLM
        result = query_llm(
            prompt=prompt,
            output_schema=self.output_schema,
            api_key=api_key,
            usage_tracker=usage_tracker,
            model=self.model,
            temperature=self.temperature,
        )

        # Apply any post-processing
        return self.postprocess_fn(result)


# Helper function to create a tool more easily
def create_prompt_tool(
    template: str,
    output_schema: type[T],
    model: str = "mistral-small-latest",
    temperature: float = 0,
    preprocess_fn: Callable[
        [dict[str, object]], dict[str, object]
    ] | None = None,
    postprocess_fn: Callable[[T], object] | None = None,
) -> PromptTool[T]:
    """Create a new prompt-based tool with the given template and schema.

    Note: When calling the returned tool,
    you must provide an api_key parameter.
    """
    return PromptTool(
        template=template,
        output_schema=output_schema,
        model=model,
        temperature=temperature,
        preprocess_fn=preprocess_fn,
        postprocess_fn=postprocess_fn,
    )
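
To make the call contract concrete, a small sketch (not part of the package) that defines and invokes a tool. The `Sentiment` schema, the template text, and the `MISTRAL_API_KEY` environment variable are illustrative assumptions, and it assumes, as the docstring indicates, that the call returns a validated instance of the output schema.

import os
from pydantic import BaseModel
from ragbandit.prompt_tools.prompt_tool import create_prompt_tool


class Sentiment(BaseModel):
    sentiment: str  # e.g. 'positive' or 'negative'


# Hypothetical tool; the {{review}} placeholder is filled at call time.
sentiment_tool = create_prompt_tool(
    template=(
        "Classify the sentiment of the review below as positive or negative. "
        "Return a JSON object with a single 'sentiment' key.\n"
        "<<<\n{{review}}\n>>>"
    ),
    output_schema=Sentiment,
)

result = sentiment_tool(
    api_key=os.environ["MISTRAL_API_KEY"],  # assumed env var
    review="The pastries were excellent.",
)
print(result.sentiment)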
ragbandit/prompt_tools/references_processor_tools.py
ADDED
@@ -0,0 +1,31 @@
from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
from pydantic import BaseModel


class ReferencesHeader(BaseModel):
    references_header: str


references_tool_prompt = (
    "You are an expert at identifying the references section "
    "of a document. You will be given a list of headers. "
    "Identify the header that represents the references section "
    "(e.g., 'References', 'Bibliography', 'Sources', etc.). "
    "Return a JSON object with a single key 'references_header' "
    "containing the identified header. "
    "If no references header is found, return an empty string.\n"
    "The available headers are provided below (enclosed in <<< and >>>):\n"
    "<<<\n"
    "{{headers}}"
    "\n>>>"
)
detect_references_header_tool = create_prompt_tool(
    template=references_tool_prompt,
    output_schema=ReferencesHeader,
    model="mistral-medium-latest",
    temperature=0,
    # Optional preprocessing function to join headers
    preprocess_fn=lambda kwargs: {
        "headers": "\n".join(kwargs["headers_list"])
    },
)
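
One subtlety worth noting: because of the `preprocess_fn`, callers pass `headers_list` (a Python list), not `headers`. A minimal sketch, again assuming a `MISTRAL_API_KEY` environment variable and made-up headers:

import os
from ragbandit.prompt_tools.references_processor_tools import (
    detect_references_header_tool,
)

header = detect_references_header_tool(
    api_key=os.environ["MISTRAL_API_KEY"],  # assumed env var
    headers_list=["## Introduction", "## Methods", "## Bibliography"],
)
print(header.references_header)  # e.g. '## Bibliography'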
ragbandit/prompt_tools/semantic_chunker_tools.py
ADDED
@@ -0,0 +1,56 @@
from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
from pydantic import BaseModel


class SemanticBreak(BaseModel):
    semantic_break: str


semantic_break_tool_prompt = (
    "EXAMPLE TEXT:\n"
    "Once upon a time in a faraway land, a brave knight set forth "
    "on a quest to rescue the princess. \n"
    "He traveled through forests and mountains, encountering "
    "strange creatures along the way. \n"
    "Finally, he reached the dragon's lair. "
    "(Image description: A large dragon "
    "perched on a rocky ledge.)\n"
    "The knight prepared for battle, sword in hand.\n"
    "\n"
    "Instruction:\n"
    "1. We want to split the text into coherent chunks. "
    "The first chunk begins at the start of the text.\n"
    "2. Identify where the next chunk should begin—that is, "
    "find the point at which the first chunk naturally ends "
    "(thematic break), and the second chunk begins.\n"
    "3. Return ONLY a short snippet of text (up to ~30 characters) "
    "that marks the beginning of the next chunk. "
    "For example, if the next chunk starts at the word 'Finally,' "
    "return 'Finally, he reached the dragon's lair.' "
    "(truncated if necessary).\n"
    "4. If the entire text above is just one cohesive chunk "
    "with no good break, return \"NO_BREAK\".\n"
    "5. Do not split inside any "
    "(Image description: ...) text. "
    "Keep these intact.\n"
    "6. Do not output any additional commentary—"
    "just provide the snippet or NO_BREAK.\n"
    "7. Your output should be a JSON containing "
    "a single key 'semantic_break' with the text snippet "
    "for the semantic break.\n"
    "Now find the next semantic break in this text:\n"
    "{{text}}\n"
)


def return_break_string(result: SemanticBreak) -> str:
    return result.semantic_break


find_semantic_break_tool = create_prompt_tool(
    template=semantic_break_tool_prompt,
    output_schema=SemanticBreak,
    model="mistral-small-latest",
    temperature=0,
    postprocess_fn=return_break_string,
)
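
Since `postprocess_fn=return_break_string` unwraps the model, this tool returns a plain string rather than a `SemanticBreak` instance, which is convenient in a splitting loop. A rough sketch under the same `MISTRAL_API_KEY` assumption (the sample text is made up):

import os
from ragbandit.prompt_tools.semantic_chunker_tools import (
    find_semantic_break_tool,
)

text = (
    "The experiment setup used three sensors. Each sensor logged hourly. "
    "In contrast, the analysis phase focused on outlier detection."
)
snippet = find_semantic_break_tool(
    api_key=os.environ["MISTRAL_API_KEY"],  # assumed env var
    text=text,
)
if snippet != "NO_BREAK":
    cut = text.find(snippet)          # split where the snippet begins
    first_chunk, rest = text[:cut], text[cut:]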
ragbandit/schema.py
ADDED
@@ -0,0 +1,206 @@
"""Base schema for data structures."""
from pydantic import BaseModel
from datetime import datetime
from enum import Enum

##########################################
# ************* V2 Schema ************** #
##########################################

##########################################
#                Metrics                 #
##########################################


class TokenUsageMetrics(BaseModel):
    """Aggregated token/cost usage metrics returned by TokenUsageTracker."""
    total_calls: int
    total_input_tokens: int
    total_output_tokens: int
    total_embedding_tokens: int
    total_tokens: int
    total_cost_usd: float

    class ModelUsage(BaseModel):
        calls: int
        input_tokens: int | None = 0
        output_tokens: int | None = 0
        embedding_tokens: int | None = 0
        cost: float

    models: dict[str, ModelUsage]


class PagesProcessedMetrics(BaseModel):
    """Metrics for pages processed and associated cost."""
    pages_processed: int
    cost_per_page: float
    total_cost_usd: float


class TimingMetrics(BaseModel):
    """Metrics for pipeline step durations in seconds."""
    total_duration: float | None = None
    ocr: float | None = None
    # processing_steps: list[dict[str, float]] | None = None
    processing: float | None = None
    chunking: float | None = None
    embedding: float | None = None


##########################################
#                  OCR                   #
##########################################


class PageDimensions(BaseModel):
    dpi: int
    height: int
    width: int


class Image(BaseModel):
    """Represents an image extracted from a page."""
    id: str  # e.g., 'img-01.jpg'
    top_left_x: int | None = None
    top_left_y: int | None = None
    bottom_right_x: int | None = None
    bottom_right_y: int | None = None
    image_base64: str
    image_annotation: str | None = None  # JSON string


class BasePage(BaseModel):
    """Base schema for a single page of a document."""
    index: int  # Page number
    markdown: str
    images: list[Image] | None = None
    dimensions: PageDimensions


class OCRPage(BasePage):
    """Represents a single page from an OCR result."""
    pass


class OCRUsageInfo(BaseModel):
    pages_processed: int
    doc_size_bytes: int


class OCRResult(BaseModel):
    """Represents the output of the OCR process."""
    source_file_path: str
    processed_at: datetime
    model: str
    document_annotation: str | None = None
    pages: list[OCRPage]
    usage_info: OCRUsageInfo
    # Metrics for OCR; can include token-usage or page-processing metrics
    metrics: list[TokenUsageMetrics | PagesProcessedMetrics] | None = None


##########################################
#               Processing               #
##########################################


class ProcessedPage(BasePage):
    """Represents a single page after text processors have been applied."""
    pass


class ProcessingTraceItem(BaseModel):
    """Trace of a single processor's execution."""
    step_name: str  # Name of the step in the processing
    summary: str
    duration: float  # Duration in seconds


class ProcessingResult(BaseModel):
    """Represents the output of the text processors."""
    processor_name: str
    processed_at: datetime
    pages: list[ProcessedPage]  # The text content, now structured per page
    processing_trace: list[ProcessingTraceItem]
    extracted_data: dict[str, object]  # For footnotes, references, etc.
    processing_duration: float | None = None
    metrics: TokenUsageMetrics | None = None


##########################################
#                Chunking                #
##########################################


class ChunkMetadata(BaseModel):
    """Metadata associated with a chunk."""
    page_index: int
    source_references: list[str] | None = None
    footnotes: list[dict] | None = None
    images: list[Image] | None = None
    extra: dict[str, object] = {}


class Chunk(BaseModel):
    """Represents a chunk of text, ready for embedding."""
    text: str
    metadata: ChunkMetadata


class ChunkingResult(BaseModel):
    """Represents the output of the chunking process."""
    processed_at: datetime
    chunks: list[Chunk]
    metrics: TokenUsageMetrics | None = None  # If chunker uses an LLM


##########################################
#               Embedding                #
##########################################


class ChunkWithEmbedding(Chunk):
    """Represents a chunk that has been embedded."""
    embedding: list[float]
    embedding_model: str


class EmbeddingResult(BaseModel):
    """Represents the output of the embedding process."""
    processed_at: datetime | None = None
    chunks_with_embeddings: list[ChunkWithEmbedding]
    model_name: str
    metrics: TokenUsageMetrics | None = None


##########################################
#           Document Pipeline            #
##########################################


class StepStatus(str, Enum):
    success = "success"
    failed = "failed"
    skipped = "skipped"


class StepReport(BaseModel):
    ocr: StepStatus | None = None
    processing: StepStatus | None = None
    chunking: StepStatus | None = None
    embedding: StepStatus | None = None


class DocumentPipelineResult(BaseModel):
    """The composite result for an end-to-end pipeline run."""
    source_file_path: str
    processed_at: datetime
    pipeline_config: dict
    timings: TimingMetrics
    total_metrics: (
        list[TokenUsageMetrics | PagesProcessedMetrics] | None
    ) = None
    total_cost_usd: float | None = None
    ocr_result: OCRResult | None = None
    processing_results: list[ProcessingResult] | None = None
    chunking_result: ChunkingResult | None = None
    embedding_result: EmbeddingResult | None = None
    step_report: StepReport
    logs: str | None = None
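
To show how these models compose, a small sketch that builds a chunk, wraps it with an embedding, and assembles an `EmbeddingResult`. It assumes Pydantic v2 (`model_dump`/`model_dump_json`); the vector and the model name are made-up values.

from datetime import datetime
from ragbandit.schema import (
    Chunk,
    ChunkMetadata,
    ChunkWithEmbedding,
    EmbeddingResult,
)

chunk = Chunk(
    text="The knight prepared for battle, sword in hand.",
    metadata=ChunkMetadata(page_index=0),
)
embedded = ChunkWithEmbedding(
    **chunk.model_dump(),
    embedding=[0.12, -0.05, 0.33],    # made-up vector
    embedding_model="mistral-embed",  # illustrative model name
)
result = EmbeddingResult(
    processed_at=datetime.now(),
    chunks_with_embeddings=[embedded],
    model_name="mistral-embed",
)
print(result.model_dump_json(indent=2))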
ragbandit/utils/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""
Utility functions and classes for the ragbandit package.

This module provides various utilities used throughout the package.
"""

from ragbandit.utils.token_usage_tracker import TokenUsageTracker
from ragbandit.utils.in_memory_log_handler import InMemoryLogHandler
from ragbandit.utils.mistral_client import (
    MistralClientManager,
    mistral_client_manager
)

__all__ = [
    "TokenUsageTracker",
    "InMemoryLogHandler",
    "MistralClientManager",  # The class
    "mistral_client_manager"  # The instance
]
ragbandit/utils/in_memory_log_handler.py
ADDED
@@ -0,0 +1,33 @@
import logging
import io


class InMemoryLogHandler(logging.Handler):
    """
    Collects every formatted log record that flows through it
    into an in-memory buffer. Thread-safe because `logging`
    already locks `emit()`.
    """
    def __init__(self, level=logging.INFO,
                 fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s"):
        super().__init__(level)
        self.buffer = io.StringIO()
        self.setFormatter(logging.Formatter(fmt))

    def emit(self, record):
        self.buffer.write(self.format(record) + "\n")

    def dump(self) -> str:
        """Return the whole transcript so far."""
        return self.buffer.getvalue()

    def clear(self) -> None:
        """Reset the buffer, discarding everything collected so far."""
        self.buffer.truncate(0)
        self.buffer.seek(0)