ragbandit_core-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +26 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
ragbandit/prompt_tools/footnotes_processor_tools.py ADDED
@@ -0,0 +1,195 @@
+ from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
+ from pydantic import BaseModel
+ from enum import Enum
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+ # Detect Footnote Section Tool
+ class FootnoteSection(BaseModel):  # noqa
+     footnote_section: str
+
+
+ footnote_section_tool_prompt = (
+     "You are an expert at identifying the footnotes section of a page. "
+     "The footnotes section of a page appears at the bottom of the page "
+     "and contains text with notes or references. "
+     "Identify the footnote section of the following page of markdown. "
+     "Return a JSON object with a 'footnote_section' key containing "
+     "a string with the footnotes section. "
+     "If there's no footnote section, then return an empty string. "
+     "Include all of the text in the footnotes section.\n"
+     "Page (enclosed in <<<>>>):\n"
+     "<<<\n"
+     "{{ocr_response_page}}\n"
+     ">>>"
+ )
+ detect_footnote_section_tool = create_prompt_tool(
+     template=footnote_section_tool_prompt,
+     output_schema=FootnoteSection,
+     model="mistral-medium-latest",
+     temperature=0
+ )
+
+
+ # Detect Footnote Symbol Tool
+ class FootnoteStart(BaseModel):
+     footnote_start: str
+
+
+ footnote_start_tool_prompt = (
+     "You will be given a footnote of a page. "
+     "Your task is to extract the first word of the footnote text. "
+     "Return a JSON object with a single key: 'footnote_start'. "
+     "The value should be the first word of the footnote. "
+     "Example:\n"
+     "<<<\n"
+     r"[{\/12}] This study explores the effects of climate change on marine biodiversity."  # noqa
+     "\n>>>\n\n"
+     "Output:\n"
+     "{'footnote_start': 'This'}\n"
+     "Footnote (enclosed in <<<>>>):\n"
+     "<<<\n"
+     "{{footnote}}\n"
+     ">>>\n\n"
+ )
+ detect_footnote_start_tool = create_prompt_tool(
+     template=footnote_start_tool_prompt,
+     output_schema=FootnoteStart,
+     model="mistral-medium-latest",
+     temperature=0,
+ )
+
+
+ # Classify Footnote Tool
+ class Label(Enum):
+     CITATION    = "citation"     # noqa:E221
+     EXPLANATION = "explanation"
+     LINK        = "link"         # noqa:E221
+     EDITORIAL   = "editorial"    # noqa:E221
+     OTHER       = "other"        # noqa:E221
+
+
+ class FootnoteLabel(BaseModel):
+     category: Label
+     reason: str
+
+
+ classify_footnote_tool_prompt = (
+     "Classify the following Footnote.\n"
+     "Use the following categories:\n"
+     "- citation: Contains bibliographic information.\n"
+     "- explanation: Provides additional context.\n"
+     "- link: Includes URLs or references to online resources.\n"
+     "- editorial: Contains subjective remarks or corrections.\n"
+     "- other: The footnote does not fit into any of the above.\n"
+     "Provide a 'reason' for the chosen 'category'.\n"
+     "Return a JSON object with the expected schema:\n"
+     "{'category': [category], 'reason': [reason]}\n"
+     "Here's the footnote:\n"
+     "<<<\n"
+     "{{footnote_text}}\n"
+     ">>>\n"
+ )
+ classify_footnote_tool = create_prompt_tool(
+     template=classify_footnote_tool_prompt,
+     output_schema=FootnoteLabel,
+     model="mistral-small-latest",
+     temperature=0,
+ )
+
+
+ # Footnote Replacement Tool
+ class SingleFootnoteChange(BaseModel):
+     text_to_replace: str
+     replacement_text: str
+
+
+ footnote_insertion_instruction_prompt = (
+     "You are a text-cleaning assistant. "
+     "We will provide you a markdown and details about a footnote. "
+     "You must generate minimal edits to inline that footnote. "
+     "Only output a single JSON instruction in the form:\n"
+     "{'text_to_replace': str, 'replacement_text': str}\n"
+     "Do NOT encapsulate the JSON in a list. "
+     "Do NOT rewrite lines that do not contain the footnote. "
+     "Do NOT provide any other text or commentary.\n\n"
+     "Rules:\n"
+     "1. Inline the footnote right after the usage text, "
+     "replacing the footnote symbol.\n"
+     "2. Keep everything else exactly as is.\n"
+     "Example:\n"
+     "Input:\n"
+     "Footnote:\n"
+     "- Footnote text: *Hej means hello in Swedish\n"
+     "Text:\n"
+     "<<<\n"
+     "Hej*, said the nice old lady. She was wearing an apron.\n"
+     "*Hej means hello in Swedish\n"
+     ">>>\n"
+     "Output:\n"
+     "{'text_to_replace': 'Hej*', "
+     "'replacement_text': 'Hej (Hej means hello in Swedish)'}\n"
+     "Now process this text:\n"
+     "<<<\n"
+     "Footnote:\n"
+     "- Footnote text: {{footnote_text}}\n"
+     "Text:\n"
+     "<<<\n"
+     "{{markdown}}\n"
+     ">>>"
+ )
+ footnote_insertion_instruction_tool = create_prompt_tool(
+     template=footnote_insertion_instruction_prompt,
+     output_schema=SingleFootnoteChange,
+     model="mistral-small-latest",
+     temperature=0,
+ )
+
+
+ def replace_footnote_inline_operation(
+     api_key: str,
+     footnote: dict,
+     markdown: str,
+     usage_tracker: TokenUsageTracker | None = None
+ ) -> str:
+     """
+     Given a footnote and the page's markdown text,
+     perform an inline replacement using a 'diff/instructions' approach
+     to ensure no unintended text changes occur.
+
+     Steps:
+     1) Prompt the LLM to output structured edit instructions
+        (text_to_replace, replacement_text).
+     2) Apply those instructions to the original text.
+
+     Args:
+         api_key: Mistral API key
+         footnote (dict): {
+             'footnote_symbol': '*',
+             'footnote_text': 'Corresponding author',
+             'usage_text': 'James Andrews*',
+             'category': 'other',
+             'details': 'Footnote indicating that James Andrews
+                 is a corresponding author.'
+         }
+         markdown (str): OCRed page.
+
+     Returns:
+         str: The updated text, with the footnote properly inlined
+         (and footnote lines removed) without altering other content.
+     """
+
+     footnote_symbol = footnote.get("footnote_symbol", "")
+     footnote_text = f"{footnote_symbol}{footnote.get('footnote_text', '')}"
+
+     replace_instruction = footnote_insertion_instruction_tool(
+         api_key=api_key,
+         footnote_text=footnote_text,
+         markdown=markdown,
+         usage_tracker=usage_tracker
+     )
+     markdown = markdown.replace(
+         replace_instruction.text_to_replace,
+         replace_instruction.replacement_text,
+     )
+
+     return markdown
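
A minimal usage sketch for the function above, assuming a valid Mistral API key (the key string and sample page below are placeholders, and the footnote dict follows the shape documented in the docstring):

from ragbandit.prompt_tools.footnotes_processor_tools import (
    replace_footnote_inline_operation,
)

page_markdown = (
    "Hej*, said the nice old lady. She was wearing an apron.\n"
    "*Hej means hello in Swedish\n"
)
footnote = {
    "footnote_symbol": "*",
    "footnote_text": "Hej means hello in Swedish",
}
cleaned = replace_footnote_inline_operation(
    api_key="YOUR_MISTRAL_API_KEY",  # placeholder, not a real key
    footnote=footnote,
    markdown=page_markdown,
)
print(cleaned)  # the footnote should now be inlined after 'Hej'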
ragbandit/prompt_tools/prompt_tool.py ADDED
@@ -0,0 +1,118 @@
+ """
+ Utilities for creating LLM-powered tools based on prompt templates.
+ """
+
+ from typing import Generic, TypeVar, Callable
+
+ from pydantic import BaseModel
+ from ragbandit.utils.llm_utils import query_llm
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+ T = TypeVar("T", bound=BaseModel)
+
+
+ class PromptTool(Generic[T]):
+     """A tool that uses a prompt template to query an
+     LLM and return structured data."""
+
+     def __init__(
+         self,
+         template: str,
+         output_schema: type[T],
+         model: str = "mistral-small-latest",
+         temperature: float = 0,
+         preprocess_fn: Callable[[dict[str, object]], dict[str, object]] | None = None,
+         postprocess_fn: Callable[[T], object] | None = None,
+     ):
+         """Initialize a new prompt-based tool.
+
+         Args:
+             template: String template with {{variable}} placeholders
+             output_schema: Pydantic model for response validation
+             model: LLM model to use
+             temperature: Sampling temperature
+             preprocess_fn: Optional function to preprocess variables
+                 before formatting
+             postprocess_fn: Optional function to process the result
+                 after LLM response
+         """
+         self.template = template
+         self.output_schema = output_schema
+         self.model = model
+         self.temperature = temperature
+         self.preprocess_fn = preprocess_fn or (lambda x: x)
+         self.postprocess_fn = postprocess_fn or (lambda x: x)
+
+     def format_prompt(self, **kwargs) -> str:
+         """Format the template with the provided variables.
+
+         This method handles variable substitution in the template.
+         """
+         processed_kwargs = self.preprocess_fn(kwargs)
+
+         # Simple string replacement approach - more reliable than format()
+         result = self.template
+         for key, value in processed_kwargs.items():
+             placeholder = "{{" + key + "}}"
+             result = result.replace(placeholder, str(value))
+
+         return result
+
+     def __call__(
+         self,
+         api_key: str,
+         usage_tracker: TokenUsageTracker | None = None,
+         **kwargs
+     ) -> object:
+         """Execute the tool with the given variables.
+
+         Args:
+             api_key: Mistral API key for authentication
+             usage_tracker: Optional token usage tracker
+             **kwargs: Variables to substitute in the prompt template
+
+         Returns:
+             Processed result from the LLM
+
+         This makes the tool callable like a function, e.g.:
+         result = my_tool(api_key="your_api_key", var1="value", var2="value2")
+         """
+         # Format the prompt with variables
+         prompt = self.format_prompt(**kwargs)
+
+         # Query the LLM
+         result = query_llm(
+             prompt=prompt,
+             output_schema=self.output_schema,
+             api_key=api_key,
+             usage_tracker=usage_tracker,
+             model=self.model,
+             temperature=self.temperature,
+         )
+
+         # Apply any post-processing
+         return self.postprocess_fn(result)
+
+
+ # Helper function to create a tool more easily
+ def create_prompt_tool(
+     template: str,
+     output_schema: type[T],
+     model: str = "mistral-small-latest",
+     temperature: float = 0,
+     preprocess_fn: Callable[[dict[str, object]], dict[str, object]] | None = None,
+     postprocess_fn: Callable[[T], object] | None = None,
+ ) -> PromptTool[T]:
+     """Create a new prompt-based tool with the given template and schema.
+
+     Note: When calling the returned tool,
+     you must provide an api_key parameter.
+     """
+     return PromptTool(
+         template=template,
+         output_schema=output_schema,
+         model=model,
+         temperature=temperature,
+         preprocess_fn=preprocess_fn,
+         postprocess_fn=postprocess_fn,
+     )
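
A minimal sketch of defining and calling such a tool; the Sentiment schema and template here are invented for illustration, and a valid Mistral API key is assumed:

from pydantic import BaseModel
from ragbandit.prompt_tools.prompt_tool import create_prompt_tool


class Sentiment(BaseModel):
    sentiment: str


sentiment_tool = create_prompt_tool(
    template=(
        "Classify the sentiment of the following text as "
        "'positive', 'negative', or 'neutral'. "
        "Return a JSON object with a single key 'sentiment'.\n"
        "Text: {{text}}"
    ),
    output_schema=Sentiment,
)
# postprocess_fn defaults to identity, so this returns a Sentiment instance
result = sentiment_tool(api_key="YOUR_MISTRAL_API_KEY", text="What a great day!")
print(result.sentiment)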
ragbandit/prompt_tools/references_processor_tools.py ADDED
@@ -0,0 +1,31 @@
+ from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
+ from pydantic import BaseModel
+
+
+ class ReferencesHeader(BaseModel):
+     references_header: str
+
+
+ references_tool_prompt = (
+     "You are an expert at identifying the references section "
+     "of a document. You will be given a list of headers. "
+     "Identify the header that represents the references section "
+     "(e.g., 'References', 'Bibliography', 'Sources', etc.). "
+     "Return a JSON object with a single key 'references_header' "
+     "containing the identified header. "
+     "If no references header is found, return an empty string.\n"
+     "The available headers are provided below (enclosed in <<< and >>>):\n"
+     "<<<\n"
+     "{{headers}}"
+     "\n>>>"
+ )
+ detect_references_header_tool = create_prompt_tool(
+     template=references_tool_prompt,
+     output_schema=ReferencesHeader,
+     model="mistral-medium-latest",
+     temperature=0,
+     # Optional preprocessing function to join headers
+     preprocess_fn=lambda kwargs: {
+         "headers": "\n".join(kwargs["headers_list"])
+     },
+ )
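
Note that the preprocess_fn above expects a headers_list keyword and joins it into the {{headers}} placeholder. A usage sketch with an invented header list and a placeholder API key:

headers = ["Introduction", "Methods", "Results", "Bibliography"]
result = detect_references_header_tool(
    api_key="YOUR_MISTRAL_API_KEY",  # placeholder
    headers_list=headers,  # joined into {{headers}} by preprocess_fn
)
print(result.references_header)  # likely 'Bibliography'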
ragbandit/prompt_tools/semantic_chunker_tools.py ADDED
@@ -0,0 +1,56 @@
+ from ragbandit.prompt_tools.prompt_tool import create_prompt_tool
+ from pydantic import BaseModel
+
+
+ class SemanticBreak(BaseModel):
+     semantic_break: str
+
+
+ semantic_break_tool_prompt = (
+     "EXAMPLE TEXT:\n"
+     "Once upon a time in a faraway land, a brave knight set forth "
+     "on a quest to rescue the princess. \n"
+     "He traveled through forests and mountains, encountering "
+     "strange creatures along the way. \n"
+     "Finally, he reached the dragon's lair. "
+     "![img-0.jpeg](img-0.jpeg)(Image description: A large dragon "
+     "perched on a rocky ledge.)\n"
+     "The knight prepared for battle, sword in hand.\n"
+     "\n"
+     "Instruction:\n"
+     "1. We want to split the text into coherent chunks. "
+     "The first chunk begins at the start of the text.\n"
+     "2. Identify where the next chunk should begin—that is, "
+     "find the point at which the first chunk naturally ends "
+     "(thematic break), and the second chunk begins.\n"
+     "3. Return ONLY a short snippet of text (up to ~30 characters) "
+     "that marks the beginning of the next chunk. "
+     "For example, if the next chunk starts at the word 'Finally,' "
+     "return 'Finally, he reached the dragon's lair.' "
+     "(truncated if necessary).\n"
+     "4. If the entire text above is just one cohesive chunk "
+     "with no good break, return \"NO_BREAK\".\n"
+     "5. Do not split inside any "
+     "![img-0.jpeg](img-0.jpeg)(Image description: ...) text. "
+     "Keep these intact.\n"
+     "6. Do not output any additional commentary—"
+     "just provide the snippet or NO_BREAK.\n"
+     "7. Your output should be a JSON object containing "
+     "a single key 'semantic_break' with the text snippet "
+     "for the semantic break.\n"
+     "Now find the next semantic break in this text:\n"
+     "{{text}}\n"
+ )
+
+
+ def return_break_string(result: SemanticBreak) -> str:
+     return result.semantic_break
+
+
+ find_semantic_break_tool = create_prompt_tool(
+     template=semantic_break_tool_prompt,
+     output_schema=SemanticBreak,
+     model="mistral-small-latest",
+     temperature=0,
+     postprocess_fn=return_break_string
+ )
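
Because postprocess_fn=return_break_string, the tool returns the snippet string directly rather than a SemanticBreak instance. A usage sketch with invented text and a placeholder API key:

text = (
    "The report covers quarterly revenue in detail. "
    "Revenue grew 12% year over year. "
    "Separately, the company announced a new office in Lisbon."
)
break_snippet = find_semantic_break_tool(
    api_key="YOUR_MISTRAL_API_KEY",  # placeholder
    text=text,
)
# 'NO_BREAK' if the model sees one cohesive chunk, else a short snippet
print(break_snippet)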
ragbandit/schema.py ADDED
@@ -0,0 +1,206 @@
+ """Base schema for data structures."""
+ from pydantic import BaseModel
+ from datetime import datetime
+ from enum import Enum
+
+ ##########################################
+ # ************* V2 Schema ************** #
+ ##########################################
+
+ ##########################################
+ #                Metrics                 #
+ ##########################################
+
+
+ class TokenUsageMetrics(BaseModel):
+     """Aggregated token/cost usage metrics returned by TokenUsageTracker."""
+     total_calls: int
+     total_input_tokens: int
+     total_output_tokens: int
+     total_embedding_tokens: int
+     total_tokens: int
+     total_cost_usd: float
+
+     class ModelUsage(BaseModel):
+         calls: int
+         input_tokens: int | None = 0
+         output_tokens: int | None = 0
+         embedding_tokens: int | None = 0
+         cost: float
+
+     models: dict[str, ModelUsage]
+
+
+ class PagesProcessedMetrics(BaseModel):
+     """Metrics for pages processed and associated cost."""
+     pages_processed: int
+     cost_per_page: float
+     total_cost_usd: float
+
+
+ class TimingMetrics(BaseModel):
+     """Metrics for pipeline step durations in seconds."""
+     total_duration: float | None = None
+     ocr: float | None = None
+     # processing_steps: list[dict[str, float]] | None = None
+     processing: float | None = None
+     chunking: float | None = None
+     embedding: float | None = None
+
+ ##########################################
+ #                  OCR                   #
+ ##########################################
+
+
+ class PageDimensions(BaseModel):
+     dpi: int
+     height: int
+     width: int
+
+
+ class Image(BaseModel):
+     """Represents an image extracted from a page."""
+     id: str  # e.g., 'img-01.jpg'
+     top_left_x: int | None = None
+     top_left_y: int | None = None
+     bottom_right_x: int | None = None
+     bottom_right_y: int | None = None
+     image_base64: str
+     image_annotation: str | None = None  # JSON string
+
+
+ class BasePage(BaseModel):
+     """Base schema for a single page of a document."""
+     index: int  # Page number
+     markdown: str
+     images: list[Image] | None = None
+     dimensions: PageDimensions
+
+
+ class OCRPage(BasePage):
+     """Represents a single page from an OCR result."""
+     pass
+
+
+ class OCRUsageInfo(BaseModel):
+     pages_processed: int
+     doc_size_bytes: int
+
+
+ class OCRResult(BaseModel):
+     """Represents the output of the OCR process."""
+     source_file_path: str
+     processed_at: datetime
+     model: str
+     document_annotation: str | None = None
+     pages: list[OCRPage]
+     usage_info: OCRUsageInfo
+     # Metrics for OCR; can include token-usage or page-processing metrics
+     metrics: list[TokenUsageMetrics | PagesProcessedMetrics] | None = None
+
+ ##########################################
+ #               Processing               #
+ ##########################################
+
+
+ class ProcessedPage(BasePage):
+     """Represents a single page after text processors have been applied."""
+     pass
+
+
+ class ProcessingTraceItem(BaseModel):
+     """Trace of a single processor's execution."""
+     step_name: str  # Name of the step in the processing
+     summary: str
+     duration: float  # Duration in seconds
+
+
+ class ProcessingResult(BaseModel):
+     """Represents the output of the text processors."""
+     processor_name: str
+     processed_at: datetime
+     pages: list[ProcessedPage]  # The text content, now structured per page
+     processing_trace: list[ProcessingTraceItem]
+     extracted_data: dict[str, object]  # For footnotes, references, etc.
+     processing_duration: float | None = None
+     metrics: TokenUsageMetrics | None = None
+
+ ##########################################
+ #                Chunking                #
+ ##########################################
+
+
+ class ChunkMetadata(BaseModel):
+     """Metadata associated with a chunk."""
+     page_index: int
+     source_references: list[str] | None = None
+     footnotes: list[dict] | None = None
+     images: list[Image] | None = None
+     extra: dict[str, object] = {}
+
+
+ class Chunk(BaseModel):
+     """Represents a chunk of text, ready for embedding."""
+     text: str
+     metadata: ChunkMetadata
+
+
+ class ChunkingResult(BaseModel):
+     """Represents the output of the chunking process."""
+     processed_at: datetime
+     chunks: list[Chunk]
+     metrics: TokenUsageMetrics | None = None  # If chunker uses an LLM
+
+
+ ##########################################
+ #               Embedding                #
+ ##########################################
+
+
+ class ChunkWithEmbedding(Chunk):
+     """Represents a chunk that has been embedded."""
+     embedding: list[float]
+     embedding_model: str
+
+
+ class EmbeddingResult(BaseModel):
+     """Represents the output of the embedding process."""
+     processed_at: datetime | None = None
+     chunks_with_embeddings: list[ChunkWithEmbedding]
+     model_name: str
+     metrics: TokenUsageMetrics | None = None
+
+ ##########################################
+ #           Document Pipeline            #
+ ##########################################
+
+
+ class StepStatus(str, Enum):
+     success = "success"
+     failed = "failed"
+     skipped = "skipped"
+
+
+ class StepReport(BaseModel):
+     ocr: StepStatus | None = None
+     processing: StepStatus | None = None
+     chunking: StepStatus | None = None
+     embedding: StepStatus | None = None
+
+
+ class DocumentPipelineResult(BaseModel):
+     """The composite result for an end-to-end pipeline run."""
+     source_file_path: str
+     processed_at: datetime
+     pipeline_config: dict
+     timings: TimingMetrics
+     total_metrics: (
+         list[TokenUsageMetrics | PagesProcessedMetrics] | None
+     ) = None
+     total_cost_usd: float | None = None
+     ocr_result: OCRResult | None = None
+     processing_results: list[ProcessingResult] | None = None
+     chunking_result: ChunkingResult | None = None
+     embedding_result: EmbeddingResult | None = None
+     step_report: StepReport
+     logs: str | None = None
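
A short sketch of how these models nest, constructing a ChunkingResult by hand (illustrative values only):

from datetime import datetime
from ragbandit.schema import Chunk, ChunkMetadata, ChunkingResult

chunk = Chunk(
    text="The knight prepared for battle, sword in hand.",
    metadata=ChunkMetadata(page_index=0),  # only page_index is required
)
result = ChunkingResult(processed_at=datetime.now(), chunks=[chunk])
print(result.chunks[0].metadata.page_index)  # 0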
ragbandit/utils/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Utility functions and classes for the ragbandit package.
+
+ This module provides various utilities used throughout the package.
+ """
+
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+ from ragbandit.utils.in_memory_log_handler import InMemoryLogHandler
+ from ragbandit.utils.mistral_client import (
+     MistralClientManager,
+     mistral_client_manager
+ )
+
+ __all__ = [
+     "TokenUsageTracker",
+     "InMemoryLogHandler",
+     "MistralClientManager",  # The class
+     "mistral_client_manager"  # The instance
+ ]
ragbandit/utils/in_memory_log_handler.py ADDED
@@ -0,0 +1,26 @@
+ import logging
+ import io
+
+
+ class InMemoryLogHandler(logging.Handler):
+     """
+     Collects every formatted log record that flows through it
+     into an in-memory buffer. Thread-safe because `logging`
+     already locks `emit()`.
+     """
+     def __init__(self, level=logging.INFO,
+                  fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s"):
+         super().__init__(level)
+         self.buffer = io.StringIO()
+         self.setFormatter(logging.Formatter(fmt))
+
+     def emit(self, record):
+         # Append the formatted record to the in-memory buffer only;
+         # leftover debug print() calls to stdout have been removed.
+         self.buffer.write(self.format(record) + "\n")
+
+     def dump(self) -> str:
+         """Return the whole transcript so far."""
+         return self.buffer.getvalue()
+
+     def clear(self) -> None:
+         self.buffer.truncate(0)
+         self.buffer.seek(0)
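
A brief usage sketch, assuming only the standard logging module and the export from ragbandit.utils shown above (the logger name is invented):

import logging
from ragbandit.utils import InMemoryLogHandler

handler = InMemoryLogHandler(level=logging.INFO)
logger = logging.getLogger("ragbandit.demo")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

logger.info("pipeline started")
print(handler.dump())   # full formatted transcript so far
handler.clear()         # reset the buffer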