ragbandit_core-0.1.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
ragbandit/utils/llm_utils.py
@@ -0,0 +1,188 @@
+ """
+ Utility functions for interacting with LLM services.
+ 
+ This module provides standardized ways to make LLM requests with
+ consistent error handling, retries, and response parsing.
+ """
+ 
+ import json
+ import time
+ import logging
+ import requests
+ from typing import Type, TypeVar
+ from pydantic import BaseModel
+ from ragbandit.utils.mistral_client import mistral_client_manager
+ from ragbandit.config.llms import (
+     DEFAULT_MODEL,
+     DEFAULT_TEMPERATURE,
+     DEFAULT_MAX_RETRIES,
+     DEFAULT_RETRY_DELAY,
+     DEFAULT_BACKOFF_FACTOR,
+ )
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker, count_tokens
+ 
+ # Configure logger
+ logger = logging.getLogger(__name__)
+ 
+ # Type variable for Pydantic model return types
+ T = TypeVar("T", bound=BaseModel)
+ 
+ 
+ def query_llm(
+     prompt: str,
+     output_schema: Type[T],
+     api_key: str,
+     usage_tracker: TokenUsageTracker | None = None,
+     model: str = DEFAULT_MODEL,
+     temperature: float = DEFAULT_TEMPERATURE,
+     max_retries: int = DEFAULT_MAX_RETRIES,
+     retry_delay: float = DEFAULT_RETRY_DELAY,
+     backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
+     track_usage: bool = True,
+ ) -> T:
+     """
+     Send a query to the LLM with standardized formatting and retry logic.
+ 
+     Args:
+         prompt: The prompt to send to the LLM
+         output_schema: Pydantic model class for response validation and parsing
+         api_key: API key to use for the request
+         usage_tracker: Optional custom token usage tracker for
+             document-specific tracking. If None, no tracking will be
+             performed even if track_usage is True.
+         model: Model name to use for the request
+         temperature: Sampling temperature (0 = deterministic)
+         max_retries: Maximum number of retry attempts
+         retry_delay: Initial delay between retries in seconds
+         backoff_factor: Multiplier for delay on each retry attempt
+         track_usage: Whether to track token usage and costs
+ 
+     Returns:
+         Validated instance of the output_schema model
+ 
+     Raises:
+         ValueError: If response cannot be parsed according to schema
+         RuntimeError: If all retry attempts fail
+     """
+     retry_count = 0
+     current_delay = retry_delay
+ 
+     # Only track usage if both conditions are met:
+     # 1. User wants to track usage (track_usage=True)
+     # 2. We have a tracker to use (usage_tracker is not None)
+     should_track = track_usage and usage_tracker is not None
+ 
+     # Count input tokens if tracking is enabled
+     input_tokens = 0
+     if should_track:
+         # Count tokens in the prompt
+         input_tokens = count_tokens(prompt, model)
+         logger.debug(f"Input tokens: {input_tokens} for model {model}")
+ 
+     while retry_count <= max_retries:
+         try:
+             # Make the API request
+             client = mistral_client_manager.get_client(api_key)
+             chat_response = client.chat.complete(
+                 model=model,
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": prompt,
+                     },
+                 ],
+                 response_format={
+                     "type": "json_object",
+                     "schema": output_schema.model_json_schema(),
+                 },
+                 temperature=temperature,
+             )
+ 
+             # Parse and validate the response
+             response_content = chat_response.choices[0].message.content
+             response_dict = json.loads(response_content)
+ 
+             # Track token usage if enabled
+             if should_track and hasattr(chat_response, 'usage'):
+                 # Get token counts from the API response
+                 output_tokens = chat_response.usage.completion_tokens
+                 actual_input_tokens = chat_response.usage.prompt_tokens
+ 
+                 # Use the actual input tokens from the API if available
+                 if actual_input_tokens > 0:
+                     input_tokens = actual_input_tokens
+ 
+                 # Log token usage
+                 logger.debug(
+                     f"Token usage - Input: {input_tokens}, "
+                     f"Output: {output_tokens}, "
+                     f"Model: {model}"
+                 )
+ 
+                 # Track in the provided usage tracker
+                 usage_tracker.add_usage(input_tokens, output_tokens, model)
+             elif should_track:
+                 # If the API doesn't return usage stats, estimate output tokens
+                 output_tokens = count_tokens(response_content, model)
+                 usage_tracker.add_usage(input_tokens, output_tokens, model)
+                 logger.debug(
+                     f"Estimated token usage - Input: {input_tokens}, "
+                     f"Output: {output_tokens}, "
+                     f"Model: {model}"
+                 )
+ 
+             return output_schema.model_validate(response_dict)
+ 
+         except (requests.RequestException, TimeoutError, ConnectionError) as e:
+             # Handle network-related errors (timeouts, connection issues)
+             retry_count += 1
+ 
+             # If we've exhausted retries, raise the error
+             if retry_count > max_retries:
+                 logger.error(f"Failed after {max_retries} retries: {str(e)}")
+                 raise RuntimeError(
+                     f"LLM request failed after {max_retries} retries: {str(e)}"
+                 )
+ 
+             # Log the error and retry
+             logger.warning(
+                 "LLM request failed "
+                 f"(attempt {retry_count}/{max_retries}): {str(e)}. "
+                 f"Retrying in {current_delay} seconds..."
+             )
+             time.sleep(current_delay)
+             current_delay *= backoff_factor
+ 
+         except Exception as e:
+             # Handle other API errors (rate limits, server errors)
+             if "429" in str(e) or "too many requests" in str(e).lower():
+                 # Rate limiting error - retry with backoff
+                 retry_count += 1
+ 
+                 if retry_count > max_retries:
+                     logger.error(
+                         f"Rate limit exceeded after {max_retries} "
+                         f"retries: {str(e)}"
+                     )
+                     raise RuntimeError(
+                         f"Rate limit exceeded after {max_retries} retries"
+                     )
+ 
+                 logger.warning(
+                     f"Rate limit hit (attempt {retry_count}/{max_retries}). "
+                     f"Retrying in {current_delay} seconds..."
+                 )
+                 time.sleep(current_delay)
+                 # More aggressive backoff for rate limits
+                 current_delay *= backoff_factor * 2
+             else:
+                 # Other API errors - don't retry
+                 logger.error(f"API error: {str(e)}")
+                 raise RuntimeError(f"LLM API error: {str(e)}")
+ 
+     # This should never be reached due to the exceptions raised in the loop
+     raise RuntimeError("Unexpected error in LLM request retry loop")
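
With `retry_delay=1.0` and `backoff_factor=2.0`, for example, network-error retries would wait 1 s, 2 s, 4 s, and so on. Below is a minimal usage sketch of `query_llm`; the `Sentiment` schema, prompt text, and API key are hypothetical placeholders, while `query_llm` and `TokenUsageTracker` come from the modules in this diff:

```python
from pydantic import BaseModel

from ragbandit.utils.llm_utils import query_llm
from ragbandit.utils.token_usage_tracker import TokenUsageTracker


# Hypothetical output schema for illustration; any Pydantic model works
class Sentiment(BaseModel):
    label: str    # e.g. "positive" or "negative"
    score: float  # confidence in [0, 1]


tracker = TokenUsageTracker()
result = query_llm(
    prompt="Classify the sentiment of: 'I love this library!'",
    output_schema=Sentiment,
    api_key="YOUR_MISTRAL_API_KEY",  # placeholder
    usage_tracker=tracker,  # omit (or pass None) to skip tracking
)
print(result.label, result.score)  # validated Sentiment instance
tracker.log_summary()  # logs totals at INFO level
```
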
ragbandit/utils/mistral_client.py
@@ -0,0 +1,76 @@
+ """
+ Utility functions for working with the Mistral API.
+ 
+ This module provides helper functions for creating and managing
+ Mistral API client instances.
+ 
+ The module exports a singleton instance of MistralClientManager as
+ 'mistral_client_manager' that should be used throughout the application
+ to ensure consistent client caching and management.
+ """
+ 
+ from mistralai import Mistral
+ import logging
+ 
+ # Configure logger
+ logger = logging.getLogger(__name__)
+ 
+ 
+ class MistralClientManager:
+     """
+     Manager class for Mistral API clients.
+ 
+     This class caches and reuses Mistral client instances keyed by
+     API key, avoiding the need to create a new client for each request.
+     """
+ 
+     def __init__(self):
+         """Initialize an empty client cache."""
+         self._clients: dict[int, Mistral] = {}
+ 
+     def get_mistral_client(self, api_key: str) -> Mistral:
+         """
+         Create a configured Mistral client instance.
+ 
+         Returns:
+             Mistral: Configured client instance
+ 
+         Raises:
+             ValueError: If api_key is empty or None
+         """
+         if not api_key or not api_key.strip():
+             raise ValueError("Mistral API key cannot be empty or None")
+         return Mistral(api_key=api_key)
+ 
+     def get_client(self, api_key: str) -> Mistral:
+         """
+         Get a Mistral client for the given API key.
+ 
+         If a client with this API key already exists in the cache,
+         it will be reused. Otherwise, a new client will be created
+         and cached.
+ 
+         Args:
+             api_key: Mistral API key to use for authentication
+ 
+         Returns:
+             Mistral: A configured Mistral client instance
+ 
+         Raises:
+             ValueError: If api_key is empty or None
+         """
+         # Hash the API key to use as the dictionary key, so the raw
+         # key string is not duplicated as a cache key
+         key_hash = hash(api_key)
+ 
+         if key_hash not in self._clients:
+             # Create a new client and cache it
+             self._clients[key_hash] = self.get_mistral_client(api_key)
+ 
+         return self._clients[key_hash]
+ 
+ 
+ # Global instance of the client manager
+ mistral_client_manager = MistralClientManager()
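
A brief sketch of the intended use of the singleton above: repeated lookups with the same key return the same cached client instance (the key string is a placeholder):

```python
from ragbandit.utils.mistral_client import mistral_client_manager

api_key = "YOUR_MISTRAL_API_KEY"  # placeholder

# First call creates and caches a client; the second call reuses it
client_a = mistral_client_manager.get_client(api_key)
client_b = mistral_client_manager.get_client(api_key)
assert client_a is client_b  # same cached instance
```
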
ragbandit/utils/token_usage_tracker.py
@@ -0,0 +1,220 @@
+ """
+ Cost tracking utilities for LLM API calls.
+ 
+ This module provides functions to calculate token usage and costs
+ for different LLM models.
+ """
+ 
+ import logging
+ import tiktoken
+ from ragbandit.config.pricing import (
+     MODEL_COSTS,
+     EMBEDDING_COSTS,
+     DEFAULT_MODEL
+ )
+ from ragbandit.schema import TokenUsageMetrics
+ 
+ # Configure logger
+ logger = logging.getLogger(__name__)
+ 
+ 
+ def count_tokens(text: str, model: str = DEFAULT_MODEL) -> int:
+     """
+     Count the number of tokens in a text string for a specific model.
+ 
+     Args:
+         text: The text to count tokens for
+         model: The model to use for token counting
+ 
+     Returns:
+         int: Number of tokens
+     """
+     try:
+         # For Mistral models, use cl100k_base encoding (same as GPT-4)
+         encoding = tiktoken.get_encoding("cl100k_base")
+         return len(encoding.encode(text))
+     except Exception as e:
+         logger.warning(
+             f"Error counting tokens: {e}. Using character-based estimate."
+         )
+         # Fallback: rough estimate based on characters (1 token ≈ 4 chars)
+         return len(text) // 4
+ 
+ 
+ def calculate_cost(
+     input_tokens: int, output_tokens: int, model: str = DEFAULT_MODEL
+ ) -> tuple[float, dict[str, float]]:
+     """
+     Calculate the cost of an API call based on token usage.
+ 
+     Args:
+         input_tokens: Number of input tokens
+         output_tokens: Number of output tokens
+         model: Model name
+ 
+     Returns:
+         Tuple containing:
+         - Total cost in USD
+         - Dictionary with detailed cost breakdown
+     """
+     # Get cost rates, falling back to DEFAULT_MODEL rates if the
+     # model is not found
+     input_rate, output_rate = MODEL_COSTS.get(
+         model, MODEL_COSTS[DEFAULT_MODEL]
+     )
+ 
+     # Calculate costs (rates are per 1M tokens)
+     input_cost = (input_tokens / 1_000_000) * input_rate
+     output_cost = (output_tokens / 1_000_000) * output_rate
+     total_cost = input_cost + output_cost
+ 
+     cost_details = {
+         "model": model,
+         "input_tokens": input_tokens,
+         "output_tokens": output_tokens,
+         "total_tokens": input_tokens + output_tokens,
+         "input_cost_usd": input_cost,
+         "output_cost_usd": output_cost,
+         "total_cost_usd": total_cost,
+     }
+ 
+     return total_cost, cost_details
+ 
+ 
+ class TokenUsageTracker:
+     """Track token usage and costs across multiple API calls."""
+     total_input_tokens: int
+     total_output_tokens: int
+     total_embedding_tokens: int
+     total_cost: float
+     calls_by_model: dict[str, dict[str, int | float]]
+ 
+     def __init__(self):
+         self.total_input_tokens = 0
+         self.total_output_tokens = 0
+         self.total_embedding_tokens = 0
+         self.total_cost = 0.0
+         self.calls_by_model = {}
+ 
+     def add_usage(
+         self,
+         input_tokens: int,
+         output_tokens: int,
+         model: str = DEFAULT_MODEL,
+     ) -> None:
+         """
+         Add usage statistics from an API call.
+ 
+         Args:
+             input_tokens: Number of input tokens
+             output_tokens: Number of output tokens
+             model: Model name
+         """
+         cost, details = calculate_cost(input_tokens, output_tokens, model)
+ 
+         # Update totals
+         self.total_input_tokens += input_tokens
+         self.total_output_tokens += output_tokens
+         self.total_cost += cost
+ 
+         # Update per-model tracking
+         if model not in self.calls_by_model:
+             self.calls_by_model[model] = {
+                 "calls": 0,
+                 "input_tokens": 0,
+                 "output_tokens": 0,
+                 "cost": 0.0,
+             }
+ 
+         self.calls_by_model[model]["calls"] += 1
+         self.calls_by_model[model]["input_tokens"] += input_tokens
+         self.calls_by_model[model]["output_tokens"] += output_tokens
+         self.calls_by_model[model]["cost"] += cost
+ 
+     def add_embedding_tokens(self, tokens: int, model: str) -> None:
+         """
+         Add embedding token usage statistics.
+ 
+         Args:
+             tokens: Number of tokens processed for embedding
+             model: Embedding model name
+         """
+         # Calculate cost based on embedding model rates
+         # Default to 0.10 per 1M tokens if model not found
+         cost_per_million = EMBEDDING_COSTS.get(model, 0.10)
+         cost = (tokens / 1_000_000) * cost_per_million
+ 
+         # Update totals
+         self.total_embedding_tokens += tokens
+         self.total_cost += cost
+ 
+         # Update per-model tracking
+         if model not in self.calls_by_model:
+             self.calls_by_model[model] = {
+                 "calls": 0,
+                 "embedding_tokens": 0,
+                 "cost": 0.0,
+             }
+         elif "embedding_tokens" not in self.calls_by_model[model]:
+             # Add embedding_tokens field if it doesn't exist yet
+             self.calls_by_model[model]["embedding_tokens"] = 0
+ 
+         self.calls_by_model[model]["calls"] += 1
+         self.calls_by_model[model]["embedding_tokens"] += tokens
+         self.calls_by_model[model]["cost"] += cost
+ 
+     def get_summary(self) -> TokenUsageMetrics:
+         """
+         Get a summary of token usage and costs.
+ 
+         Returns:
+             TokenUsageMetrics object with usage summary
+         """
+         models_converted: dict[str, TokenUsageMetrics.ModelUsage] = {}
+         for model_name, stats in self.calls_by_model.items():
+             models_converted[model_name] = TokenUsageMetrics.ModelUsage(
+                 calls=int(stats.get("calls", 0)),
+                 input_tokens=int(stats.get("input_tokens", 0)),
+                 output_tokens=int(stats.get("output_tokens", 0)),
+                 embedding_tokens=int(stats.get("embedding_tokens", 0)),
+                 cost=float(stats.get("cost", 0.0)),
+             )
+ 
+         return TokenUsageMetrics(
+             total_calls=sum(m.calls for m in models_converted.values()),
+             total_input_tokens=self.total_input_tokens,
+             total_output_tokens=self.total_output_tokens,
+             total_embedding_tokens=self.total_embedding_tokens,
+             total_tokens=(
+                 self.total_input_tokens +
+                 self.total_output_tokens +
+                 self.total_embedding_tokens
+             ),
+             total_cost_usd=self.total_cost,
+             models=models_converted,
+         )
+ 
+     def log_summary(self, level: int = logging.INFO) -> None:
+         """Log a summary of token usage and costs."""
+         summary = self.get_summary()
+ 
+         # Build log message
+         message = f"API Usage: {summary.total_calls} calls, "
+ 
+         # Add LLM token counts if any
+         if summary.total_input_tokens > 0 or summary.total_output_tokens > 0:  # noqa
+             message += (
+                 f"LLM: {summary.total_input_tokens:,} input + "
+                 f"{summary.total_output_tokens:,} output tokens, "
+             )
+ 
+         # Add embedding token counts if any
+         if summary.total_embedding_tokens > 0:
+             message += f"Embeddings: {summary.total_embedding_tokens:,} tokens, "  # noqa
+ 
+         # Add total cost
+         message += f"Total: ${summary.total_cost_usd:.4f} USD"
+ 
+         logger.log(level, message)
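
As a quick illustration of the tracker above, assuming made-up token counts and whatever per-million rates are configured in `ragbandit.config.pricing` (the model names here are placeholders):

```python
from ragbandit.utils.token_usage_tracker import TokenUsageTracker

tracker = TokenUsageTracker()

# Hypothetical call volumes for illustration
tracker.add_usage(input_tokens=1_200, output_tokens=300, model="mistral-small-latest")
tracker.add_embedding_tokens(tokens=5_000, model="mistral-embed")

summary = tracker.get_summary()
print(summary.total_tokens)    # 6500 tokens across LLM + embedding calls
print(summary.total_cost_usd)  # depends on the configured per-1M-token rates
tracker.log_summary()          # one-line summary at INFO level
```
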
ragbandit_core-0.1.1.dist-info/METADATA
@@ -0,0 +1,145 @@
+ Metadata-Version: 2.4
+ Name: ragbandit-core
+ Version: 0.1.1
+ Summary: Core utilities for document processing, RAG configuration, querying, and evaluation.
+ Author-email: Martim Chaves <martim@ragbandit.com>
+ License: MIT
+ Project-URL: Homepage, https://github.com/MartimChaves/ragbandit-core
+ Project-URL: Documentation, https://github.com/MartimChaves/ragbandit-core#readme
+ Project-URL: Source, https://github.com/MartimChaves/ragbandit-core
+ Project-URL: Issues, https://github.com/MartimChaves/ragbandit-core/issues
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.md
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: llama-index>=0.12.52
+ Requires-Dist: mistralai>=1.7.0
+ Requires-Dist: ragas>=0.3.0
+ Requires-Dist: cryptography>=44.0.2
+ Dynamic: license-file
+ 
+ # ragbandit-core
+ 
+ Core utilities for:
+ 
+ * Document ingestion & processing (OCR, chunking, embedding)
+ * Building and running Retrieval-Augmented Generation (RAG) pipelines
+ * Evaluating answers with automated metrics
+ 
+ ## Quick start
+ 
+ ```bash
+ pip install ragbandit-core
+ ```
+ 
+ ```python
+ from ragbandit.documents import (
+     DocumentPipeline,
+     ReferencesProcessor,
+     FootnoteProcessor,
+     MistralOCRDocument,
+     MistralEmbedder,
+     SemanticChunker
+ )
+ import os
+ import logging
+ from dotenv import load_dotenv
+ load_dotenv()
+ 
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+ )
+ 
+ MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+ 
+ file_path = "./data/raw/[document_name].pdf"
+ 
+ doc_pipeline = DocumentPipeline(
+     chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
+     embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY),
+     ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
+     processors=[
+         ReferencesProcessor(api_key=MISTRAL_API_KEY),
+         FootnoteProcessor(api_key=MISTRAL_API_KEY),
+     ],
+ )
+ 
+ extended_response = doc_pipeline.process(file_path)
+ ```
+ 
+ ### Running Steps Manually
+ 
+ For more control, you can run each pipeline step independently:
+ 
+ ```python
+ from ragbandit.documents import (
+     DocumentPipeline,
+     ReferencesProcessor,
+     MistralOCRDocument,
+     MistralEmbedder,
+     SemanticChunker
+ )
+ import os
+ from dotenv import load_dotenv
+ load_dotenv()
+ 
+ MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+ file_path = "./data/raw/[document_name].pdf"
+ 
+ # Create a pipeline with only the components you need
+ pipeline = DocumentPipeline(
+     ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
+     processors=[ReferencesProcessor(api_key=MISTRAL_API_KEY)],
+     chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
+     embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY),
+ )
+ 
+ # Step 1: Run OCR
+ ocr_result = pipeline.run_ocr(file_path)
+ 
+ # Step 2: Run processors (optional)
+ processing_results = pipeline.run_processors(ocr_result)
+ final_doc = processing_results[-1]  # Get the last processor's output
+ 
+ # Step 3: Chunk the document
+ chunk_result = pipeline.run_chunker(final_doc)
+ 
+ # Step 4: Embed chunks
+ embedding_result = pipeline.run_embedder(chunk_result)
+ ```
+ 
+ You can also create separate pipelines for different steps:
+ 
+ ```python
+ # OCR-only pipeline
+ ocr_pipeline = DocumentPipeline(
+     ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY)
+ )
+ ocr_result = ocr_pipeline.run_ocr(file_path)
+ 
+ # Later, chunk with a different pipeline
+ chunk_pipeline = DocumentPipeline(
+     chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY)
+ )
+ chunks = chunk_pipeline.run_chunker(ocr_result)
+ ```
+ 
+ ## Package layout
+ 
+ ```
+ ragbandit-core/
+ ├── src/ragbandit/
+ │   ├── config/        # model defaults and pricing tables
+ │   ├── documents/     # document ingestion, OCR, chunking, embedding
+ │   ├── prompt_tools/  # prompt templates for processors and chunkers
+ │   ├── utils/         # LLM, client, and usage-tracking helpers
+ │   └── schema.py
+ └── tests/
+ ```
+ 
+ ## License
+ 
+ MIT
ragbandit_core-0.1.1.dist-info/RECORD
@@ -0,0 +1,38 @@
+ ragbandit/__init__.py,sha256=W3cOQHgNtVtHh9o0l2myI7Nv8QLfW-5r06ddLJuhhyM,718
+ ragbandit/schema.py,sha256=mGoqqluTCgZaAq4yQQxn4evDjr03gJRmEKT6Xn-F3wI,5859
+ ragbandit/config/__init__.py,sha256=Xr1-QgP6oUhBR6I3OHP4dgCR6lh2LHcNqwC6AsA2bNI,52
+ ragbandit/config/llms.py,sha256=HHB4BbzlgTy7kZxHJ77tpEPdZfhlOU8nlc4rYLw6v7w,816
+ ragbandit/config/pricing.py,sha256=0PE4WTsdRUyG3eABUP9Zhme6QpHwGD1RBYOqqSRzD4A,1143
+ ragbandit/documents/__init__.py,sha256=0r0zNIz_MzrzpKpLopUh0le8wCzI5xhi9CPjlelXz4Y,1254
+ ragbandit/documents/document_pipeline.py,sha256=bDmOjhj8mIbMii3ZTLHA5blzPvdMmD_wTpx319gAjME,12328
+ ragbandit/documents/chunkers/__init__.py,sha256=U2ptxUtW-e_OCl50kjg-YmKD2e1eGI6iPAWl8hG2W0s,464
+ ragbandit/documents/chunkers/base_chunker.py,sha256=9sCEqn-uVesvYOh2aNbjoN9-mkkAQxml8-nuonr5zUk,6616
+ ragbandit/documents/chunkers/fixed_size_chunker.py,sha256=--OQ5XVhATw4v_MEmfTJPkfUQ2fHXCJyYpWktWVi9JY,5764
+ ragbandit/documents/chunkers/semantic_chunker.py,sha256=67I2TjytMSB_LWloGWhesOB7Hu4RlYBUsr9T8A9SD2I,7188
+ ragbandit/documents/embedders/__init__.py,sha256=6do7BGP8rHCLvIoPIwcK5W751jl8spEfq2smgkoTK3o,418
+ ragbandit/documents/embedders/base_embedder.py,sha256=Bdbmhvi82JxRCBtaY6ZFIbmmPkHriTqtZvvRs6k3dfg,2287
+ ragbandit/documents/embedders/mistral_embedder.py,sha256=T0FPILc7PKIgWxV_lWQbqyP5_LElkuuGtDvn4-Ec6d8,4242
+ ragbandit/documents/ocr/__init__.py,sha256=Dg3R2ClL1fDOA4a6hY8F7gHiR1mIgL0tNSbo88NPsGE,336
+ ragbandit/documents/ocr/base_ocr.py,sha256=o3gTEg6WW88JDOAaKSKusqOGco4jQ9Q8nUl-zsStjMg,4375
+ ragbandit/documents/ocr/mistral_ocr.py,sha256=074LRKDEIH4PYoRPrwP_3dUboVByefIqnLniqclz3Bg,5327
+ ragbandit/documents/processors/__init__.py,sha256=ecWUqcNqSEuloXYczgfMPQwWb5vlehHM7s6jB3uEVwM,541
+ ragbandit/documents/processors/base_processor.py,sha256=wVqgIjKDUhaSt8oyktTvY0b8jL9OYHQNvcD0VZcn-Wg,3128
+ ragbandit/documents/processors/footnotes_processor.py,sha256=dRb6NXiRaXVMVKB7lijCofmZZUHqqFuyl7JunqEbqC8,13630
+ ragbandit/documents/processors/references_processor.py,sha256=Fmg9MwBO9fB7sjrZl2q_ESTqTXNdzikVi1TLMJTAC3U,14520
+ ragbandit/documents/utils/__init__.py,sha256=eS9AhbxRSJVnUU7JlHKNfJl3KNKCpLYTyl_Lo_SVaWU,244
+ ragbandit/documents/utils/secure_file_handler.py,sha256=XGNd5dBfoqhYWo-jxHQkCWEidhurNI7MA-yR7KQHpF4,3098
+ ragbandit/prompt_tools/__init__.py,sha256=PRnA0EF9yKSvErRxVs8esOJ4761UQjgtHvHdcjsaCB4,801
+ ragbandit/prompt_tools/footnotes_processor_tools.py,sha256=_SgKN0wn98eBNlM7WGMAkiTEUnFXm9w5X5f7KKapnK8,6098
+ ragbandit/prompt_tools/prompt_tool.py,sha256=lN8rEwWnuS3gW6c2l1IYadgRvBnCB0tiU5mkpeoxl3s,3824
+ ragbandit/prompt_tools/references_processor_tools.py,sha256=0AqKRrdKiUKWsS0EAnrGlXVDPZM_oLdsr52WriWekNA,1063
+ ragbandit/prompt_tools/semantic_chunker_tools.py,sha256=Y66-ttVhpyiHdq6He1D0vaaJthDc8pX45mDbn24ONrs,2088
+ ragbandit/utils/__init__.py,sha256=nKj-69XL3HjzbMNOcJdMpsJt0J8N2KhGdQ19V_jQR1g,522
+ ragbandit/utils/in_memory_log_handler.py,sha256=vMtCG-Wk9OwiCo2087nQovSIOQCu5ZWXg3lBf0hPEkk,1109
+ ragbandit/utils/llm_utils.py,sha256=7motkdeez9D_eEBemY6Mw_tZZCCyvt886GEYHv9ddvs,7079
+ ragbandit/utils/mistral_client.py,sha256=VkqFgquyjCmUllBjrHKqQnnmvU3yF0frvCSVIkHH-jQ,2195
+ ragbandit/utils/token_usage_tracker.py,sha256=CMAuJolcdJ258CVMkpNVhZAnQIGC-jTnxYkOr2jvp0M,7196
+ ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md,sha256=rZBctov8cSToljMmrdApur6WqyMIrX0KjkMKDpqx9w8,1070
+ ragbandit_core-0.1.1.dist-info/METADATA,sha256=FsGiCB_o8_vMQohVWsGN_o3gHzdM1usLA9Xw5zEi7bc,3924
+ ragbandit_core-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ ragbandit_core-0.1.1.dist-info/top_level.txt,sha256=UDjwZ4afIob8DIsuV6D08lU5bHCeN00grjXpzgDhsQ8,10
+ ragbandit_core-0.1.1.dist-info/RECORD,,
ragbandit_core-0.1.1.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+ 
ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md
@@ -0,0 +1,9 @@
+ MIT License
+ 
+ Copyright (c) 2025 Martim Chaves
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ 
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
ragbandit_core-0.1.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ ragbandit