ragbandit_core-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
ragbandit/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """ragbandit core package.
+
+ This package contains sub-modules for document processing,
+ RAG pipeline configuration/execution, and evaluation utilities.
+ Only lightweight interfaces and shared utilities are defined here;
+ heavy logic resides in sub-packages.
+ """
+
+ from importlib import metadata as _metadata
+
+ __version__: str
+ try:
+     __version__ = _metadata.version("ragbandit-core")
+ except _metadata.PackageNotFoundError:  # pragma: no cover
+     __version__ = "0.0.0+dev"
+
+ # Re-export public interfaces so that users can simply:
+ # from ragbandit import DocumentProcessor, RAGConfig, RAGPipeline, evaluate
+
+ # from ragbandit.documents import DocumentPipeline
+
+
+ __all__ = [
+     "__version__",
+     # "DocumentPipeline",
+ ]
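The try/except around importlib.metadata lets __version__ resolve to the installed distribution's version, with a dev fallback when the distribution metadata is not available (for example, when running from an uninstalled source checkout). A minimal sanity check, assuming the wheel is installed:

    import ragbandit

    # Prints "0.1.1" for this wheel; "0.0.0+dev" if the distribution is not installed.
    print(ragbandit.__version__)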
ragbandit/config/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """
+ Configuration module for ragbandit package.
+ """
ragbandit/config/llms.py ADDED
@@ -0,0 +1,34 @@
+ """
+ LLM configuration settings for ragbandit.
+
+ This module defines default settings and constants for LLM interactions.
+ """
+
+ # Default model settings
+ DEFAULT_MODEL = "mistral-small-latest"
+ DEFAULT_TEMPERATURE = 0.0
+
+ # Retry settings
+ DEFAULT_MAX_RETRIES = 3
+ DEFAULT_RETRY_DELAY = 1.0  # seconds
+ DEFAULT_BACKOFF_FACTOR = 2.0  # exponential backoff factor
+ DEFAULT_TIMEOUT = 30.0  # seconds
+
+ # Token limits
+ MAX_PROMPT_TOKENS = {
+     "mistral-small-latest": 8000,
+     "mistral-medium-latest": 32000,
+     "mistral-large-latest": 32000,
+     "gpt-3.5-turbo": 4096,
+     "gpt-4": 8192,
+     "gpt-4-turbo": 128000,
+ }
+
+ # System prompts
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful AI assistant."""
+
+ # Response formats
+ JSON_FORMAT_INSTRUCTION = """
+ Your response must be valid JSON that matches the following schema:
+ {schema}
+ """
ragbandit/config/pricing.py ADDED
@@ -0,0 +1,38 @@
+ """
+ Pricing configuration for LLM API calls.
+
+ This module contains pricing constants for various
+ LLM models and embedding models.
+ """
+
+ # Token cost rates per 1M tokens (in USD)
+ # Based on Mistral AI pricing as of July 2025
+ MODEL_COSTS = {
+     # Format: "model_name": (input_cost_per_1M, output_cost_per_1M)
+     "mistral-small-latest": (2.00, 6.00),
+     "mistral-medium-latest": (6.00, 18.00),
+     "mistral-large-latest": (12.00, 36.00),
+     # Add other models as needed
+ }
+
+ # Embedding model costs per 1M tokens
+ EMBEDDING_COSTS = {
+     # Format: "model_name": cost_per_1M_tokens
+     "mistral-embed": 0.10,
+     "text-embedding-3-small": 0.02,
+     "text-embedding-3-large": 0.13,
+     # Add other embedding models as needed
+ }
+
+ # OCR model costs per page (in EUR)
+ OCR_MODEL_COSTS = {
+     # Format: "model_name": cost_per_page
+     "mistral-ocr-latest": 0.001,  # 1 EUR per 1000 pages
+     # Add other OCR models as needed
+ }
+
+ # Default OCR model to use if the specified model is not in OCR_MODEL_COSTS
+ DEFAULT_OCR_MODEL = "mistral-ocr-latest"
+
+ # Default model to use if the specified model is not in MODEL_COSTS
+ DEFAULT_MODEL = "mistral-small-latest"
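Since MODEL_COSTS stores (input, output) rates per one million tokens, the cost of a single call follows directly from the token counts. A small worked example; the helper function is illustrative and not part of the package:

    from ragbandit.config.pricing import DEFAULT_MODEL, MODEL_COSTS

    def estimate_call_cost(model: str, input_tokens: int, output_tokens: int) -> float:
        # Fall back to the default model's rates if the model is not listed.
        in_rate, out_rate = MODEL_COSTS.get(model, MODEL_COSTS[DEFAULT_MODEL])
        return (input_tokens / 1_000_000) * in_rate + (output_tokens / 1_000_000) * out_rate

    # 12,000 prompt tokens + 1,000 completion tokens on mistral-small-latest:
    # 0.012 * 2.00 + 0.001 * 6.00 = 0.03 USD
    cost = estimate_call_cost("mistral-small-latest", 12_000, 1_000)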
ragbandit/documents/__init__.py ADDED
@@ -0,0 +1,66 @@
+ """
+ Document processing module for handling, analyzing, and transforming documents.
+
+ This package provides tools for OCR, chunking,
+ embedding, and processing documents.
+ """
+
+ # Import key components from subdirectories
+ from ragbandit.documents.document_pipeline import DocumentPipeline
+
+ # Import from chunkers
+ from ragbandit.documents.chunkers import (
+     BaseChunker,
+     FixedSizeChunker,
+     SemanticChunker,
+     SemanticBreak
+ )
+
+ # Import from processors
+ from ragbandit.documents.processors import (
+     BaseProcessor,
+     FootnoteProcessor,
+     ReferencesProcessor
+ )
+
+ # Import from embedders
+ from ragbandit.documents.embedders import (
+     BaseEmbedder,
+     MistralEmbedder
+ )
+
+ # Import from OCR
+ from ragbandit.documents.ocr import (
+     BaseOCR,
+     MistralOCRDocument
+ )
+
+ # Import from utils
+ from ragbandit.documents.utils import SecureFileHandler
+
+ __all__ = [
+     # Main pipeline
+     "DocumentPipeline",
+
+     # Chunkers
+     "BaseChunker",
+     "FixedSizeChunker",
+     "SemanticChunker",
+     "SemanticBreak",
+
+     # Processors
+     "BaseProcessor",
+     "FootnoteProcessor",
+     "ReferencesProcessor",
+
+     # Embedders
+     "BaseEmbedder",
+     "MistralEmbedder",
+
+     # OCR
+     "BaseOCR",
+     "MistralOCRDocument",
+
+     # Utils
+     "SecureFileHandler"
+ ]
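These imports and the matching __all__ flatten the sub-package layout, so downstream code can pull the main classes directly from ragbandit.documents rather than from the individual sub-modules, for example:

    from ragbandit.documents import DocumentPipeline, FixedSizeChunker, MistralEmbedder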
ragbandit/documents/chunkers/__init__.py ADDED
@@ -0,0 +1,18 @@
+ """
+ Chunker implementations for document processing.
+
+ This module provides various chunking strategies for documents.
+ """
+
+ from ragbandit.documents.chunkers.base_chunker import BaseChunker
+ from ragbandit.documents.chunkers.fixed_size_chunker import FixedSizeChunker
+ from ragbandit.documents.chunkers.semantic_chunker import (
+     SemanticChunker, SemanticBreak
+ )
+
+ __all__ = [
+     "BaseChunker",
+     "FixedSizeChunker",
+     "SemanticChunker",
+     "SemanticBreak"
+ ]
ragbandit/documents/chunkers/base_chunker.py ADDED
@@ -0,0 +1,201 @@
+ # ----------------------------------------------------------------------
+ # Standard library
+ import logging
+ import re
+ from abc import ABC, abstractmethod
+
+ # Project
+ from ragbandit.schema import (
+     ProcessingResult,
+     Chunk,
+     ChunkingResult,
+     Image,
+ )
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+
+ class BaseChunker(ABC):
+     """
+     Base class for document chunking strategies.
+     Subclasses should implement the `chunk()` method to
+     provide specific chunking logic.
+     """
+
+     def __init__(self, name: str | None = None, api_key: str | None = None):
+         """
+         Initialize the chunker.
+
+         Args:
+             name: Optional name for the chunker
+             api_key: API key for LLM services
+         """
+         # Hierarchical names make it easy to filter later:
+         # chunker.semantic, chunker.fixed_size, etc.
+         base = "chunker"
+         self.logger = logging.getLogger(
+             f"{base}.{name or self.__class__.__name__}"
+         )
+         self.api_key = api_key
+
+     @abstractmethod
+     def chunk(
+         self,
+         document: ProcessingResult,
+         usage_tracker: TokenUsageTracker | None = None,
+     ) -> ChunkingResult:
+         """
+         Chunk the document content from a ProcessingResult.
+
+         Args:
+             document: The ProcessingResult containing
+                 document content to chunk
+             usage_tracker: Optional tracker for token usage during chunking
+
+         Returns:
+             A `ChunkingResult` containing a list of `Chunk` objects and
+             optional metrics.
+         """
+         raise NotImplementedError
+
+     def merge_small_chunks(
+         self, chunks: list[Chunk], min_size: int
+     ) -> list[Chunk]:
+         """
+         Merge small chunks with adjacent chunks to ensure minimum chunk size.
+
+         Args:
+             chunks: The chunks to process
+             min_size: Minimum size for chunks (smaller chunks will be merged)
+
+         Returns:
+             Processed chunks with small chunks merged
+         """
+         if not chunks:
+             return []
+
+         merged = []
+         i = 0
+         n = len(chunks)
+
+         while i < n:
+             current_chunk = chunks[i]
+             current_text = current_chunk.text
+
+             # Check if this chunk is "small"
+             if len(current_text) < min_size:
+                 # 1) Try to merge with the NEXT chunk if same page_index
+                 next_chunk_exists = (i + 1) < n
+                 if next_chunk_exists:
+                     next_chunk_same_page = (
+                         chunks[i + 1].metadata.page_index
+                         == current_chunk.metadata.page_index
+                     )
+                 else:
+                     next_chunk_same_page = False
+
+                 if i < n - 1 and next_chunk_same_page:
+                     # Merge current with the next chunk
+                     current_chunk.text += (" " + chunks[i + 1].text)
+
+                     # Merge images if they exist
+                     if (
+                         current_chunk.metadata.images
+                         and chunks[i + 1].metadata.images
+                     ):
+                         current_chunk.metadata.images.extend(
+                             chunks[i + 1].metadata.images
+                         )
+
+                     # We've used chunk i+1, so skip it
+                     i += 2
+
+                     # Now this newly merged chunk is complete; add to 'merged'
+                     merged.append(current_chunk)
+                 else:
+                     # 2) Otherwise, try to merge with
+                     #    PREVIOUS chunk in 'merged'
+                     if merged:
+                         # Merge current chunk into the last chunk in 'merged'
+                         merged[-1].text += (" " + current_chunk.text)
+
+                         # Merge images if they exist
+                         if (
+                             merged[-1].metadata.images
+                             and current_chunk.metadata.images
+                         ):
+                             merged[-1].metadata.images.extend(
+                                 current_chunk.metadata.images
+                             )
+                     else:
+                         # If there's no previous chunk in 'merged', just add it
+                         merged.append(current_chunk)
+
+                     i += 1
+             else:
+                 # If it's not "small," just add it as-is
+                 merged.append(current_chunk)
+                 i += 1
+
+         return merged
+
+     def process_chunks(
+         self, chunks: list[Chunk]
+     ) -> list[Chunk]:
+         """
+         Optional post-processing of chunks after initial chunking.
+         This can be overridden by subclasses to
+         implement additional processing.
+
+         Args:
+             chunks: The initial chunks produced by the chunk method
+
+         Returns:
+             Processed chunks
+         """
+         return chunks
+
+     # ------------------------------------------------------------------
+     # Shared helpers
+     def attach_images(
+         self,
+         chunks: list[Chunk],
+         proc_result: ProcessingResult,
+     ) -> list[Chunk]:
+         """Populate each Chunk's metadata.images with inlined image data.
+
+         Looks for `![img-XX.jpeg](img-XX.jpeg)` markers inside the chunk text
+         and copies the matching `image_base64` from the corresponding page's
+         images collection.
+         """
+
+         img_pattern = re.compile(r"!\[img-\d+\.jpeg\]\(img-\d+\.jpeg\)")
+
+         for chunk in chunks:
+             images_in_chunk = img_pattern.findall(chunk.text)
+             if not images_in_chunk:
+                 # No image markers, ensure empty list and continue
+                 chunk.metadata.images = []
+                 continue
+
+             page_idx = chunk.metadata.page_index
+             rel_images = proc_result.pages[page_idx].images or []
+             chunk.metadata.images = []
+
+             for img_tag in images_in_chunk:
+                 img_id = img_tag.split("[")[1].split("]")[0]
+                 for ocr_img in rel_images:
+                     if ocr_img.id == img_id:
+                         chunk.metadata.images.append(
+                             Image(id=img_id, image_base64=ocr_img.image_base64)
+                         )
+                         break
+
+         return chunks
+
+     def __str__(self) -> str:
+         """Return a string representation of the chunker."""
+         return self.__class__.__name__
+
+     def __repr__(self) -> str:
+         """Return a string representation of the chunker."""
+         return f"{self.__class__.__name__}()"
ragbandit/documents/chunkers/fixed_size_chunker.py ADDED
@@ -0,0 +1,174 @@
+ from datetime import datetime, timezone
+
+ from ragbandit.schema import (
+     ProcessingResult,
+     Chunk,
+     ChunkMetadata,
+     ChunkingResult,
+ )
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+ from ragbandit.documents.chunkers.base_chunker import BaseChunker
+
+
+ class FixedSizeChunker(BaseChunker):
+     """
+     A document chunker that splits documents into fixed-size chunks
+     with optional overlap between chunks.
+     """
+
+     def __init__(
+         self,
+         chunk_size: int = 1000,
+         overlap: int = 200,
+         name: str | None = None,
+     ):
+         """
+         Initialize the fixed size chunker.
+
+         Args:
+             chunk_size: Target size for each chunk in characters
+             overlap: Number of characters to overlap between chunks
+             name: Optional name for the chunker
+         """
+         super().__init__(name)
+         self.chunk_size = chunk_size
+         self.overlap = overlap
+
+     def chunk(
+         self,
+         proc_result: ProcessingResult,
+         usage_tracker: TokenUsageTracker | None = None,
+     ) -> ChunkingResult:
+         """
+         Chunk the document into fixed-size chunks.
+
+         Args:
+             proc_result: The ProcessingResult containing
+                 document content to chunk
+             usage_tracker: Optional tracker for token usage
+                 (not used in this chunker)
+
+         Returns:
+             A ChunkingResult containing Chunk objects
+         """
+         # 1. Generate raw chunks for each page
+         chunks = self._fixed_size_chunk_pages(proc_result)
+
+         # 2. Attach any inline images using BaseChunker helper
+         chunks = self.attach_images(chunks, proc_result)
+
+         # 3. Merge small chunks if needed
+         chunks = self.process_chunks(chunks)
+
+         # 4. Wrap in ChunkingResult
+         return ChunkingResult(
+             processed_at=datetime.now(timezone.utc),
+             chunks=chunks,
+             metrics=None,
+         )
+
+     # ------------------------------------------------------------------
+     # Internal helpers
+     def _fixed_size_chunk_pages(
+         self, proc_result: ProcessingResult
+     ) -> list[Chunk]:
+         """Split each page into fixed-size chunks with optional overlap."""
+
+         self.logger.info(
+             f"Starting fixed-size chunking with size={self.chunk_size}, "
+             f"overlap={self.overlap}"
+         )
+
+         chunks: list[Chunk] = []
+
+         # Process each page
+         for page_index, page in enumerate(proc_result.pages):
+             page_text = page.markdown
+
+             # Skip empty pages
+             if not page_text.strip():
+                 continue
+
+             # Create chunks from this page
+             start = 0
+             while start < len(page_text):
+                 # Determine end position for this chunk
+                 end = min(start + self.chunk_size, len(page_text))
+
+                 # If we're not at the end of the text,
+                 # try to find a good break point
+                 if end < len(page_text):
+                     # Look for a period, question mark, or exclamation mark
+                     # followed by whitespace
+                     # within the last 100 characters of the chunk
+                     search_start = max(end - 100, start)
+                     for i in range(end, search_start, -1):
+                         # Check if we're at a valid position to examine
+                         if i <= 0 or i >= len(page_text):
+                             continue
+
+                         # Check if the previous character is punctuation
+                         # and the current character is whitespace
+                         if (
+                             page_text[i - 1] in [".", "!", "?"]
+                             and page_text[i].isspace()
+                         ):
+                             end = i
+                             break
+
+                 # Create the chunk
+                 chunk_text = page_text[start:end]
+                 meta = ChunkMetadata(
+                     page_index=page_index, images=[], extra={}
+                 )
+                 chunks.append(Chunk(text=chunk_text, metadata=meta))
+
+                 # Check if we've reached the end of the page text
+                 if end >= len(page_text):
+                     # We've processed the entire page, exit the loop
+                     break
+
+                 # Move to next chunk start position, accounting for overlap
+                 start = end - self.overlap
+
+                 # Make sure we're making progress
+                 if start <= 0 or start >= len(page_text):
+                     break
+
+         self.logger.info(
+             f"Fixed-size chunking complete. Created {len(chunks)} chunks."
+         )
+
+         return chunks
+
+     def process_chunks(
+         self, chunks: list[Chunk]
+     ) -> list[Chunk]:
+         """
+         Process chunks after initial chunking - merge small chunks if needed.
+
+         Args:
+             chunks: The initial chunks produced by the chunk method
+
+         Returns:
+             Processed chunks with small chunks merged if needed
+         """
+         if not chunks:
+             return chunks
+
+         # Calculate minimum chunk size as a fraction of the target chunk size
+         min_chunk_size = self.chunk_size // 2
+
+         # Check if any chunks are too small
+         min_len = min(len(c.text) for c in chunks)
+
+         # Merge small chunks if needed
+         if min_len < min_chunk_size:
+             self.logger.info(
+                 f"Found chunks smaller than {min_chunk_size} characters. "
+                 "Merging..."
+             )
+             chunks = self.merge_small_chunks(chunks, min_size=min_chunk_size)
+             self.logger.info(f"After merging: {len(chunks)} chunks")
+
+         return chunks
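Using the chunker end to end requires a ProcessingResult from the OCR stage (ragbandit.documents.ocr). A minimal usage sketch, assuming proc_result has already been produced by one of the OCR backends:

    from ragbandit.documents.chunkers import FixedSizeChunker

    chunker = FixedSizeChunker(chunk_size=800, overlap=100)
    result = chunker.chunk(proc_result)  # proc_result: ragbandit.schema.ProcessingResult

    for chunk in result.chunks:
        print(chunk.metadata.page_index, len(chunk.text))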