ragbandit-core 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbandit/__init__.py +26 -0
- ragbandit/config/__init__.py +3 -0
- ragbandit/config/llms.py +34 -0
- ragbandit/config/pricing.py +38 -0
- ragbandit/documents/__init__.py +66 -0
- ragbandit/documents/chunkers/__init__.py +18 -0
- ragbandit/documents/chunkers/base_chunker.py +201 -0
- ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
- ragbandit/documents/chunkers/semantic_chunker.py +205 -0
- ragbandit/documents/document_pipeline.py +350 -0
- ragbandit/documents/embedders/__init__.py +14 -0
- ragbandit/documents/embedders/base_embedder.py +82 -0
- ragbandit/documents/embedders/mistral_embedder.py +129 -0
- ragbandit/documents/ocr/__init__.py +13 -0
- ragbandit/documents/ocr/base_ocr.py +136 -0
- ragbandit/documents/ocr/mistral_ocr.py +147 -0
- ragbandit/documents/processors/__init__.py +16 -0
- ragbandit/documents/processors/base_processor.py +88 -0
- ragbandit/documents/processors/footnotes_processor.py +353 -0
- ragbandit/documents/processors/references_processor.py +408 -0
- ragbandit/documents/utils/__init__.py +11 -0
- ragbandit/documents/utils/secure_file_handler.py +95 -0
- ragbandit/prompt_tools/__init__.py +27 -0
- ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
- ragbandit/prompt_tools/prompt_tool.py +118 -0
- ragbandit/prompt_tools/references_processor_tools.py +31 -0
- ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
- ragbandit/schema.py +206 -0
- ragbandit/utils/__init__.py +19 -0
- ragbandit/utils/in_memory_log_handler.py +33 -0
- ragbandit/utils/llm_utils.py +188 -0
- ragbandit/utils/mistral_client.py +76 -0
- ragbandit/utils/token_usage_tracker.py +220 -0
- ragbandit_core-0.1.1.dist-info/METADATA +145 -0
- ragbandit_core-0.1.1.dist-info/RECORD +38 -0
- ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
- ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
- ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
ragbandit/__init__.py
ADDED
@@ -0,0 +1,26 @@
"""ragbandit core package.

This package contains sub-modules for document processing,
RAG pipeline configuration/execution, and evaluation utilities.
Only lightweight interfaces and shared utilities are defined here;
heavy logic resides in sub-packages.
"""

from importlib import metadata as _metadata

__version__: str
try:
    __version__ = _metadata.version("ragbandit-core")
except _metadata.PackageNotFoundError:  # pragma: no cover
    __version__ = "0.0.0+dev"

# Re-export public interfaces so that users can simply:
# from ragbandit import DocumentProcessor, RAGConfig, RAGPipeline, evaluate

# from ragbandit.documents import DocumentPipeline


__all__ = [
    "__version__",
    # "DocumentPipeline",
]
ragbandit/config/llms.py
ADDED
@@ -0,0 +1,34 @@
"""
LLM configuration settings for ragbandit.

This module defines default settings and constants for LLM interactions.
"""

# Default model settings
DEFAULT_MODEL = "mistral-small-latest"
DEFAULT_TEMPERATURE = 0.0

# Retry settings
DEFAULT_MAX_RETRIES = 3
DEFAULT_RETRY_DELAY = 1.0  # seconds
DEFAULT_BACKOFF_FACTOR = 2.0  # exponential backoff factor
DEFAULT_TIMEOUT = 30.0  # seconds

# Token limits
MAX_PROMPT_TOKENS = {
    "mistral-small-latest": 8000,
    "mistral-medium-latest": 32000,
    "mistral-large-latest": 32000,
    "gpt-3.5-turbo": 4096,
    "gpt-4": 8192,
    "gpt-4-turbo": 128000,
}

# System prompts
DEFAULT_SYSTEM_PROMPT = """You are a helpful AI assistant."""

# Response formats
JSON_FORMAT_INSTRUCTION = """
Your response must be valid JSON that matches the following schema:
{schema}
"""
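These constants are plain module-level values. As a minimal sketch of how a caller might consume them (the `build_prompt` helper and its rough 4-characters-per-token heuristic are hypothetical, not part of the package):

    from ragbandit.config.llms import (
        DEFAULT_MODEL, MAX_PROMPT_TOKENS, JSON_FORMAT_INSTRUCTION,
    )

    def build_prompt(task: str, schema: str, model: str = DEFAULT_MODEL) -> str:
        # Append the JSON-schema instruction to the task prompt.
        prompt = task + JSON_FORMAT_INSTRUCTION.format(schema=schema)
        # Rough length guard: assume ~4 characters per token (heuristic, not exact).
        limit = MAX_PROMPT_TOKENS.get(model)
        if limit is not None and len(prompt) // 4 > limit:
            raise ValueError(f"Prompt likely exceeds {limit} tokens for {model}")
        return prompt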
ragbandit/config/pricing.py
ADDED
@@ -0,0 +1,38 @@
"""
Pricing configuration for LLM API calls.

This module contains pricing constants for various
LLM models and embedding models.
"""

# Token cost rates per 1M tokens (in USD)
# Based on Mistral AI pricing as of July 2025
MODEL_COSTS = {
    # Format: "model_name": (input_cost_per_1M, output_cost_per_1M)
    "mistral-small-latest": (2.00, 6.00),
    "mistral-medium-latest": (6.00, 18.00),
    "mistral-large-latest": (12.00, 36.00),
    # Add other models as needed
}

# Embedding model costs per 1M tokens
EMBEDDING_COSTS = {
    # Format: "model_name": cost_per_1M_tokens
    "mistral-embed": 0.10,
    "text-embedding-3-small": 0.02,
    "text-embedding-3-large": 0.13,
    # Add other embedding models as needed
}

# OCR model costs per page (in EUR)
OCR_MODEL_COSTS = {
    # Format: "model_name": cost_per_page
    "mistral-ocr-latest": 0.001,  # 1 EUR per 1000 pages
    # Add other OCR models as needed
}

# Default OCR model to use if the specified model is not in OCR_MODEL_COSTS
DEFAULT_OCR_MODEL = "mistral-ocr-latest"

# Default model to use if the specified model is not in MODEL_COSTS
DEFAULT_MODEL = "mistral-small-latest"
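A minimal sketch of turning these tables into a dollar estimate; `estimate_llm_cost` and `estimate_embedding_cost` are hypothetical helpers that mirror the fallback behaviour the comments describe, not package APIs:

    from ragbandit.config.pricing import MODEL_COSTS, EMBEDDING_COSTS, DEFAULT_MODEL

    def estimate_llm_cost(model: str, input_tokens: int, output_tokens: int) -> float:
        # Fall back to DEFAULT_MODEL pricing when the model is not in MODEL_COSTS.
        input_rate, output_rate = MODEL_COSTS.get(model, MODEL_COSTS[DEFAULT_MODEL])
        return (input_tokens * input_rate + output_tokens * output_rate) / 1_000_000

    def estimate_embedding_cost(model: str, tokens: int) -> float:
        # Unknown embedding models are treated as free rather than guessed at.
        return tokens * EMBEDDING_COSTS.get(model, 0.0) / 1_000_000

    # e.g. estimate_llm_cost("mistral-small-latest", 10_000, 2_000) == 0.032 USD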
ragbandit/documents/__init__.py
ADDED
@@ -0,0 +1,66 @@
"""
Document processing module for handling, analyzing, and transforming documents.

This package provides tools for OCR, chunking,
embedding, and processing documents.
"""

# Import key components from subdirectories
from ragbandit.documents.document_pipeline import DocumentPipeline

# Import from chunkers
from ragbandit.documents.chunkers import (
    BaseChunker,
    FixedSizeChunker,
    SemanticChunker,
    SemanticBreak
)

# Import from processors
from ragbandit.documents.processors import (
    BaseProcessor,
    FootnoteProcessor,
    ReferencesProcessor
)

# Import from embedders
from ragbandit.documents.embedders import (
    BaseEmbedder,
    MistralEmbedder
)

# Import from OCR
from ragbandit.documents.ocr import (
    BaseOCR,
    MistralOCRDocument
)

# Import from utils
from ragbandit.documents.utils import SecureFileHandler

__all__ = [
    # Main pipeline
    "DocumentPipeline",

    # Chunkers
    "BaseChunker",
    "FixedSizeChunker",
    "SemanticChunker",
    "SemanticBreak",

    # Processors
    "BaseProcessor",
    "FootnoteProcessor",
    "ReferencesProcessor",

    # Embedders
    "BaseEmbedder",
    "MistralEmbedder",

    # OCR
    "BaseOCR",
    "MistralOCRDocument",

    # Utils
    "SecureFileHandler"
]
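Because of these re-exports, callers can pull the public document-processing classes from the sub-package root rather than the individual modules, e.g.:

    from ragbandit.documents import DocumentPipeline, FixedSizeChunker, MistralEmbedder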
ragbandit/documents/chunkers/__init__.py
ADDED
@@ -0,0 +1,18 @@
"""
Chunker implementations for document processing.

This module provides various chunking strategies for documents.
"""

from ragbandit.documents.chunkers.base_chunker import BaseChunker
from ragbandit.documents.chunkers.fixed_size_chunker import FixedSizeChunker
from ragbandit.documents.chunkers.semantic_chunker import (
    SemanticChunker, SemanticBreak
)

__all__ = [
    "BaseChunker",
    "FixedSizeChunker",
    "SemanticChunker",
    "SemanticBreak"
]
ragbandit/documents/chunkers/base_chunker.py
ADDED
@@ -0,0 +1,201 @@
# ----------------------------------------------------------------------
# Standard library
import logging
import re
from abc import ABC, abstractmethod

# Project
from ragbandit.schema import (
    ProcessingResult,
    Chunk,
    ChunkingResult,
    Image,
)
from ragbandit.utils.token_usage_tracker import TokenUsageTracker


class BaseChunker(ABC):
    """
    Base class for document chunking strategies.
    Subclasses should implement the `chunk()` method to
    provide specific chunking logic.
    """

    def __init__(self, name: str | None = None, api_key: str | None = None):
        """
        Initialize the chunker.

        Args:
            name: Optional name for the chunker
            api_key: API key for LLM services
        """
        # Hierarchical names make it easy to filter later:
        # chunker.semantic, chunker.fixed_size, etc.
        base = "chunker"
        self.logger = logging.getLogger(
            f"{base}.{name or self.__class__.__name__}"
        )
        self.api_key = api_key

    @abstractmethod
    def chunk(
        self,
        document: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ChunkingResult:
        """
        Chunk the document content from a ProcessingResult.

        Args:
            document: The ProcessingResult containing
                document content to chunk
            usage_tracker: Optional tracker for token usage during chunking

        Returns:
            A `ChunkingResult` containing a list of `Chunk` objects and
            optional metrics.
        """
        raise NotImplementedError

    def merge_small_chunks(
        self, chunks: list[Chunk], min_size: int
    ) -> list[Chunk]:
        """
        Merge small chunks with adjacent chunks to ensure minimum chunk size.

        Args:
            chunks: The chunks to process
            min_size: Minimum size for chunks (smaller chunks will be merged)

        Returns:
            Processed chunks with small chunks merged
        """
        if not chunks:
            return []

        merged = []
        i = 0
        n = len(chunks)

        while i < n:
            current_chunk = chunks[i]
            current_text = current_chunk.text

            # Check if this chunk is "small"
            if len(current_text) < min_size:
                # 1) Try to merge with the NEXT chunk if same page_index
                next_chunk_exists = (i + 1) < n
                if next_chunk_exists:
                    next_chunk_same_page = (
                        chunks[i + 1].metadata.page_index
                        == current_chunk.metadata.page_index
                    )
                else:
                    next_chunk_same_page = False

                if i < n - 1 and next_chunk_same_page:
                    # Merge current with the next chunk
                    current_chunk.text += (" " + chunks[i + 1].text)

                    # Merge images if they exist
                    if (
                        current_chunk.metadata.images
                        and chunks[i + 1].metadata.images
                    ):
                        current_chunk.metadata.images.extend(
                            chunks[i + 1].metadata.images
                        )

                    # We've used chunk i+1, so skip it
                    i += 2

                    # Now this newly merged chunk is complete; add to 'merged'
                    merged.append(current_chunk)
                else:
                    # 2) Otherwise, try to merge with
                    # PREVIOUS chunk in 'merged'
                    if merged:
                        # Merge current chunk into the last chunk in 'merged'
                        merged[-1].text += (" " + current_chunk.text)

                        # Merge images if they exist
                        if (
                            merged[-1].metadata.images
                            and current_chunk.metadata.images
                        ):
                            merged[-1].metadata.images.extend(
                                current_chunk.metadata.images
                            )
                    else:
                        # If there's no previous chunk in 'merged', just add it
                        merged.append(current_chunk)

                    i += 1
            else:
                # If it's not "small," just add it as-is
                merged.append(current_chunk)
                i += 1

        return merged

    def process_chunks(
        self, chunks: list[Chunk]
    ) -> list[Chunk]:
        """
        Optional post-processing of chunks after initial chunking.
        This can be overridden by subclasses to
        implement additional processing.

        Args:
            chunks: The initial chunks produced by the chunk method

        Returns:
            Processed chunks
        """
        return chunks

    # ------------------------------------------------------------------
    # Shared helpers
    def attach_images(
        self,
        chunks: list[Chunk],
        proc_result: ProcessingResult,
    ) -> list[Chunk]:
        """Populate each Chunk's metadata.images with inlined image data.

        Looks for ``![img-<n>.jpeg](img-<n>.jpeg)`` markers inside the chunk text
        and copies the matching `image_base64` from the corresponding page's
        images collection.
        """

        img_pattern = re.compile(r"!\[img-\d+\.jpeg\]\(img-\d+\.jpeg\)")

        for chunk in chunks:
            images_in_chunk = img_pattern.findall(chunk.text)
            if not images_in_chunk:
                # No image markers, ensure empty list and continue
                chunk.metadata.images = []
                continue

            page_idx = chunk.metadata.page_index
            rel_images = proc_result.pages[page_idx].images or []
            chunk.metadata.images = []

            for img_tag in images_in_chunk:
                img_id = img_tag.split("[")[1].split("]")[0]
                for ocr_img in rel_images:
                    if ocr_img.id == img_id:
                        chunk.metadata.images.append(
                            Image(id=img_id, image_base64=ocr_img.image_base64)
                        )
                        break

        return chunks

    def __str__(self) -> str:
        """Return a string representation of the chunker."""
        return self.__class__.__name__

    def __repr__(self) -> str:
        """Return a string representation of the chunker."""
        return f"{self.__class__.__name__}()"
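As a minimal sketch of what a concrete subclass looks like, a hypothetical `ParagraphChunker` is shown below (not part of the package), assuming the `Page.markdown` and `ChunkMetadata(page_index, images, extra)` fields used elsewhere in this diff:

    from datetime import datetime, timezone

    from ragbandit.documents.chunkers import BaseChunker
    from ragbandit.schema import Chunk, ChunkMetadata, ChunkingResult, ProcessingResult
    from ragbandit.utils.token_usage_tracker import TokenUsageTracker


    class ParagraphChunker(BaseChunker):
        """Hypothetical chunker: one chunk per blank-line-separated paragraph."""

        def chunk(
            self,
            document: ProcessingResult,
            usage_tracker: TokenUsageTracker | None = None,
        ) -> ChunkingResult:
            chunks: list[Chunk] = []
            for page_index, page in enumerate(document.pages):
                # Split each page's markdown on blank lines and keep non-empty paragraphs.
                for paragraph in page.markdown.split("\n\n"):
                    if paragraph.strip():
                        meta = ChunkMetadata(page_index=page_index, images=[], extra={})
                        chunks.append(Chunk(text=paragraph, metadata=meta))
            # Reuse the shared helper to copy inline image data into chunk metadata.
            chunks = self.attach_images(chunks, document)
            return ChunkingResult(
                processed_at=datetime.now(timezone.utc), chunks=chunks, metrics=None
            )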
ragbandit/documents/chunkers/fixed_size_chunker.py
ADDED
@@ -0,0 +1,174 @@
from datetime import datetime, timezone

from ragbandit.schema import (
    ProcessingResult,
    Chunk,
    ChunkMetadata,
    ChunkingResult,
)
from ragbandit.utils.token_usage_tracker import TokenUsageTracker
from ragbandit.documents.chunkers.base_chunker import BaseChunker


class FixedSizeChunker(BaseChunker):
    """
    A document chunker that splits documents into fixed-size chunks
    with optional overlap between chunks.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        name: str | None = None,
    ):
        """
        Initialize the fixed size chunker.

        Args:
            chunk_size: Target size for each chunk in characters
            overlap: Number of characters to overlap between chunks
            name: Optional name for the chunker
        """
        super().__init__(name)
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk(
        self,
        proc_result: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ChunkingResult:
        """
        Chunk the document into fixed-size chunks.

        Args:
            proc_result: The ProcessingResult containing
                document content to chunk
            usage_tracker: Optional tracker for token usage
                (not used in this chunker)

        Returns:
            A ChunkingResult containing Chunk objects
        """
        # 1. Generate raw chunks for each page
        chunks = self._fixed_size_chunk_pages(proc_result)

        # 2. Attach any inline images using BaseChunker helper
        chunks = self.attach_images(chunks, proc_result)

        # 3. Merge small chunks if needed
        chunks = self.process_chunks(chunks)

        # 4. Wrap in ChunkingResult
        return ChunkingResult(
            processed_at=datetime.now(timezone.utc),
            chunks=chunks,
            metrics=None,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    def _fixed_size_chunk_pages(
        self, proc_result: ProcessingResult
    ) -> list[Chunk]:
        """Split each page into fixed-size chunks with optional overlap."""

        self.logger.info(
            f"Starting fixed-size chunking with size={self.chunk_size}, "
            f"overlap={self.overlap}"
        )

        chunks: list[Chunk] = []

        # Process each page
        for page_index, page in enumerate(proc_result.pages):
            page_text = page.markdown

            # Skip empty pages
            if not page_text.strip():
                continue

            # Create chunks from this page
            start = 0
            while start < len(page_text):
                # Determine end position for this chunk
                end = min(start + self.chunk_size, len(page_text))

                # If we're not at the end of the text,
                # try to find a good break point
                if end < len(page_text):
                    # Look for a period, question mark, or exclamation mark
                    # followed by whitespace
                    # within the last 100 characters of the chunk
                    search_start = max(end - 100, start)
                    for i in range(end, search_start, -1):
                        # Check if we're at a valid position to examine
                        if i <= 0 or i >= len(page_text):
                            continue

                        # Check if the previous character is punctuation
                        # and the current character is whitespace
                        if (
                            page_text[i - 1] in [".", "!", "?"]
                            and page_text[i].isspace()
                        ):
                            end = i
                            break

                # Create the chunk
                chunk_text = page_text[start:end]
                meta = ChunkMetadata(
                    page_index=page_index, images=[], extra={}
                )
                chunks.append(Chunk(text=chunk_text, metadata=meta))

                # Check if we've reached the end of the page text
                if end >= len(page_text):
                    # We've processed the entire page, exit the loop
                    break

                # Move to next chunk start position, accounting for overlap
                start = end - self.overlap

                # Make sure we're making progress
                if start <= 0 or start >= len(page_text):
                    break

        self.logger.info(
            f"Fixed-size chunking complete. Created {len(chunks)} chunks."
        )

        return chunks

    def process_chunks(
        self, chunks: list[Chunk]
    ) -> list[Chunk]:
        """
        Process chunks after initial chunking - merge small chunks if needed.

        Args:
            chunks: The initial chunks produced by the chunk method

        Returns:
            Processed chunks with small chunks merged if needed
        """
        if not chunks:
            return chunks

        # Calculate minimum chunk size as a fraction of the target chunk size
        min_chunk_size = self.chunk_size // 2

        # Check if any chunks are too small
        min_len = min(len(c.text) for c in chunks)

        # Merge small chunks if needed
        if min_len < min_chunk_size:
            self.logger.info(
                f"Found chunks smaller than {min_chunk_size} characters. "
                "Merging..."
            )
            chunks = self.merge_small_chunks(chunks, min_size=min_chunk_size)
            self.logger.info(f"After merging: {len(chunks)} chunks")

        return chunks
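A minimal usage sketch, assuming a `ProcessingResult` produced by an earlier OCR/processing step (its construction is not shown here):

    from ragbandit.documents.chunkers import FixedSizeChunker

    # `proc_result` is assumed to be a ragbandit.schema.ProcessingResult
    # returned by an upstream OCR or document-processing stage.
    chunker = FixedSizeChunker(chunk_size=800, overlap=100)
    result = chunker.chunk(proc_result)

    for chunk in result.chunks:
        print(chunk.metadata.page_index, len(chunk.text), len(chunk.metadata.images))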