ragbandit-core 0.1.1 (ragbandit_core-0.1.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbandit/__init__.py +26 -0
- ragbandit/config/__init__.py +3 -0
- ragbandit/config/llms.py +34 -0
- ragbandit/config/pricing.py +38 -0
- ragbandit/documents/__init__.py +66 -0
- ragbandit/documents/chunkers/__init__.py +18 -0
- ragbandit/documents/chunkers/base_chunker.py +201 -0
- ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
- ragbandit/documents/chunkers/semantic_chunker.py +205 -0
- ragbandit/documents/document_pipeline.py +350 -0
- ragbandit/documents/embedders/__init__.py +14 -0
- ragbandit/documents/embedders/base_embedder.py +82 -0
- ragbandit/documents/embedders/mistral_embedder.py +129 -0
- ragbandit/documents/ocr/__init__.py +13 -0
- ragbandit/documents/ocr/base_ocr.py +136 -0
- ragbandit/documents/ocr/mistral_ocr.py +147 -0
- ragbandit/documents/processors/__init__.py +16 -0
- ragbandit/documents/processors/base_processor.py +88 -0
- ragbandit/documents/processors/footnotes_processor.py +353 -0
- ragbandit/documents/processors/references_processor.py +408 -0
- ragbandit/documents/utils/__init__.py +11 -0
- ragbandit/documents/utils/secure_file_handler.py +95 -0
- ragbandit/prompt_tools/__init__.py +27 -0
- ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
- ragbandit/prompt_tools/prompt_tool.py +118 -0
- ragbandit/prompt_tools/references_processor_tools.py +31 -0
- ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
- ragbandit/schema.py +206 -0
- ragbandit/utils/__init__.py +19 -0
- ragbandit/utils/in_memory_log_handler.py +33 -0
- ragbandit/utils/llm_utils.py +188 -0
- ragbandit/utils/mistral_client.py +76 -0
- ragbandit/utils/token_usage_tracker.py +220 -0
- ragbandit_core-0.1.1.dist-info/METADATA +145 -0
- ragbandit_core-0.1.1.dist-info/RECORD +38 -0
- ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
- ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
- ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0

ragbandit/documents/chunkers/semantic_chunker.py
@@ -0,0 +1,205 @@
from datetime import datetime, timezone
from pydantic import BaseModel
from ragbandit.schema import (
    ProcessingResult,
    Chunk,
    ChunkMetadata,
    ChunkingResult,
)
from ragbandit.utils.token_usage_tracker import TokenUsageTracker

from ragbandit.prompt_tools.semantic_chunker_tools import (
    find_semantic_break_tool,
)

from ragbandit.documents.chunkers.base_chunker import BaseChunker


class SemanticBreak(BaseModel):
    semantic_break: str


class SemanticChunker(BaseChunker):
    """
    A document chunker that uses semantic understanding to split documents
    into coherent chunks based on content.
    """

    def __init__(
        self,
        min_chunk_size: int = 500,
        name: str | None = None,
        api_key: str | None = None
    ):
        """
        Initialize the semantic chunker.

        Args:
            min_chunk_size: Minimum size for chunks
                (smaller chunks will be merged)
            name: Optional name for the chunker
            api_key: Mistral API key
        """
        super().__init__(name, api_key)
        self.min_chunk_size = min_chunk_size

    def semantic_chunk_pages(
        self, pages: list, usage_tracker: TokenUsageTracker | None = None
    ) -> list[Chunk]:
        """
        Chunk pages semantically using LLM-based semantic breaks.

        Args:
            pages: List of page objects with markdown content
            usage_tracker: Optional tracker for token usage

        Returns:
            A list of Chunk objects
        """
        i = 0
        full_text = pages[i].markdown
        chunks: list[Chunk] = []

        while i < len(pages):
            # If we have "remainder" from the last iteration,
            # it might be appended here
            break_lead = find_semantic_break_tool(
                api_key=self.api_key,
                text=full_text,
                usage_tracker=usage_tracker
            )

            if break_lead == "NO_BREAK":
                # This means the LLM found no break;
                # treat the entire `full_text` as one chunk
                meta = ChunkMetadata(page_index=i, images=[], extra={})
                chunks.append(Chunk(text=full_text, metadata=meta))
                # Move to the next page
                i += 1
                if i < len(pages):
                    full_text = pages[i].markdown
                else:
                    break
            else:
                # Attempt to find the snippet in the text
                idx = full_text.find(break_lead)

                # If exact match fails, try progressively shorter versions
                if idx == -1 and len(break_lead) > 0:
                    current_break_lead = break_lead
                    min_length = 10  # Minimum characters to try matching

                    # Try progressively shorter versions
                    # of the break_lead until we find a match
                    # or reach the minimum length
                    while idx == -1 and len(current_break_lead) >= min_length:
                        # Cut the break_lead in half and try again
                        current_break_lead = current_break_lead[
                            : len(current_break_lead) // 2
                        ]
                        idx = full_text.find(current_break_lead)

                if idx == -1:
                    # If we still can't find the snippet after
                    # trying shorter versions,
                    # fall back to chunking everything as is
                    meta = ChunkMetadata(page_index=i, images=[], extra={})
                    chunks.append(Chunk(text=full_text, metadata=meta))
                    i += 1
                    if i < len(pages):
                        full_text = pages[i].markdown
                    else:
                        break
                else:
                    # We found a break
                    chunk_text = full_text[:idx]
                    remainder = full_text[idx:]
                    meta = ChunkMetadata(page_index=i, images=[], extra={})
                    chunks.append(Chunk(text=chunk_text, metadata=meta))

                    # Now we see if the remainder is too small
                    if len(remainder) < 1500:  # ~some threshold
                        i += 1
                        if i < len(pages):
                            # Combine remainder with next page
                            remainder += "\n" + pages[i].markdown
                    # remainder becomes the new full_text
                    full_text = remainder

            # If we used up the last page, break
            if i >= len(pages):
                # Possibly chunk the remainder if it's not empty
                if len(full_text.strip()) > 0:
                    meta = ChunkMetadata(
                        page_index=min(i, len(pages) - 1),
                        images=[],
                        extra={},
                    )
                    chunks.append(Chunk(text=full_text, metadata=meta))
                break

        return chunks

    def chunk(
        self,
        proc_result: ProcessingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> ChunkingResult:
        """
        Chunk the document using semantic chunking.

        Args:
            proc_result: The ProcessingResult containing
                document content to chunk
            usage_tracker: Tracker for token usage during chunking

        Returns:
            A ChunkingResult containing the chunks and usage metrics
        """
        self.logger.info("Starting semantic chunking")

        # Get the pages from the response
        pages = proc_result.pages

        # Perform semantic chunking
        chunks = self.semantic_chunk_pages(pages, usage_tracker)

        # Attach image data to chunks using shared helper
        chunks = self.attach_images(chunks, proc_result)

        # Merge small chunks if needed
        chunks = self.process_chunks(chunks)

        return ChunkingResult(
            processed_at=datetime.now(timezone.utc),
            chunks=chunks,
            metrics=usage_tracker.get_summary() if usage_tracker else None,
        )

    def process_chunks(
        self, chunks: list[Chunk]
    ) -> list[Chunk]:
        """
        Process chunks after initial chunking: merge small chunks.

        Args:
            chunks: The initial chunks produced by the chunk method

        Returns:
            Processed chunks with small chunks merged
        """
        # Check if any chunks are too small
        min_len = min([len(c.text) for c in chunks]) if chunks else 0

        # Merge small chunks if needed
        if min_len < self.min_chunk_size:
            self.logger.info(
                f"Found chunks smaller than {self.min_chunk_size} characters. "
                "Merging..."
            )
            chunks = self.merge_small_chunks(
                chunks, min_size=self.min_chunk_size
            )
            self.logger.info(f"After merging: {len(chunks)} chunks")

        return chunks
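
The hunk above defines the public entry point chunk() plus the page-level helper semantic_chunk_pages(). The following is a minimal usage sketch, not code shipped in the wheel: it assumes a ProcessingResult named proc_result is already available from an earlier OCR/processing step and that MISTRAL_API_KEY is set in the environment.

import os

from ragbandit.documents.chunkers.semantic_chunker import SemanticChunker
from ragbandit.utils.token_usage_tracker import TokenUsageTracker

# proc_result: ProcessingResult produced by an earlier pipeline step (assumed here)
chunker = SemanticChunker(
    min_chunk_size=500,
    api_key=os.environ["MISTRAL_API_KEY"],
)
tracker = TokenUsageTracker()

# chunk() returns a ChunkingResult with `chunks` and, when a tracker is passed, `metrics`
chunk_result = chunker.chunk(proc_result, usage_tracker=tracker)
for chunk in chunk_result.chunks:
    print(chunk.metadata.page_index, len(chunk.text))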

ragbandit/documents/document_pipeline.py
@@ -0,0 +1,350 @@
"""
Document processing pipeline that orchestrates multiple document processors.

This module provides the main DocumentPipeline class that manages the execution
of document processors in sequence, chunking, and embedding.
"""

import logging
import traceback
from datetime import datetime, timezone
from typing import Callable
import time
from dataclasses import dataclass
from ragbandit.schema import (
    OCRResult,
    ProcessingResult,
    ChunkingResult,
    EmbeddingResult,
    DocumentPipelineResult,
    TimingMetrics,
    StepReport,
    StepStatus,
)

from ragbandit.documents.ocr import BaseOCR
from ragbandit.documents.processors.base_processor import BaseProcessor
from ragbandit.documents.chunkers.base_chunker import BaseChunker
from ragbandit.documents.embedders.base_embedder import BaseEmbedder
from ragbandit.utils.token_usage_tracker import TokenUsageTracker

from ragbandit.utils.in_memory_log_handler import InMemoryLogHandler


class DocumentPipeline:
    """Pipeline for processing documents through a
    sequence of document processors, chunkers, and embedders.

    The pipeline manages the execution of document processors in sequence,
    where each processor receives the output of the previous processor.
    The pipeline also tracks token usage and costs for each document.
    """

    @dataclass
    class _PipelineStep:
        key: str  # "ocr" | "processing" | "chunking" | "embedding"
        run: Callable[[], object]
        on_success: Callable[[object], None]

    def __init__(
        self,
        ocr_processor: BaseOCR | None = None,
        processors: list[BaseProcessor] | None = None,
        chunker: BaseChunker | None = None,
        embedder: BaseEmbedder | None = None,
        logger: logging.Logger | None = None,
    ):
        """Initialize a new document processing pipeline.

        All components are optional to allow running
        individual steps independently.
        For full pipeline execution via process(),
        all components must be provided.

        Args:
            ocr_processor: OCR processor to use (required for run_ocr
                and process)
            processors: List of document processors to execute in
                sequence
            chunker: Chunker to use for document chunking (required
                for run_chunker and process)
            embedder: Embedder to use for chunk embedding (required
                for run_embedder and process)
            logger: Optional logger for pipeline events
        """
        self.ocr_processor = ocr_processor
        self.processors = processors or []
        self.chunker = chunker
        self.embedder = embedder

        # Set up logging with more explicit configuration
        self.logger = logger or logging.getLogger(__name__)

        self._transcript = InMemoryLogHandler(level=logging.DEBUG)
        root_logger = logging.getLogger()
        root_logger.addHandler(self._transcript)

        # Ensure we're generating logs
        self.logger.info("DocumentPipeline initialized")

    def add_processor(self, processor: BaseProcessor) -> None:
        """Add a processor to the pipeline.

        Args:
            processor: The document processor to add
        """
        self.processors.append(processor)
        self.logger.info(f"Added processor: {processor}")

    def _fresh_buffer(self):
        self._transcript.clear()
        # Ensure the handler is still attached
        root_logger = logging.getLogger()
        if self._transcript not in root_logger.handlers:
            root_logger.addHandler(self._transcript)

    def run_ocr(self, pdf_filepath: str) -> OCRResult:
        """Perform OCR on a PDF file using the configured OCR processor.

        Args:
            pdf_filepath: Path to the PDF file to process

        Returns:
            OCRResult: The OCR result from the processor

        Raises:
            ValueError: If ocr_processor is not configured
        """
        if not self.ocr_processor:
            raise ValueError("ocr_processor is required for OCR operation")
        return self.ocr_processor.process(pdf_filepath)

    def run_processors(
        self,
        ocr_result: OCRResult,
    ) -> list[ProcessingResult]:
        """Process a document through the processors pipeline.

        Args:
            ocr_result: The initial OCR result to process

        Returns:
            A list of ProcessingResult with additional metadata
            from all processors
        """
        processing_results: list[ProcessingResult] = []

        # Start the processor chain with the raw OCRResult; each processor
        # is responsible for converting it to ProcessingResult if needed.
        prev_result = ocr_result

        # Process the document through each processor in sequence
        for processor in self.processors:
            self.logger.info(f"Running processor: {processor}")

            # Give each processor its own usage tracker
            proc_usage = TokenUsageTracker()

            start_processing = time.perf_counter()
            proc_result = processor.process(prev_result, proc_usage)
            end_processing = time.perf_counter()

            # Attach token usage summary to metrics
            proc_result.metrics = proc_usage.get_summary()
            proc_duration = end_processing - start_processing
            proc_result.processing_duration = proc_duration

            processing_results.append(proc_result)
            prev_result = proc_result
            self.logger.info(f"{processor} completed successfully")

        return processing_results

    def run_chunker(
        self,
        doc: ProcessingResult | OCRResult,
    ) -> ChunkingResult:
        """Chunk the document using the configured chunker.

        Args:
            doc: The ProcessingResult or OCRResult to chunk

        Returns:
            A ChunkingResult object

        Raises:
            ValueError: If chunker is not configured
        """
        if not self.chunker:
            raise ValueError("chunker is required for chunking operation")
        proc_result = (
            doc
            if isinstance(doc, ProcessingResult)
            else BaseProcessor.ensure_processing_result(doc)
        )
        usage_tracker = TokenUsageTracker()
        # Generate chunks via chunker -> returns ChunkingResult
        chunk_result = self.chunker.chunk(proc_result, usage_tracker)
        return chunk_result

    def run_embedder(
        self,
        chunk_result: ChunkingResult,
    ) -> EmbeddingResult:
        """Embed chunks using the configured embedder.

        Args:
            chunk_result: The ChunkingResult to embed

        Returns:
            An EmbeddingResult containing embeddings for each chunk

        Raises:
            ValueError: If embedder is not configured
        """
        if not self.embedder:
            raise ValueError("embedder is required for embedding operation")
        usage_tracker = TokenUsageTracker()
        embedding_result = self.embedder.embed_chunks(
            chunk_result, usage_tracker
        )
        return embedding_result

    def _run_step(
        self,
        step: _PipelineStep,
        dpr: DocumentPipelineResult,
        start_total: float,
    ) -> tuple[bool, object | None]:
        key = step.key  # e.g. "ocr"
        self.logger.info(f"Starting {key} step…")
        start = time.perf_counter()
        try:
            result = step.run()
            setattr(dpr.step_report, key, StepStatus.success)
            setattr(dpr.timings, key, time.perf_counter() - start)
            step.on_success(result)
            self.logger.info(f"Step {key} completed")
            return True, result
        except Exception as exc:
            tb = traceback.format_exc()
            self.logger.error(f"Step {key} failed: {exc}\n{tb}")
            setattr(dpr.step_report, key, StepStatus.failed)
            setattr(dpr.timings, key, time.perf_counter() - start)
            dpr.timings.total_duration = time.perf_counter() - start_total
            return False, None

    def process(
        self,
        pdf_filepath: str
    ) -> DocumentPipelineResult:
        """Run the configured pipeline steps in order.

        Raises:
            ValueError: If any required component is not configured
        """
        # Validate all components are present for full pipeline execution
        if not self.ocr_processor:
            raise ValueError(
                "ocr_processor is required for full pipeline execution"
            )
        if not self.chunker:
            raise ValueError(
                "chunker is required for full pipeline execution"
            )
        if not self.embedder:
            raise ValueError(
                "embedder is required for full pipeline execution"
            )

        start_total = time.perf_counter()
        dpr = DocumentPipelineResult(
            source_file_path=pdf_filepath,
            processed_at=datetime.now(timezone.utc),
            pipeline_config={
                "ocr": str(self.ocr_processor),
                "processors": [str(p) for p in self.processors],
                "chunker": str(self.chunker),
                "embedder": str(self.embedder),
            },
            timings=TimingMetrics(),
            total_metrics=[],
            step_report=StepReport(),
        )

        # ---------------- helpers ----------------
        def _on_success(attr):
            def handler(res):
                # 1. Set the result (save the step result to DPR)
                setattr(dpr, attr, res)
                # 2. Set the metrics of the result to total metrics
                #    - res may be a single result or a list of results
                if isinstance(res, list):
                    dpr.total_metrics.extend(
                        [r.metrics for r in res if r.metrics]
                    )
                else:
                    if isinstance(res.metrics, list):
                        dpr.total_metrics.extend(res.metrics or [])
                    else:
                        dpr.total_metrics.append(res.metrics)
            return handler

        # placeholders for passing results between steps
        ocr_res: OCRResult | None = None
        proc_results: list[ProcessingResult] | None = None
        chunk_res: ChunkingResult | None = None

        # ---------------- step table ----------------
        steps = [
            self._PipelineStep(
                "ocr",
                lambda: self.run_ocr(pdf_filepath),
                _on_success("ocr_result"),
            ),
            self._PipelineStep(
                "processing",
                lambda: self.run_processors(ocr_res),
                _on_success("processing_results"),
            ),
            self._PipelineStep(
                "chunking",
                lambda: self.run_chunker(
                    proc_results[-1] if proc_results else ocr_res
                ),
                _on_success("chunking_result"),
            ),
            self._PipelineStep(
                "embedding",
                lambda: self.run_embedder(chunk_res),
                _on_success("embedding_result"),
            ),
        ]

        try:
            for st in steps:
                ok, res = self._run_step(st, dpr, start_total)
                if not ok:
                    return dpr

                # propagate outputs for later steps
                if st.key == "ocr":
                    ocr_res = res  # type: ignore
                elif st.key == "processing":
                    proc_results = res  # type: ignore
                elif st.key == "chunking":
                    chunk_res = res  # type: ignore

            # aggregate total cost once
            dpr.total_cost_usd = sum(
                m.total_cost_usd  # type: ignore[attr-defined]
                for m in dpr.total_metrics
                if m and getattr(m, "total_cost_usd", None) is not None
            )

            dpr.timings.total_duration = time.perf_counter() - start_total
            self.logger.info("Document processing completed.")
            return dpr
        finally:
            dpr.logs = self._transcript.dump()
            logging.getLogger().removeHandler(self._transcript)
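
A usage sketch for the full pipeline follows, again illustrative rather than part of the package: the Mistral OCR and embedder class names and constructor signatures are assumptions (only MistralEmbedder is confirmed by the embedders __init__ below), and "paper.pdf" is a placeholder path.

import os

from ragbandit.documents.document_pipeline import DocumentPipeline
from ragbandit.documents.chunkers.semantic_chunker import SemanticChunker
from ragbandit.documents.embedders import MistralEmbedder
from ragbandit.documents.ocr.mistral_ocr import MistralOCR  # class name assumed

api_key = os.environ["MISTRAL_API_KEY"]
pipeline = DocumentPipeline(
    ocr_processor=MistralOCR(api_key=api_key),    # constructor signature assumed
    processors=[],                                # e.g. footnote/reference processors
    chunker=SemanticChunker(api_key=api_key),
    embedder=MistralEmbedder(api_key=api_key),    # constructor signature assumed
)

# process() runs ocr -> processing -> chunking -> embedding and returns a
# DocumentPipelineResult with per-step statuses, timings, logs, and total cost.
result = pipeline.process("paper.pdf")
print(result.step_report, result.timings.total_duration, result.total_cost_usd)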

ragbandit/documents/embedders/__init__.py
@@ -0,0 +1,14 @@
"""
Document embedders for generating vector representations of document chunks.

This module provides embedders that convert document chunks
into vector embeddings for semantic search and similarity comparison.
"""

from ragbandit.documents.embedders.base_embedder import BaseEmbedder
from ragbandit.documents.embedders.mistral_embedder import MistralEmbedder

__all__ = [
    "BaseEmbedder",
    "MistralEmbedder"
]

ragbandit/documents/embedders/base_embedder.py
@@ -0,0 +1,82 @@
from abc import ABC, abstractmethod

# Third-party
import numpy as np

# Project
from ragbandit.schema import (
    ChunkingResult,
    EmbeddingResult,
)
from ragbandit.utils.token_usage_tracker import TokenUsageTracker


class BaseEmbedder(ABC):
    """
    Abstract base class for document embedders.

    This class defines the interface for embedding document chunks.
    Concrete implementations should handle the specifics of
    generating embeddings using different models or providers.
    """

    def __init__(self, name: str | None = None):
        """
        Initialize the document embedder.

        Args:
            name: Optional name for the embedder
        """
        self.name = name or self.__class__.__name__

        # Set up logging
        import logging
        self.logger = logging.getLogger(f"{self.__class__.__name__}")

    @abstractmethod
    def embed_chunks(
        self,
        chunk_result: ChunkingResult,
        usage_tracker: TokenUsageTracker | None = None,
    ) -> EmbeddingResult:
        """
        Generate embeddings for a ChunkingResult.

        Args:
            chunk_result: The ChunkingResult whose chunks will be embedded
            usage_tracker: Optional tracker for token usage

        Returns:
            An EmbeddingResult containing embedded chunks
        """
        raise NotImplementedError

    def cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate cosine similarity between two embedding vectors.

        Args:
            a: First embedding vector
            b: Second embedding vector

        Returns:
            Cosine similarity (1 = identical, 0 = orthogonal, -1 = opposite)
        """
        return (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate cosine distance between two embedding vectors.

        Args:
            a: First embedding vector
            b: Second embedding vector

        Returns:
            Cosine distance (0 = identical, 2 = opposite)
        """
        return 1 - self.cosine_similarity(a, b)

    def __repr__(self) -> str:
        """Return string representation of the embedder."""
        return f"{self.__class__.__name__}"