ragbandit_core-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. ragbandit/__init__.py +26 -0
  2. ragbandit/config/__init__.py +3 -0
  3. ragbandit/config/llms.py +34 -0
  4. ragbandit/config/pricing.py +38 -0
  5. ragbandit/documents/__init__.py +66 -0
  6. ragbandit/documents/chunkers/__init__.py +18 -0
  7. ragbandit/documents/chunkers/base_chunker.py +201 -0
  8. ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
  9. ragbandit/documents/chunkers/semantic_chunker.py +205 -0
  10. ragbandit/documents/document_pipeline.py +350 -0
  11. ragbandit/documents/embedders/__init__.py +14 -0
  12. ragbandit/documents/embedders/base_embedder.py +82 -0
  13. ragbandit/documents/embedders/mistral_embedder.py +129 -0
  14. ragbandit/documents/ocr/__init__.py +13 -0
  15. ragbandit/documents/ocr/base_ocr.py +136 -0
  16. ragbandit/documents/ocr/mistral_ocr.py +147 -0
  17. ragbandit/documents/processors/__init__.py +16 -0
  18. ragbandit/documents/processors/base_processor.py +88 -0
  19. ragbandit/documents/processors/footnotes_processor.py +353 -0
  20. ragbandit/documents/processors/references_processor.py +408 -0
  21. ragbandit/documents/utils/__init__.py +11 -0
  22. ragbandit/documents/utils/secure_file_handler.py +95 -0
  23. ragbandit/prompt_tools/__init__.py +27 -0
  24. ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
  25. ragbandit/prompt_tools/prompt_tool.py +118 -0
  26. ragbandit/prompt_tools/references_processor_tools.py +31 -0
  27. ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
  28. ragbandit/schema.py +206 -0
  29. ragbandit/utils/__init__.py +19 -0
  30. ragbandit/utils/in_memory_log_handler.py +33 -0
  31. ragbandit/utils/llm_utils.py +188 -0
  32. ragbandit/utils/mistral_client.py +76 -0
  33. ragbandit/utils/token_usage_tracker.py +220 -0
  34. ragbandit_core-0.1.1.dist-info/METADATA +145 -0
  35. ragbandit_core-0.1.1.dist-info/RECORD +38 -0
  36. ragbandit_core-0.1.1.dist-info/WHEEL +5 -0
  37. ragbandit_core-0.1.1.dist-info/licenses/LICENSE.md +9 -0
  38. ragbandit_core-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,205 @@
+ from datetime import datetime, timezone
+ from pydantic import BaseModel
+ from ragbandit.schema import (
+     ProcessingResult,
+     Chunk,
+     ChunkMetadata,
+     ChunkingResult,
+ )
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+ from ragbandit.prompt_tools.semantic_chunker_tools import (
+     find_semantic_break_tool,
+ )
+
+ from ragbandit.documents.chunkers.base_chunker import BaseChunker
+
+
+ class SemanticBreak(BaseModel):
+     semantic_break: str
+
+
+ class SemanticChunker(BaseChunker):
+     """
+     A document chunker that uses semantic understanding to split documents
+     into coherent chunks based on content.
+     """
+
+     def __init__(
+         self,
+         min_chunk_size: int = 500,
+         name: str | None = None,
+         api_key: str | None = None
+     ):
+         """
+         Initialize the semantic chunker.
+
+         Args:
+             min_chunk_size: Minimum size for chunks
+                 (smaller chunks will be merged)
+             name: Optional name for the chunker
+             api_key: Mistral API key
+         """
+         super().__init__(name, api_key)
+         self.min_chunk_size = min_chunk_size
+
+     def semantic_chunk_pages(
+         self, pages: list, usage_tracker: TokenUsageTracker | None = None
+     ) -> list[Chunk]:
+         """
+         Chunk pages semantically using LLM-based semantic breaks.
+
+         Args:
+             pages: List of page objects with markdown content
+             usage_tracker: Optional tracker for token usage
+
+         Returns:
+             A list of Chunk objects
+         """
+         chunks: list[Chunk] = []
+         if not pages:
+             # Nothing to chunk
+             return chunks
+
+         i = 0
+         full_text = pages[i].markdown
+
+         while i < len(pages):
+             # If we have "remainder" from the last iteration,
+             # it might be appended here
+             break_lead = find_semantic_break_tool(
+                 api_key=self.api_key,
+                 text=full_text,
+                 usage_tracker=usage_tracker
+             )
+
+             if break_lead == "NO_BREAK":
+                 # The LLM found no break;
+                 # treat the entire `full_text` as one chunk
+                 meta = ChunkMetadata(page_index=i, images=[], extra={})
+                 chunks.append(Chunk(text=full_text, metadata=meta))
+                 # Move to the next page
+                 i += 1
+                 if i < len(pages):
+                     full_text = pages[i].markdown
+                 else:
+                     break
+             else:
+                 # Attempt to find the snippet in the text
+                 idx = full_text.find(break_lead)
+
+                 # If exact match fails, try progressively shorter versions
+                 if idx == -1 and len(break_lead) > 0:
+                     current_break_lead = break_lead
+                     min_length = 10  # Minimum characters to try matching
+
+                     # Try progressively shorter versions
+                     # of the break_lead until we find a match
+                     # or reach the minimum length
+                     while idx == -1 and len(current_break_lead) >= min_length:
+                         # Cut the break_lead in half and try again
+                         current_break_lead = current_break_lead[
+                             : len(current_break_lead) // 2
+                         ]
+                         idx = full_text.find(current_break_lead)
+
+                 if idx == -1:
+                     # If we still can't find the snippet after
+                     # trying shorter versions,
+                     # fall back to chunking everything as is
+                     meta = ChunkMetadata(page_index=i, images=[], extra={})
+                     chunks.append(Chunk(text=full_text, metadata=meta))
+                     i += 1
+                     if i < len(pages):
+                         full_text = pages[i].markdown
+                     else:
+                         break
+                 else:
+                     # We found a break
+                     chunk_text = full_text[:idx]
+                     remainder = full_text[idx:]
+                     meta = ChunkMetadata(page_index=i, images=[], extra={})
+                     chunks.append(Chunk(text=chunk_text, metadata=meta))
+
+                     # Now see if the remainder is too small
+                     if len(remainder) < 1500:  # ~some threshold
+                         i += 1
+                         if i < len(pages):
+                             # Combine remainder with next page
+                             remainder += "\n" + pages[i].markdown
+
+                     # The remainder becomes the new full_text
+                     full_text = remainder
+
+                     # If we used up the last page, break
+                     if i >= len(pages):
+                         # Possibly chunk the remainder if it's not empty
+                         if len(full_text.strip()) > 0:
+                             meta = ChunkMetadata(
+                                 page_index=min(i, len(pages) - 1),
+                                 images=[],
+                                 extra={},
+                             )
+                             chunks.append(Chunk(text=full_text, metadata=meta))
+                         break
+
+         return chunks
+
+     def chunk(
+         self,
+         proc_result: ProcessingResult,
+         usage_tracker: TokenUsageTracker | None = None,
+     ) -> ChunkingResult:
+         """
+         Chunk the document using semantic chunking.
+
+         Args:
+             proc_result: The ProcessingResult containing
+                 document content to chunk
+             usage_tracker: Tracker for token usage during chunking
+
+         Returns:
+             A ChunkingResult containing the generated chunks
+         """
+         self.logger.info("Starting semantic chunking")
+
+         # Get the pages from the response
+         pages = proc_result.pages
+
+         # Perform semantic chunking
+         chunks = self.semantic_chunk_pages(pages, usage_tracker)
+
+         # Attach image data to chunks using shared helper
+         chunks = self.attach_images(chunks, proc_result)
+
+         # Merge small chunks if needed
+         chunks = self.process_chunks(chunks)
+
+         return ChunkingResult(
+             processed_at=datetime.now(timezone.utc),
+             chunks=chunks,
+             metrics=usage_tracker.get_summary() if usage_tracker else None,
+         )
+
+     def process_chunks(
+         self, chunks: list[Chunk]
+     ) -> list[Chunk]:
+         """
+         Process chunks after initial chunking - merge small chunks.
+
+         Args:
+             chunks: The initial chunks produced by the chunk method
+
+         Returns:
+             Processed chunks with small chunks merged
+         """
+         # Check if any chunks are too small
+         min_len = min([len(c.text) for c in chunks]) if chunks else 0
+
+         # Merge small chunks if needed
+         if min_len < self.min_chunk_size:
+             self.logger.info(
+                 f"Found chunks smaller than {self.min_chunk_size} characters. "
+                 "Merging..."
+             )
+             chunks = self.merge_small_chunks(
+                 chunks, min_size=self.min_chunk_size
+             )
+             self.logger.info(f"After merging: {len(chunks)} chunks")
+
+         return chunks
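
The progressive snippet-shortening fallback in semantic_chunk_pages is the part most easily misread, so here is a minimal standalone sketch of just that matching step (the helper name find_break_index is hypothetical and not part of the package):

    def find_break_index(full_text: str, break_lead: str, min_length: int = 10) -> int:
        """Locate break_lead in full_text, halving the snippet until it matches
        or drops below min_length characters; return -1 if no match is found."""
        idx = full_text.find(break_lead)
        current = break_lead
        while idx == -1 and len(current) >= min_length:
            # Cut the candidate snippet in half and retry, mirroring the
            # fallback loop in SemanticChunker.semantic_chunk_pages
            current = current[: len(current) // 2]
            idx = full_text.find(current)
        return idx

    text = "Intro paragraph.\n\n## Methods\nWe describe the setup."
    # The LLM-returned lead has a tail that does not match exactly,
    # but the halved prefix still resolves to index 18.
    print(find_break_index(text, "## Methods\nWe describe everything here"))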
@@ -0,0 +1,350 @@
+ """
+ Document processing pipeline that orchestrates multiple document processors.
+
+ This module provides the main DocumentPipeline class that manages the execution
+ of document processors in sequence, chunking, and embedding.
+ """
+
+ import logging
+ import traceback
+ from datetime import datetime, timezone
+ from typing import Callable
+ import time
+ from dataclasses import dataclass
+ from ragbandit.schema import (
+     OCRResult,
+     ProcessingResult,
+     ChunkingResult,
+     EmbeddingResult,
+     DocumentPipelineResult,
+     TimingMetrics,
+     StepReport,
+     StepStatus,
+ )
+
+ from ragbandit.documents.ocr import BaseOCR
+ from ragbandit.documents.processors.base_processor import BaseProcessor
+ from ragbandit.documents.chunkers.base_chunker import BaseChunker
+ from ragbandit.documents.embedders.base_embedder import BaseEmbedder
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+ from ragbandit.utils.in_memory_log_handler import InMemoryLogHandler
+
+
+ class DocumentPipeline:
+     """Pipeline for processing documents through a
+     sequence of document processors, chunkers, and embedders.
+
+     The pipeline manages the execution of document processors in sequence,
+     where each processor receives the output of the previous processor.
+     The pipeline also tracks token usage and costs for each document.
+     """
+
+     @dataclass
+     class _PipelineStep:
+         key: str  # "ocr" | "processing" | "chunking" | "embedding"
+         run: Callable[[], object]
+         on_success: Callable[[object], None]
+
+     def __init__(
+         self,
+         ocr_processor: BaseOCR | None = None,
+         processors: list[BaseProcessor] | None = None,
+         chunker: BaseChunker | None = None,
+         embedder: BaseEmbedder | None = None,
+         logger: logging.Logger | None = None,
+     ):
+         """Initialize a new document processing pipeline.
+
+         All components are optional to allow running
+         individual steps independently.
+         For full pipeline execution via process(),
+         all components must be provided.
+
+         Args:
+             ocr_processor: OCR processor to use (required for run_ocr
+                 and process)
+             processors: List of document processors to execute in
+                 sequence
+             chunker: Chunker to use for document chunking (required
+                 for run_chunker and process)
+             embedder: Embedder to use for chunk embedding (required
+                 for run_embedder and process)
+             logger: Optional logger for pipeline events
+         """
+         self.ocr_processor = ocr_processor
+         self.processors = processors or []
+         self.chunker = chunker
+         self.embedder = embedder
+
+         # Set up logging with more explicit configuration
+         self.logger = logger or logging.getLogger(__name__)
+
+         self._transcript = InMemoryLogHandler(level=logging.DEBUG)
+         root_logger = logging.getLogger()
+         root_logger.addHandler(self._transcript)
+
+         # Ensure we're generating logs
+         self.logger.info("DocumentPipeline initialized")
+
+     def add_processor(self, processor: BaseProcessor) -> None:
+         """Add a processor to the pipeline.
+
+         Args:
+             processor: The document processor to add
+         """
+         self.processors.append(processor)
+         self.logger.info(f"Added processor: {processor}")
+
+     def _fresh_buffer(self):
+         self._transcript.clear()
+         # Ensure the handler is still attached
+         root_logger = logging.getLogger()
+         if self._transcript not in root_logger.handlers:
+             root_logger.addHandler(self._transcript)
+
+     def run_ocr(self, pdf_filepath: str) -> OCRResult:
+         """Perform OCR on a PDF file using the configured OCR processor.
+
+         Args:
+             pdf_filepath: Path to the PDF file to process
+
+         Returns:
+             OCRResult: The OCR result from the processor
+
+         Raises:
+             ValueError: If ocr_processor is not configured
+         """
+         if not self.ocr_processor:
+             raise ValueError("ocr_processor is required for OCR operation")
+         return self.ocr_processor.process(pdf_filepath)
+
+     def run_processors(
+         self,
+         ocr_result: OCRResult,
+     ) -> list[ProcessingResult]:
+         """Process a document through the processor pipeline.
+
+         Args:
+             ocr_result: The initial OCR result to process
+
+         Returns:
+             A list of ProcessingResult with additional metadata
+             from all processors
+         """
+         processing_results: list[ProcessingResult] = []
+
+         # Start the processor chain with the raw OCRResult; each processor
+         # is responsible for converting it to ProcessingResult if needed.
+         prev_result = ocr_result
+
+         # Process the document through each processor in sequence
+         for processor in self.processors:
+             self.logger.info(f"Running processor: {processor}")
+
+             # Give each processor its own usage tracker
+             proc_usage = TokenUsageTracker()
+
+             start_processing = time.perf_counter()
+             proc_result = processor.process(prev_result, proc_usage)
+             end_processing = time.perf_counter()
+
+             # Attach token usage summary to metrics
+             proc_result.metrics = proc_usage.get_summary()
+             proc_duration = end_processing - start_processing
+             proc_result.processing_duration = proc_duration
+
+             processing_results.append(proc_result)
+             prev_result = proc_result
+             self.logger.info(f"{processor} completed successfully")
+
+         return processing_results
+
+     def run_chunker(
+         self,
+         doc: ProcessingResult | OCRResult,
+     ) -> ChunkingResult:
+         """Chunk the document using the configured chunker.
+
+         Args:
+             doc: The ProcessingResult or OCRResult to chunk
+
+         Returns:
+             A ChunkingResult object
+
+         Raises:
+             ValueError: If chunker is not configured
+         """
+         if not self.chunker:
+             raise ValueError("chunker is required for chunking operation")
+         proc_result = (
+             doc
+             if isinstance(doc, ProcessingResult)
+             else BaseProcessor.ensure_processing_result(doc)
+         )
+         usage_tracker = TokenUsageTracker()
+         # Generate chunks via chunker -> returns ChunkingResult
+         chunk_result = self.chunker.chunk(proc_result, usage_tracker)
+         return chunk_result
+
+     def run_embedder(
+         self,
+         chunk_result: ChunkingResult,
+     ) -> EmbeddingResult:
+         """Embed chunks using the configured embedder.
+
+         Args:
+             chunk_result: The ChunkingResult to embed
+
+         Returns:
+             An EmbeddingResult containing embeddings for each chunk
+
+         Raises:
+             ValueError: If embedder is not configured
+         """
+         if not self.embedder:
+             raise ValueError("embedder is required for embedding operation")
+         usage_tracker = TokenUsageTracker()
+         embedding_result = self.embedder.embed_chunks(
+             chunk_result, usage_tracker
+         )
+         return embedding_result
+
+     def _run_step(
+         self,
+         step: _PipelineStep,
+         dpr: DocumentPipelineResult,
+         start_total: float,
+     ) -> tuple[bool, object | None]:
+         key = step.key  # e.g. "ocr"
+         self.logger.info(f"Starting {key} step…")
+         start = time.perf_counter()
+         try:
+             result = step.run()
+             setattr(dpr.step_report, key, StepStatus.success)
+             setattr(dpr.timings, key, time.perf_counter() - start)
+             step.on_success(result)
+             self.logger.info(f"Step {key} completed")
+             return True, result
+         except Exception as exc:
+             tb = traceback.format_exc()
+             self.logger.error(f"Step {key} failed: {exc}\n{tb}")
+             setattr(dpr.step_report, key, StepStatus.failed)
+             setattr(dpr.timings, key, time.perf_counter() - start)
+             dpr.timings.total_duration = time.perf_counter() - start_total
+             return False, None
+
+     def process(
+         self,
+         pdf_filepath: str
+     ) -> DocumentPipelineResult:
+         """Run the configured pipeline steps in order.
+
+         Raises:
+             ValueError: If any required component is not configured
+         """
+         # Validate all components are present for full pipeline execution
+         if not self.ocr_processor:
+             raise ValueError(
+                 "ocr_processor is required for full pipeline execution"
+             )
+         if not self.chunker:
+             raise ValueError(
+                 "chunker is required for full pipeline execution"
+             )
+         if not self.embedder:
+             raise ValueError(
+                 "embedder is required for full pipeline execution"
+             )
+
+         start_total = time.perf_counter()
+         dpr = DocumentPipelineResult(
+             source_file_path=pdf_filepath,
+             processed_at=datetime.now(timezone.utc),
+             pipeline_config={
+                 "ocr": str(self.ocr_processor),
+                 "processors": [str(p) for p in self.processors],
+                 "chunker": str(self.chunker),
+                 "embedder": str(self.embedder),
+             },
+             timings=TimingMetrics(),
+             total_metrics=[],
+             step_report=StepReport(),
+         )
+
+         # ---------------- helpers ----------------
+         def _on_success(attr):
+             def handler(res):
+                 # 1. Save the step result onto the DocumentPipelineResult
+                 setattr(dpr, attr, res)
+                 # 2. Append the result's metrics to total_metrics;
+                 #    res may be a single result or a list of results
+                 if isinstance(res, list):
+                     dpr.total_metrics.extend(
+                         [r.metrics for r in res if r.metrics]
+                     )
+                 else:
+                     if isinstance(res.metrics, list):
+                         dpr.total_metrics.extend(res.metrics or [])
+                     else:
+                         dpr.total_metrics.append(res.metrics)
+             return handler
+
+         # placeholders for passing results between steps
+         ocr_res: OCRResult | None = None
+         proc_results: list[ProcessingResult] | None = None
+         chunk_res: ChunkingResult | None = None
+
+         # ---------------- step table ----------------
+         steps = [
+             self._PipelineStep(
+                 "ocr",
+                 lambda: self.run_ocr(pdf_filepath),
+                 _on_success("ocr_result"),
+             ),
+             self._PipelineStep(
+                 "processing",
+                 lambda: self.run_processors(ocr_res),
+                 _on_success("processing_results"),
+             ),
+             self._PipelineStep(
+                 "chunking",
+                 lambda: self.run_chunker(
+                     proc_results[-1] if proc_results else ocr_res
+                 ),
+                 _on_success("chunking_result"),
+             ),
+             self._PipelineStep(
+                 "embedding",
+                 lambda: self.run_embedder(chunk_res),
+                 _on_success("embedding_result"),
+             ),
+         ]
+
+         try:
+             for st in steps:
+                 ok, res = self._run_step(st, dpr, start_total)
+                 if not ok:
+                     return dpr
+
+                 # propagate outputs for later steps
+                 if st.key == "ocr":
+                     ocr_res = res  # type: ignore
+                 elif st.key == "processing":
+                     proc_results = res  # type: ignore
+                 elif st.key == "chunking":
+                     chunk_res = res  # type: ignore
+
+             # aggregate total cost once
+             dpr.total_cost_usd = sum(
+                 m.total_cost_usd  # type: ignore[attr-defined]
+                 for m in dpr.total_metrics
+                 if m and getattr(m, "total_cost_usd", None) is not None
+             )
+
+             dpr.timings.total_duration = time.perf_counter() - start_total
+             self.logger.info("Document processing completed.")
+             return dpr
+         finally:
+             dpr.logs = self._transcript.dump()
+             logging.getLogger().removeHandler(self._transcript)
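
For orientation, a minimal wiring sketch for the pipeline above. SemanticChunker's signature appears earlier in this diff; the MistralOCR class name and the api_key arguments for the OCR and embedder components are assumptions inferred from the file layout, not confirmed by this diff:

    from ragbandit.documents.document_pipeline import DocumentPipeline
    from ragbandit.documents.ocr.mistral_ocr import MistralOCR  # class name assumed
    from ragbandit.documents.chunkers.semantic_chunker import SemanticChunker
    from ragbandit.documents.embedders import MistralEmbedder

    pipeline = DocumentPipeline(
        ocr_processor=MistralOCR(api_key="..."),  # constructor signature assumed
        chunker=SemanticChunker(min_chunk_size=500, api_key="..."),
        embedder=MistralEmbedder(api_key="..."),  # constructor signature assumed
    )

    result = pipeline.process("document.pdf")
    print(result.step_report)     # per-step success/failure statuses
    print(result.total_cost_usd)  # aggregated from per-step token metrics

If a step fails, process() still returns the DocumentPipelineResult, with the failing step marked in step_report and the captured logs attached in the finally block.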
@@ -0,0 +1,14 @@
+ """
+ Document embedders for generating vector representations of document chunks.
+
+ This module provides embedders that convert document chunks
+ into vector embeddings for semantic search and similarity comparison.
+ """
+
+ from ragbandit.documents.embedders.base_embedder import BaseEmbedder
+ from ragbandit.documents.embedders.mistral_embedder import MistralEmbedder
+
+ __all__ = [
+     "BaseEmbedder",
+     "MistralEmbedder"
+ ]
@@ -0,0 +1,82 @@
+ from abc import ABC, abstractmethod
+
+ # Third-party
+ import numpy as np
+
+ # Project
+ from ragbandit.schema import (
+     ChunkingResult,
+     EmbeddingResult,
+ )
+ from ragbandit.utils.token_usage_tracker import TokenUsageTracker
+
+
+ class BaseEmbedder(ABC):
+     """
+     Abstract base class for document embedders.
+
+     This class defines the interface for embedding document chunks.
+     Concrete implementations should handle the specifics of
+     generating embeddings using different models or providers.
+     """
+
+     def __init__(self, name: str | None = None):
+         """
+         Initialize the document embedder.
+
+         Args:
+             name: Optional name for the embedder
+         """
+         self.name = name or self.__class__.__name__
+
+         # Set up logging
+         import logging
+         self.logger = logging.getLogger(f"{self.__class__.__name__}")
+
+     @abstractmethod
+     def embed_chunks(
+         self,
+         chunk_result: ChunkingResult,
+         usage_tracker: TokenUsageTracker | None = None,
+     ) -> EmbeddingResult:
+         """
+         Generate embeddings for a ChunkingResult.
+
+         Args:
+             chunk_result: The ChunkingResult whose chunks will be embedded
+             usage_tracker: Optional tracker for token usage
+
+         Returns:
+             An EmbeddingResult containing embedded chunks
+         """
+         raise NotImplementedError
+
+     def cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
+         """
+         Calculate cosine similarity between two embedding vectors.
+
+         Args:
+             a: First embedding vector
+             b: Second embedding vector
+
+         Returns:
+             Cosine similarity (1 = identical, 0 = orthogonal, -1 = opposite)
+         """
+         return (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+     def cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float:
+         """
+         Calculate cosine distance between two embedding vectors.
+
+         Args:
+             a: First embedding vector
+             b: Second embedding vector
+
+         Returns:
+             Cosine distance (0 = identical, 2 = opposite)
+         """
+         return 1 - self.cosine_similarity(a, b)
+
+     def __repr__(self) -> str:
+         """Return string representation of the embedder."""
+         return f"{self.__class__.__name__}"
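
BaseEmbedder is abstract, but its cosine helpers are concrete and can be exercised directly. A minimal sketch; _DummyEmbedder is a throwaway subclass used only for illustration and is not part of the package:

    import numpy as np

    from ragbandit.documents.embedders import BaseEmbedder


    class _DummyEmbedder(BaseEmbedder):
        def embed_chunks(self, chunk_result, usage_tracker=None):
            raise NotImplementedError  # not needed for this demo


    emb = _DummyEmbedder()
    a = np.array([1.0, 0.0])
    b = np.array([0.0, 1.0])
    print(emb.cosine_similarity(a, b))  # 0.0 (orthogonal vectors)
    print(emb.cosine_distance(a, a))    # 0.0 (identical vectors)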