ragit 0.8__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,572 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit Experiment - Core RAG optimization engine.
+
+ This module provides the main experiment class for optimizing RAG hyperparameters.
+ """
+
+ import time
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from itertools import product
+ from typing import Any
+
+ import numpy as np
+ from tqdm import tqdm
+
+ from ragit.core.experiment.results import EvaluationResult
+ from ragit.providers.base import BaseEmbeddingProvider, BaseLLMProvider
+ from ragit.providers.function_adapter import FunctionProvider
+
+
+ @dataclass
+ class RAGConfig:
+     """Configuration for a RAG pattern."""
+
+     name: str
+     chunk_size: int
+     chunk_overlap: int
+     num_chunks: int  # Number of chunks to retrieve
+     embedding_model: str
+     llm_model: str
+
+
+ @dataclass
+ class Document:
+     """A document in the knowledge base."""
+
+     id: str
+     content: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class Chunk:
+     """A document chunk."""
+
+     content: str
+     doc_id: str
+     chunk_index: int
+     embedding: tuple[float, ...] | list[float] | None = None
+
+
+ @dataclass
+ class BenchmarkQuestion:
+     """A benchmark question for evaluation."""
+
+     question: str
+     ground_truth: str
+     relevant_doc_ids: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class EvaluationScores:
+     """Scores from evaluating a RAG response."""
+
+     answer_correctness: float
+     context_relevance: float
+     faithfulness: float
+
+     @property
+     def combined_score(self) -> float:
+         """Combined score (weighted average)."""
+         return 0.4 * self.answer_correctness + 0.3 * self.context_relevance + 0.3 * self.faithfulness
+
+
+ class SimpleVectorStore:
+     """Simple in-memory vector store with pre-normalized embeddings for fast search.
+
+     Note: This class is NOT thread-safe.
+     """
+
+     def __init__(self) -> None:
+         self.chunks: list[Chunk] = []
+         self._embedding_matrix: np.ndarray[Any, np.dtype[np.float64]] | None = None  # Pre-normalized
+
+     def add(self, chunks: list[Chunk]) -> None:
+         """Add chunks to the store and rebuild pre-normalized embedding matrix."""
+         self.chunks.extend(chunks)
+         self._rebuild_matrix()
+
+     def _rebuild_matrix(self) -> None:
+         """Rebuild and pre-normalize the embedding matrix from chunks."""
+         embeddings = [c.embedding for c in self.chunks if c.embedding is not None]
+         if embeddings:
+             matrix = np.array(embeddings, dtype=np.float64)
+             # Pre-normalize for fast cosine similarity
+             norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+             norms[norms == 0] = 1  # Avoid division by zero
+             self._embedding_matrix = matrix / norms
+         else:
+             self._embedding_matrix = None
+
+     def clear(self) -> None:
+         """Clear all chunks."""
+         self.chunks = []
+         self._embedding_matrix = None
+
+     def search(self, query_embedding: tuple[float, ...] | list[float], top_k: int = 5) -> list[tuple[Chunk, float]]:
+         """Search for similar chunks using pre-normalized cosine similarity."""
+         if not self.chunks or self._embedding_matrix is None:
+             return []
+
+         # Normalize query vector
+         query_vec = np.array(query_embedding, dtype=np.float64)
+         query_norm = np.linalg.norm(query_vec)
+         if query_norm == 0:
+             return []
+         query_normalized = query_vec / query_norm
+
+         # Fast cosine similarity: matrix is pre-normalized, just dot product
+         similarities = self._embedding_matrix @ query_normalized
+
+         # Get top_k indices efficiently
+         if len(similarities) <= top_k:
+             top_indices = np.argsort(similarities)[::-1]
+         else:
+             top_indices = np.argpartition(similarities, -top_k)[-top_k:]
+             top_indices = top_indices[np.argsort(similarities[top_indices])[::-1]]
+
+         return [(self.chunks[i], float(similarities[i])) for i in top_indices]
+
+
+ class RagitExperiment:
+     """
+     Ragit Experiment - Automatic RAG Hyperparameter Optimization.
+
+     This class orchestrates the optimization of RAG pipeline hyperparameters
+     by systematically evaluating different configurations.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to use as the knowledge base.
+     benchmark : list[BenchmarkQuestion]
+         Benchmark questions for evaluation.
+     embed_fn : Callable[[str], list[float]], optional
+         Function that takes text and returns an embedding vector.
+     generate_fn : Callable, optional
+         Function for text generation.
+     provider : BaseEmbeddingProvider, optional
+         Provider for embeddings and LLM. If embed_fn is provided, this is
+         ignored for embeddings but can be used for LLM.
+
+     Raises
+     ------
+     ValueError
+         If neither embed_fn nor provider is provided.
+
+     Examples
+     --------
+     >>> # With custom functions
+     >>> experiment = RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)
+     >>>
+     >>> # With explicit provider
+     >>> from ragit.providers import OllamaProvider
+     >>> experiment = RagitExperiment(docs, benchmark, provider=OllamaProvider())
+     >>>
+     >>> results = experiment.run()
+     >>> print(results[0].pattern_name)  # Best configuration
+     """
+
+     def __init__(
+         self,
+         documents: list[Document],
+         benchmark: list[BenchmarkQuestion],
+         embed_fn: Callable[[str], list[float]] | None = None,
+         generate_fn: Callable[..., str] | None = None,
+         provider: BaseEmbeddingProvider | BaseLLMProvider | None = None,
+     ):
+         self.documents = documents
+         self.benchmark = benchmark
+         self.vector_store = SimpleVectorStore()
+         self.results: list[EvaluationResult] = []
+
+         # Resolve provider from functions or explicit provider
+         self._embedding_provider: BaseEmbeddingProvider
+         self._llm_provider: BaseLLMProvider | None = None
+
+         if embed_fn is not None:
+             # Create FunctionProvider from provided functions
+             function_provider = FunctionProvider(
+                 embed_fn=embed_fn,
+                 generate_fn=generate_fn,
+             )
+             self._embedding_provider = function_provider
+             if generate_fn is not None:
+                 self._llm_provider = function_provider
+             elif provider is not None and isinstance(provider, BaseLLMProvider):
+                 self._llm_provider = provider
+         elif provider is not None:
+             if not isinstance(provider, BaseEmbeddingProvider):
+                 raise ValueError(
+                     "Provider must implement BaseEmbeddingProvider for embeddings. "
+                     "Alternatively, provide embed_fn."
+                 )
+             self._embedding_provider = provider
+             if isinstance(provider, BaseLLMProvider):
+                 self._llm_provider = provider
+         else:
+             raise ValueError(
+                 "Must provide embed_fn or provider for embeddings. "
+                 "Examples:\n"
+                 " RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)\n"
+                 " RagitExperiment(docs, benchmark, provider=OllamaProvider())"
+             )
+
+         # LLM is required for evaluation
+         if self._llm_provider is None:
+             raise ValueError(
+                 "RagitExperiment requires LLM for evaluation. "
+                 "Provide generate_fn or a provider with LLM support."
+             )
+
+     @property
+     def provider(self) -> BaseEmbeddingProvider:
+         """Return the embedding provider (for backwards compatibility)."""
+         return self._embedding_provider
+
+     def define_search_space(
+         self,
+         chunk_sizes: list[int] | None = None,
+         chunk_overlaps: list[int] | None = None,
+         num_chunks_options: list[int] | None = None,
+         embedding_models: list[str] | None = None,
+         llm_models: list[str] | None = None,
+     ) -> list[RAGConfig]:
+         """
+         Define the hyperparameter search space.
+
+         Parameters
+         ----------
+         chunk_sizes : list[int], optional
+             Chunk sizes to test. Default: [256, 512]
+         chunk_overlaps : list[int], optional
+             Chunk overlaps to test. Default: [50, 100]
+         num_chunks_options : list[int], optional
+             Number of chunks to retrieve. Default: [2, 3]
+         embedding_models : list[str], optional
+             Embedding models to test. Default: ["default"]
+         llm_models : list[str], optional
+             LLM models to test. Default: ["default"]
+
+         Returns
+         -------
+         list[RAGConfig]
+             List of configurations to evaluate.
+         """
+         chunk_sizes = chunk_sizes or [256, 512]
+         chunk_overlaps = chunk_overlaps or [50, 100]
+         num_chunks_options = num_chunks_options or [2, 3]
+         embedding_models = embedding_models or ["default"]
+         llm_models = llm_models or ["default"]
+
+         configs = []
+         pattern_num = 1
+
+         for cs, co, nc, em, lm in product(
+             chunk_sizes, chunk_overlaps, num_chunks_options, embedding_models, llm_models
+         ):
+             # Ensure overlap is less than chunk size
+             if co >= cs:
+                 continue
+
+             configs.append(
+                 RAGConfig(
+                     name=f"Pattern_{pattern_num}",
+                     chunk_size=cs,
+                     chunk_overlap=co,
+                     num_chunks=nc,
+                     embedding_model=em,
+                     llm_model=lm,
+                 )
+             )
+             pattern_num += 1
+
+         return configs
+
+     def _chunk_document(self, doc: Document, chunk_size: int, overlap: int) -> list[Chunk]:
+         """Split document into overlapping chunks."""
+         chunks = []
+         text = doc.content
+         start = 0
+         chunk_idx = 0
+
+         while start < len(text):
+             end = start + chunk_size
+             chunk_text = text[start:end].strip()
+
+             if chunk_text:
+                 chunks.append(
+                     Chunk(
+                         content=chunk_text,
+                         doc_id=doc.id,
+                         chunk_index=chunk_idx,
+                     )
+                 )
+                 chunk_idx += 1
+
+             start = end - overlap
+             if start >= len(text) - overlap:
+                 break
+
+         return chunks
+
+     def _build_index(self, config: RAGConfig) -> None:
+         """Build vector index with given configuration using batch embedding."""
+         self.vector_store.clear()
+         all_chunks: list[Chunk] = []
+
+         # Chunk all documents
+         for doc in self.documents:
+             chunks = self._chunk_document(doc, config.chunk_size, config.chunk_overlap)
+             all_chunks.extend(chunks)
+
+         if not all_chunks:
+             return
+
+         # Batch embed all chunks at once (single API call)
+         texts = [chunk.content for chunk in all_chunks]
+         responses = self._embedding_provider.embed_batch(texts, config.embedding_model)
+
+         for chunk, response in zip(all_chunks, responses, strict=True):
+             chunk.embedding = response.embedding
+
+         self.vector_store.add(all_chunks)
+
+     def _retrieve(self, query: str, config: RAGConfig) -> list[Chunk]:
+         """Retrieve relevant chunks for a query."""
+         query_response = self._embedding_provider.embed(query, config.embedding_model)
+         results = self.vector_store.search(query_response.embedding, top_k=config.num_chunks)
+         return [chunk for chunk, _ in results]
+
+     def _generate(self, question: str, context: str, config: RAGConfig) -> str:
+         """Generate answer using RAG."""
+         if self._llm_provider is None:
+             raise ValueError("LLM provider is required for generation")
+
+         system_prompt = """You are a helpful assistant. Answer questions based ONLY on the provided context.
+ If the context doesn't contain enough information, say so. Be concise and accurate."""
+
+         prompt = f"""Context:
+ {context}
+
+ Question: {question}
+
+ Answer:"""
+
+         response = self._llm_provider.generate(
+             prompt=prompt,
+             model=config.llm_model,
+             system_prompt=system_prompt,
+             temperature=0.7,
+         )
+         return response.text
+
+     def _evaluate_response(
+         self,
+         question: str,
+         generated: str,
+         ground_truth: str,
+         context: str,
+         config: RAGConfig,
+     ) -> EvaluationScores:
+         """Evaluate a RAG response using LLM-as-judge."""
+         if self._llm_provider is None:
+             raise ValueError("LLM provider is required for evaluation")
+
+         def extract_score(response: str) -> float:
+             """Extract numeric score from LLM response."""
+             try:
+                 # Collect digit and dot characters from the response
+                 nums = "".join(c for c in response if c.isdigit() or c == ".")
+                 if nums:
+                     score = float(nums.split(".")[0])  # Take integer part
+                     return min(100, max(0, score)) / 100
+             except (ValueError, IndexError):
+                 pass
+             return 0.5
+
+         # Evaluate answer correctness
+         correctness_prompt = f"""Rate how correct this answer is compared to ground truth (0-100):
+
+ Question: {question}
+ Ground Truth: {ground_truth}
+ Generated Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(correctness_prompt, config.llm_model)
+         correctness = extract_score(resp.text)
+
+         # Evaluate context relevance
+         relevance_prompt = f"""Rate how relevant this context is for answering the question (0-100):
+
+ Question: {question}
+ Context: {context[:1000]}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(relevance_prompt, config.llm_model)
+         relevance = extract_score(resp.text)
+
+         # Evaluate faithfulness
+         faithfulness_prompt = f"""Rate if this answer is grounded in the context (0-100):
+
+ Context: {context[:1000]}
+ Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(faithfulness_prompt, config.llm_model)
+         faithfulness = extract_score(resp.text)
+
+         return EvaluationScores(
+             answer_correctness=correctness,
+             context_relevance=relevance,
+             faithfulness=faithfulness,
+         )
+
+     def evaluate_config(self, config: RAGConfig, verbose: bool = False) -> EvaluationResult:
+         """
+         Evaluate a single RAG configuration.
+
+         Parameters
+         ----------
+         config : RAGConfig
+             Configuration to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         EvaluationResult
+             Evaluation results for this configuration.
+         """
+         if verbose:
+             print(f"\nEvaluating {config.name}:")
+             print(f" chunk_size={config.chunk_size}, overlap={config.chunk_overlap}, num_chunks={config.num_chunks}")
+
+         start_time = time.time()
+
+         # Build index
+         self._build_index(config)
+
+         # Evaluate on benchmark
+         all_scores = []
+
+         for qa in self.benchmark:
+             # Retrieve
+             chunks = self._retrieve(qa.question, config)
+             context = "\n\n".join(f"[{c.doc_id}]: {c.content}" for c in chunks)
+
+             # Generate
+             answer = self._generate(qa.question, context, config)
+
+             # Evaluate
+             scores = self._evaluate_response(qa.question, answer, qa.ground_truth, context, config)
+             all_scores.append(scores)
+
+         # Aggregate mean scores across all benchmark questions
+         avg_correctness = np.mean([s.answer_correctness for s in all_scores])
+         avg_relevance = np.mean([s.context_relevance for s in all_scores])
+         avg_faithfulness = np.mean([s.faithfulness for s in all_scores])
+         combined = float(np.mean([s.combined_score for s in all_scores]))
+
+         execution_time = time.time() - start_time
+
+         if verbose:
+             print(
+                 f" Scores: correctness={avg_correctness:.2f}, "
+                 f"relevance={avg_relevance:.2f}, faithfulness={avg_faithfulness:.2f}"
+             )
+             print(f" Combined: {combined:.3f} | Time: {execution_time:.1f}s")
+
+         return EvaluationResult(
+             pattern_name=config.name,
+             indexing_params={
+                 "chunk_size": config.chunk_size,
+                 "chunk_overlap": config.chunk_overlap,
+                 "embedding_model": config.embedding_model,
+             },
+             inference_params={
+                 "num_chunks": config.num_chunks,
+                 "llm_model": config.llm_model,
+             },
+             scores={
+                 "answer_correctness": {"mean": float(avg_correctness)},
+                 "context_relevance": {"mean": float(avg_relevance)},
+                 "faithfulness": {"mean": float(avg_faithfulness)},
+             },
+             execution_time=execution_time,
+             final_score=float(combined),
+         )
+
+     def run(
+         self,
+         configs: list[RAGConfig] | None = None,
+         max_configs: int | None = None,
+         verbose: bool = True,
+     ) -> list[EvaluationResult]:
+         """
+         Run the RAG optimization experiment.
+
+         Parameters
+         ----------
+         configs : list[RAGConfig], optional
+             Configurations to evaluate. If None, uses default search space.
+         max_configs : int, optional
+             Maximum number of configurations to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Results sorted by combined score (best first).
+         """
+         if configs is None:
+             configs = self.define_search_space()
+
+         if max_configs:
+             configs = configs[:max_configs]
+
+         if verbose:
+             print("=" * 60)
+             print("RAGIT: RAG Optimization Experiment")
+             print("=" * 60)
+             print(f"Configurations to test: {len(configs)}")
+             print(f"Documents: {len(self.documents)}")
+             print(f"Benchmark questions: {len(self.benchmark)}")
+             print()
+
+         self.results = []
+
+         for cfg in tqdm(configs, desc="Evaluating configs", disable=not verbose):
+             result = self.evaluate_config(cfg, verbose=verbose)
+             self.results.append(result)
+
+         # Sort by combined score (best first)
+         self.results.sort(key=lambda x: x.final_score, reverse=True)
+
+         if verbose:
+             print("\n" + "=" * 60)
+             print("RESULTS (sorted by score)")
+             print("=" * 60)
+             for i, result in enumerate(self.results[:5], 1):
+                 print(f"{i}. {result.pattern_name}: {result.final_score:.3f}")
+                 print(
+                     f" chunk_size={result.indexing_params['chunk_size']}, "
+                     f"num_chunks={result.inference_params['num_chunks']}"
+                 )
+
+         return self.results
+
+     def get_best_config(self) -> EvaluationResult | None:
+         """Get the best configuration from results."""
+         if not self.results:
+             return None
+         return self.results[0]
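
For orientation, the sketch below shows one way the class added above could be driven end to end. It is illustrative only and not part of the package: toy_embed, toy_generate, and the sample documents are hypothetical stand-ins, imports are omitted because the diff does not show the module's file path, and the exact calling convention FunctionProvider uses for generate_fn is defined outside this diff.

# Hypothetical usage sketch; assumes Document, BenchmarkQuestion, and RagitExperiment
# from the module above are importable (file path not shown in this diff).

def toy_embed(text: str) -> list[float]:
    # Deterministic stand-in embedding: 8 floats derived from a hash of the text.
    import hashlib
    digest = hashlib.sha256(text.encode("utf-8")).digest()
    return [b / 255.0 for b in digest[:8]]

def toy_generate(prompt: str, **kwargs: object) -> str:
    # Stand-in "LLM" that always answers with a number, so extract_score() can parse it.
    return "90"

docs = [Document(id="doc1", content="The capital of France is Paris. " * 40)]
benchmark = [BenchmarkQuestion(question="What is the capital of France?", ground_truth="Paris")]

experiment = RagitExperiment(docs, benchmark, embed_fn=toy_embed, generate_fn=toy_generate)
# One configuration survives the overlap < chunk_size filter in define_search_space().
configs = experiment.define_search_space(chunk_sizes=[128], chunk_overlaps=[32], num_chunks_options=[2])
results = experiment.run(configs=configs, verbose=False)
print(results[0].pattern_name, results[0].final_score)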
@@ -0,0 +1,131 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit experiment results.
+ """
+
+ from collections.abc import Iterator
+ from dataclasses import asdict, dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class EvaluationResult:
+     """
+     Result from evaluating a single RAG configuration.
+
+     Parameters
+     ----------
+     pattern_name : str
+         Name of the RAG pattern (e.g., "Pattern_1").
+     indexing_params : dict[str, Any]
+         Hyperparameters used during indexing (chunk_size, overlap, etc.).
+     inference_params : dict[str, Any]
+         Hyperparameters used during inference (num_chunks, llm_model, etc.).
+     scores : dict[str, dict]
+         Evaluation scores (answer_correctness, context_relevance, faithfulness).
+     execution_time : float
+         Time taken for evaluation in seconds.
+     final_score : float
+         Combined score for optimization ranking.
+     """
+
+     pattern_name: str
+     indexing_params: dict[str, Any]
+     inference_params: dict[str, Any]
+     scores: dict[str, dict[str, float]]
+     execution_time: float
+     final_score: float
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary."""
+         return asdict(self)
+
+     def __repr__(self) -> str:
+         return (
+             f"EvaluationResult(name={self.pattern_name}, score={self.final_score:.3f}, time={self.execution_time:.1f}s)"
+         )
+
+
+ @dataclass
+ class ExperimentResults:
+     """
+     Collection of evaluation results from an optimization experiment.
+
+     Attributes
+     ----------
+     evaluations : list[EvaluationResult]
+         All evaluation results.
+     """
+
+     evaluations: list[EvaluationResult] = field(default_factory=list)
+
+     def __len__(self) -> int:
+         return len(self.evaluations)
+
+     def __iter__(self) -> Iterator[EvaluationResult]:
+         yield from self.evaluations
+
+     def __bool__(self) -> bool:
+         return bool(self.evaluations)
+
+     def add(self, result: EvaluationResult) -> None:
+         """Add an evaluation result."""
+         self.evaluations.append(result)
+
+     def is_cached(
+         self,
+         indexing_params: dict[str, Any],
+         inference_params: dict[str, Any],
+     ) -> float | None:
+         """
+         Check if this configuration was already evaluated.
+
+         Returns
+         -------
+         float or None
+             Final score if cached, None otherwise.
+         """
+         for ev in self.evaluations:
+             if ev.indexing_params == indexing_params and ev.inference_params == inference_params:
+                 return ev.final_score
+         return None
+
+     @property
+     def scores(self) -> list[float]:
+         """All final scores."""
+         return [ev.final_score for ev in self.evaluations]
+
+     def sorted(self, reverse: bool = True) -> list[EvaluationResult]:
+         """
+         Get results sorted by final score.
+
+         Parameters
+         ----------
+         reverse : bool
+             If True (default), best scores first.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Sorted results.
+         """
+         return sorted(self.evaluations, key=lambda x: x.final_score, reverse=reverse)
+
+     def get_best(self, k: int = 1) -> list[EvaluationResult]:
+         """
+         Get k best results.
+
+         Parameters
+         ----------
+         k : int
+             Number of results to return.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Top k results by score.
+         """
+         return self.sorted()[:k]
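
As a closing illustration, here is a brief hypothetical sketch of the ExperimentResults container added above. The parameter dictionaries and scores are invented values; the import assumes this second file is the ragit.core.experiment.results module that the first file imports EvaluationResult from.

# Illustrative only; concrete values are invented and the import path is inferred from the diff.
from ragit.core.experiment.results import EvaluationResult, ExperimentResults

results = ExperimentResults()
results.add(
    EvaluationResult(
        pattern_name="Pattern_1",
        indexing_params={"chunk_size": 256, "chunk_overlap": 50, "embedding_model": "default"},
        inference_params={"num_chunks": 2, "llm_model": "default"},
        scores={"answer_correctness": {"mean": 0.80}},
        execution_time=12.3,
        final_score=0.78,
    )
)

# is_cached() returns the stored final_score only when both parameter dicts match exactly.
print(results.is_cached(
    {"chunk_size": 256, "chunk_overlap": 50, "embedding_model": "default"},
    {"num_chunks": 2, "llm_model": "default"},
))  # 0.78

print(len(results), results.get_best(k=1)[0])  # 1 EvaluationResult(name=Pattern_1, score=0.780, time=12.3s)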