ragit 0.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,577 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit Experiment - Core RAG optimization engine.
+
+ This module provides the main experiment class for optimizing RAG hyperparameters.
+ """
+
+ import time
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from itertools import product
+ from typing import Any
+
+ import numpy as np
+ from tqdm import tqdm
+
+ from ragit.core.experiment.results import EvaluationResult
+ from ragit.providers.base import BaseEmbeddingProvider, BaseLLMProvider
+ from ragit.providers.function_adapter import FunctionProvider
+
+
+ @dataclass
+ class RAGConfig:
+     """Configuration for a RAG pattern."""
+
+     name: str
+     chunk_size: int
+     chunk_overlap: int
+     num_chunks: int  # Number of chunks to retrieve
+     embedding_model: str
+     llm_model: str
+
+
+ @dataclass
+ class Document:
+     """A document in the knowledge base."""
+
+     id: str
+     content: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class Chunk:
+     """A document chunk with optional rich metadata.
+
+     Metadata can include:
+     - document_id: SHA256 hash for deduplication and window search
+     - sequence_number: Order within the document
+     - chunk_start/chunk_end: Character positions in original text
+     """
+
+     content: str
+     doc_id: str
+     chunk_index: int
+     embedding: tuple[float, ...] | list[float] | None = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class BenchmarkQuestion:
+     """A benchmark question for evaluation."""
+
+     question: str
+     ground_truth: str
+     relevant_doc_ids: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class EvaluationScores:
+     """Scores from evaluating a RAG response."""
+
+     answer_correctness: float
+     context_relevance: float
+     faithfulness: float
+
+     @property
+     def combined_score(self) -> float:
+         """Combined score (weighted average)."""
+         return 0.4 * self.answer_correctness + 0.3 * self.context_relevance + 0.3 * self.faithfulness
+
+
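For reference, combined_score above weights answer_correctness at 0.4 and the other two metrics at 0.3 each. A quick check of the arithmetic, with made-up scores and assuming the EvaluationScores class above is in scope:

# Illustrative only: example values, not output produced by the package.
scores = EvaluationScores(answer_correctness=0.9, context_relevance=0.8, faithfulness=0.7)
assert abs(scores.combined_score - (0.4 * 0.9 + 0.3 * 0.8 + 0.3 * 0.7)) < 1e-9  # 0.81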
+ class SimpleVectorStore:
+     """Simple in-memory vector store with pre-normalized embeddings for fast search.
+
+     Note: This class is NOT thread-safe.
+     """
+
+     def __init__(self) -> None:
+         self.chunks: list[Chunk] = []
+         self._embedding_matrix: np.ndarray[Any, np.dtype[np.float64]] | None = None  # Pre-normalized
+
+     def add(self, chunks: list[Chunk]) -> None:
+         """Add chunks to the store and rebuild pre-normalized embedding matrix."""
+         self.chunks.extend(chunks)
+         self._rebuild_matrix()
+
+     def _rebuild_matrix(self) -> None:
+         """Rebuild and pre-normalize the embedding matrix from chunks."""
+         # NOTE: assumes every chunk has an embedding, so matrix rows align with self.chunks indices
+         embeddings = [c.embedding for c in self.chunks if c.embedding is not None]
+         if embeddings:
+             matrix = np.array(embeddings, dtype=np.float64)
+             # Pre-normalize for fast cosine similarity
+             norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+             norms[norms == 0] = 1  # Avoid division by zero
+             self._embedding_matrix = matrix / norms
+         else:
+             self._embedding_matrix = None
+
+     def clear(self) -> None:
+         """Clear all chunks."""
+         self.chunks = []
+         self._embedding_matrix = None
+
+     def search(self, query_embedding: tuple[float, ...] | list[float], top_k: int = 5) -> list[tuple[Chunk, float]]:
+         """Search for similar chunks using pre-normalized cosine similarity."""
+         if not self.chunks or self._embedding_matrix is None:
+             return []
+
+         # Normalize query vector
+         query_vec = np.array(query_embedding, dtype=np.float64)
+         query_norm = np.linalg.norm(query_vec)
+         if query_norm == 0:
+             return []
+         query_normalized = query_vec / query_norm
+
+         # Fast cosine similarity: matrix is pre-normalized, just dot product
+         similarities = self._embedding_matrix @ query_normalized
+
+         # Get top_k indices efficiently
+         if len(similarities) <= top_k:
+             top_indices = np.argsort(similarities)[::-1]
+         else:
+             top_indices = np.argpartition(similarities, -top_k)[-top_k:]
+             top_indices = top_indices[np.argsort(similarities[top_indices])[::-1]]
+
+         return [(self.chunks[i], float(similarities[i])) for i in top_indices]
+
+
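Because both the stored matrix and the query are normalized, search reduces cosine similarity to a single matrix-vector product. A minimal sketch of the behaviour, assuming SimpleVectorStore and Chunk from above are in scope; the 2-D embeddings are invented purely for illustration:

# Hypothetical toy embeddings, illustration only.
store = SimpleVectorStore()
store.add([
    Chunk(content="cats", doc_id="d1", chunk_index=0, embedding=[1.0, 0.0]),
    Chunk(content="dogs", doc_id="d1", chunk_index=1, embedding=[0.0, 2.0]),
])
hits = store.search([1.0, 0.1], top_k=1)
# Expected: the "cats" chunk ranks first with similarity close to 1.0.
print(hits[0][0].content, round(hits[0][1], 3))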
+ class RagitExperiment:
+     """
+     Ragit Experiment - Automatic RAG Hyperparameter Optimization.
+
+     This class orchestrates the optimization of RAG pipeline hyperparameters
+     by systematically evaluating different configurations.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to use as the knowledge base.
+     benchmark : list[BenchmarkQuestion]
+         Benchmark questions for evaluation.
+     embed_fn : Callable[[str], list[float]], optional
+         Function that takes text and returns an embedding vector.
+     generate_fn : Callable, optional
+         Function for text generation.
+     provider : BaseEmbeddingProvider, optional
+         Provider for embeddings and LLM. If embed_fn is provided, this is
+         ignored for embeddings but can be used for LLM.
+
+     Raises
+     ------
+     ValueError
+         If neither embed_fn nor provider is provided.
+
+     Examples
+     --------
+     >>> # With custom functions
+     >>> experiment = RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)
+     >>>
+     >>> # With explicit provider
+     >>> from ragit.providers import OllamaProvider
+     >>> experiment = RagitExperiment(docs, benchmark, provider=OllamaProvider())
+     >>>
+     >>> results = experiment.run()
+     >>> print(results[0].pattern_name)  # Best configuration
+     """
+
+     def __init__(
+         self,
+         documents: list[Document],
+         benchmark: list[BenchmarkQuestion],
+         embed_fn: Callable[[str], list[float]] | None = None,
+         generate_fn: Callable[..., str] | None = None,
+         provider: BaseEmbeddingProvider | BaseLLMProvider | None = None,
+     ):
+         self.documents = documents
+         self.benchmark = benchmark
+         self.vector_store = SimpleVectorStore()
+         self.results: list[EvaluationResult] = []
+
+         # Resolve provider from functions or explicit provider
+         self._embedding_provider: BaseEmbeddingProvider
+         self._llm_provider: BaseLLMProvider | None = None
+
+         if embed_fn is not None:
+             # Create FunctionProvider from provided functions
+             function_provider = FunctionProvider(
+                 embed_fn=embed_fn,
+                 generate_fn=generate_fn,
+             )
+             self._embedding_provider = function_provider
+             if generate_fn is not None:
+                 self._llm_provider = function_provider
+             elif provider is not None and isinstance(provider, BaseLLMProvider):
+                 self._llm_provider = provider
+         elif provider is not None:
+             if not isinstance(provider, BaseEmbeddingProvider):
+                 raise ValueError(
+                     "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
+                 )
+             self._embedding_provider = provider
+             if isinstance(provider, BaseLLMProvider):
+                 self._llm_provider = provider
+         else:
+             raise ValueError(
+                 "Must provide embed_fn or provider for embeddings. "
+                 "Examples:\n"
+                 " RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)\n"
+                 " RagitExperiment(docs, benchmark, provider=OllamaProvider())"
+             )
+
+         # LLM is required for evaluation
+         if self._llm_provider is None:
+             raise ValueError(
+                 "RagitExperiment requires LLM for evaluation. Provide generate_fn or a provider with LLM support."
+             )
+
+     @property
+     def provider(self) -> BaseEmbeddingProvider:
+         """Return the embedding provider (for backwards compatibility)."""
+         return self._embedding_provider
+
+     def define_search_space(
+         self,
+         chunk_sizes: list[int] | None = None,
+         chunk_overlaps: list[int] | None = None,
+         num_chunks_options: list[int] | None = None,
+         embedding_models: list[str] | None = None,
+         llm_models: list[str] | None = None,
+     ) -> list[RAGConfig]:
+         """
+         Define the hyperparameter search space.
+
+         Parameters
+         ----------
+         chunk_sizes : list[int], optional
+             Chunk sizes to test. Default: [256, 512]
+         chunk_overlaps : list[int], optional
+             Chunk overlaps to test. Default: [50, 100]
+         num_chunks_options : list[int], optional
+             Number of chunks to retrieve. Default: [2, 3]
+         embedding_models : list[str], optional
+             Embedding models to test. Default: ["default"]
+         llm_models : list[str], optional
+             LLM models to test. Default: ["default"]
+
+         Returns
+         -------
+         list[RAGConfig]
+             List of configurations to evaluate.
+         """
+         chunk_sizes = chunk_sizes or [256, 512]
+         chunk_overlaps = chunk_overlaps or [50, 100]
+         num_chunks_options = num_chunks_options or [2, 3]
+         embedding_models = embedding_models or ["default"]
+         llm_models = llm_models or ["default"]
+
+         configs = []
+         pattern_num = 1
+
+         for cs, co, nc, em, lm in product(
+             chunk_sizes, chunk_overlaps, num_chunks_options, embedding_models, llm_models
+         ):
+             # Ensure overlap is less than chunk size
+             if co >= cs:
+                 continue
+
+             configs.append(
+                 RAGConfig(
+                     name=f"Pattern_{pattern_num}",
+                     chunk_size=cs,
+                     chunk_overlap=co,
+                     num_chunks=nc,
+                     embedding_model=em,
+                     llm_model=lm,
+                 )
+             )
+             pattern_num += 1
+
+         return configs
+
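With the defaults above, define_search_space builds the Cartesian product of the option lists and drops combinations whose overlap is not smaller than the chunk size; since both default overlaps (50, 100) are below both default chunk sizes (256, 512), nothing is filtered and the grid has 2 x 2 x 2 x 1 x 1 = 8 configurations. A self-contained sanity check of that count (illustrative only, not package code):

from itertools import product

chunk_sizes, chunk_overlaps, num_chunks_options = [256, 512], [50, 100], [2, 3]
valid = [(cs, co, nc) for cs, co, nc in product(chunk_sizes, chunk_overlaps, num_chunks_options) if co < cs]
print(len(valid))  # 8 configurations -> Pattern_1 .. Pattern_8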
+     def _chunk_document(self, doc: Document, chunk_size: int, overlap: int) -> list[Chunk]:
+         """Split document into overlapping chunks."""
+         chunks = []
+         text = doc.content
+         start = 0
+         chunk_idx = 0
+
+         while start < len(text):
+             end = start + chunk_size
+             chunk_text = text[start:end].strip()
+
+             if chunk_text:
+                 chunks.append(
+                     Chunk(
+                         content=chunk_text,
+                         doc_id=doc.id,
+                         chunk_index=chunk_idx,
+                     )
+                 )
+                 chunk_idx += 1
+
+             start = end - overlap
+             if start >= len(text) - overlap:
+                 break
+
+         return chunks
+
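To make the windowing in _chunk_document concrete: the window advances by chunk_size - overlap characters per step, so consecutive chunks share their trailing/leading overlap characters, and the loop stops once a window has reached the end of the text. A standalone sketch of the same sliding-window arithmetic (ignoring the strip/empty-chunk handling; illustrative only, not the package API):

text = "abcdefghijklmnopqrst"
chunk_size, overlap = 10, 4
start, windows = 0, []
while start < len(text):
    end = start + chunk_size
    windows.append(text[start:end])
    start = end - overlap
    if start >= len(text) - overlap:
        break
print(windows)  # ['abcdefghij', 'ghijklmnop', 'mnopqrst']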
+     def _build_index(self, config: RAGConfig) -> None:
+         """Build vector index with given configuration using batch embedding."""
+         self.vector_store.clear()
+         all_chunks: list[Chunk] = []
+
+         # Chunk all documents
+         for doc in self.documents:
+             chunks = self._chunk_document(doc, config.chunk_size, config.chunk_overlap)
+             all_chunks.extend(chunks)
+
+         if not all_chunks:
+             return
+
+         # Batch embed all chunks at once (single API call)
+         texts = [chunk.content for chunk in all_chunks]
+         responses = self._embedding_provider.embed_batch(texts, config.embedding_model)
+
+         for chunk, response in zip(all_chunks, responses, strict=True):
+             chunk.embedding = response.embedding
+
+         self.vector_store.add(all_chunks)
+
+     def _retrieve(self, query: str, config: RAGConfig) -> list[Chunk]:
+         """Retrieve relevant chunks for a query."""
+         query_response = self._embedding_provider.embed(query, config.embedding_model)
+         results = self.vector_store.search(query_response.embedding, top_k=config.num_chunks)
+         return [chunk for chunk, _ in results]
+
+     def _generate(self, question: str, context: str, config: RAGConfig) -> str:
+         """Generate answer using RAG."""
+         if self._llm_provider is None:
+             raise ValueError("LLM provider is required for generation")
+
+         system_prompt = """You are a helpful assistant. Answer questions based ONLY on the provided context.
+ If the context doesn't contain enough information, say so. Be concise and accurate."""
+
+         prompt = f"""Context:
+ {context}
+
+ Question: {question}
+
+ Answer:"""
+
+         response = self._llm_provider.generate(
+             prompt=prompt,
+             model=config.llm_model,
+             system_prompt=system_prompt,
+             temperature=0.7,
+         )
+         return response.text
+
+     def _evaluate_response(
+         self,
+         question: str,
+         generated: str,
+         ground_truth: str,
+         context: str,
+         config: RAGConfig,
+     ) -> EvaluationScores:
+         """Evaluate a RAG response using LLM-as-judge."""
+         if self._llm_provider is None:
+             raise ValueError("LLM provider is required for evaluation")
+
+         def extract_score(response: str) -> float:
+             """Extract numeric score from LLM response."""
+             try:
+                 # Collect digit and dot characters from the response
+                 nums = "".join(c for c in response if c.isdigit() or c == ".")
+                 if nums:
+                     score = float(nums.split(".")[0])  # Take integer part
+                     return min(100, max(0, score)) / 100
+             except (ValueError, IndexError):
+                 pass
+             return 0.5
+
+         # Evaluate answer correctness
+         correctness_prompt = f"""Rate how correct this answer is compared to ground truth (0-100):
+
+ Question: {question}
+ Ground Truth: {ground_truth}
+ Generated Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(correctness_prompt, config.llm_model)
+         correctness = extract_score(resp.text)
+
+         # Evaluate context relevance
+         relevance_prompt = f"""Rate how relevant this context is for answering the question (0-100):
+
+ Question: {question}
+ Context: {context[:1000]}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(relevance_prompt, config.llm_model)
+         relevance = extract_score(resp.text)
+
+         # Evaluate faithfulness
+         faithfulness_prompt = f"""Rate if this answer is grounded in the context (0-100):
+
+ Context: {context[:1000]}
+ Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self._llm_provider.generate(faithfulness_prompt, config.llm_model)
+         faithfulness = extract_score(resp.text)
+
+         return EvaluationScores(
+             answer_correctness=correctness,
+             context_relevance=relevance,
+             faithfulness=faithfulness,
+         )
+
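The three judge prompts above ask for a bare 0-100 number, and extract_score maps the reply onto [0, 1]: it gathers all digit and dot characters, keeps the integer part, clamps to the 0-100 range, divides by 100, and falls back to 0.5 when nothing numeric is found. Note that a reply such as "85/100" would concatenate to 85100 and clamp to 1.0, so the "ONLY a number" instruction matters. Since extract_score is a nested function, here is a standalone replica of the same arithmetic, for illustration only:

def judge_to_unit(reply: str) -> float:
    # Replica of extract_score's logic, not the package function itself.
    try:
        nums = "".join(c for c in reply if c.isdigit() or c == ".")
        if nums:
            return min(100, max(0, float(nums.split(".")[0]))) / 100
    except (ValueError, IndexError):
        pass
    return 0.5

print(judge_to_unit("85"))          # 0.85
print(judge_to_unit("Score: 92."))  # 0.92
print(judge_to_unit("no idea"))     # 0.5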
+     def evaluate_config(self, config: RAGConfig, verbose: bool = False) -> EvaluationResult:
+         """
+         Evaluate a single RAG configuration.
+
+         Parameters
+         ----------
+         config : RAGConfig
+             Configuration to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         EvaluationResult
+             Evaluation results for this configuration.
+         """
+         if verbose:
+             print(f"\nEvaluating {config.name}:")
+             print(f" chunk_size={config.chunk_size}, overlap={config.chunk_overlap}, num_chunks={config.num_chunks}")
+
+         start_time = time.time()
+
+         # Build index
+         self._build_index(config)
+
+         # Evaluate on benchmark
+         all_scores = []
+
+         for qa in self.benchmark:
+             # Retrieve
+             chunks = self._retrieve(qa.question, config)
+             context = "\n\n".join(f"[{c.doc_id}]: {c.content}" for c in chunks)
+
+             # Generate
+             answer = self._generate(qa.question, context, config)
+
+             # Evaluate
+             scores = self._evaluate_response(qa.question, answer, qa.ground_truth, context, config)
+             all_scores.append(scores)
+
+         # Aggregate scores across all benchmark questions
+         avg_correctness = np.mean([s.answer_correctness for s in all_scores])
+         avg_relevance = np.mean([s.context_relevance for s in all_scores])
+         avg_faithfulness = np.mean([s.faithfulness for s in all_scores])
+         combined = float(np.mean([s.combined_score for s in all_scores]))
+
+         execution_time = time.time() - start_time
+
+         if verbose:
+             print(
+                 f" Scores: correctness={avg_correctness:.2f}, "
+                 f"relevance={avg_relevance:.2f}, faithfulness={avg_faithfulness:.2f}"
+             )
+             print(f" Combined: {combined:.3f} | Time: {execution_time:.1f}s")
+
+         return EvaluationResult(
+             pattern_name=config.name,
+             indexing_params={
+                 "chunk_size": config.chunk_size,
+                 "chunk_overlap": config.chunk_overlap,
+                 "embedding_model": config.embedding_model,
+             },
+             inference_params={
+                 "num_chunks": config.num_chunks,
+                 "llm_model": config.llm_model,
+             },
+             scores={
+                 "answer_correctness": {"mean": float(avg_correctness)},
+                 "context_relevance": {"mean": float(avg_relevance)},
+                 "faithfulness": {"mean": float(avg_faithfulness)},
+             },
+             execution_time=execution_time,
+             final_score=float(combined),
+         )
+
+     def run(
+         self,
+         configs: list[RAGConfig] | None = None,
+         max_configs: int | None = None,
+         verbose: bool = True,
+     ) -> list[EvaluationResult]:
+         """
+         Run the RAG optimization experiment.
+
+         Parameters
+         ----------
+         configs : list[RAGConfig], optional
+             Configurations to evaluate. If None, uses default search space.
+         max_configs : int, optional
+             Maximum number of configurations to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Results sorted by combined score (best first).
+         """
+         if configs is None:
+             configs = self.define_search_space()
+
+         if max_configs:
+             configs = configs[:max_configs]
+
+         if verbose:
+             print("=" * 60)
+             print("RAGIT: RAG Optimization Experiment")
+             print("=" * 60)
+             print(f"Configurations to test: {len(configs)}")
+             print(f"Documents: {len(self.documents)}")
+             print(f"Benchmark questions: {len(self.benchmark)}")
+             print()
+
+         self.results = []
+
+         for cfg in tqdm(configs, desc="Evaluating configs", disable=not verbose):
+             result = self.evaluate_config(cfg, verbose=verbose)
+             self.results.append(result)
+
+         # Sort by combined score (best first)
+         self.results.sort(key=lambda x: x.final_score, reverse=True)
+
+         if verbose:
+             print("\n" + "=" * 60)
+             print("RESULTS (sorted by score)")
+             print("=" * 60)
+             for i, result in enumerate(self.results[:5], 1):
+                 print(f"{i}. {result.pattern_name}: {result.final_score:.3f}")
+                 print(
+                     f" chunk_size={result.indexing_params['chunk_size']}, "
+                     f"num_chunks={result.inference_params['num_chunks']}"
+                 )
+
+         return self.results
+
+     def get_best_config(self) -> EvaluationResult | None:
+         """Get the best configuration from results."""
+         if not self.results:
+             return None
+         return self.results[0]
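Putting this module together, a minimal run might look like the sketch below. It mirrors the class docstring's embed_fn/generate_fn example; my_embed and my_llm are hypothetical stand-ins, and whether these exact callables satisfy FunctionProvider's expectations depends on code outside this diff.

# Minimal sketch, assuming the classes above are importable.
docs = [Document(id="doc1", content="Ragit optimizes RAG hyperparameters.")]
benchmark = [BenchmarkQuestion(question="What does Ragit optimize?",
                               ground_truth="RAG hyperparameters.")]

def my_embed(text: str) -> list[float]:
    # Toy fixed-length embedding, illustration only.
    vec = [0.0] * 32
    for i, c in enumerate(text[:32]):
        vec[i] = float(ord(c) % 7)
    return vec

def my_llm(prompt: str, **kwargs) -> str:
    return "RAG hyperparameters."  # canned answer, illustration only

experiment = RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)
results = experiment.run(max_configs=2, verbose=False)
print(results[0].pattern_name, results[0].final_score)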
@@ -0,0 +1,131 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit experiment results.
+ """
+
+ from collections.abc import Iterator
+ from dataclasses import asdict, dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class EvaluationResult:
+     """
+     Result from evaluating a single RAG configuration.
+
+     Parameters
+     ----------
+     pattern_name : str
+         Name of the RAG pattern (e.g., "Pattern_1").
+     indexing_params : dict[str, Any]
+         Hyperparameters used during indexing (chunk_size, overlap, etc.).
+     inference_params : dict[str, Any]
+         Hyperparameters used during inference (num_chunks, llm_model, etc.).
+     scores : dict[str, dict]
+         Evaluation scores (answer_correctness, context_relevance, faithfulness).
+     execution_time : float
+         Time taken for evaluation in seconds.
+     final_score : float
+         Combined score for optimization ranking.
+     """
+
+     pattern_name: str
+     indexing_params: dict[str, Any]
+     inference_params: dict[str, Any]
+     scores: dict[str, dict[str, float]]
+     execution_time: float
+     final_score: float
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary."""
+         return asdict(self)
+
+     def __repr__(self) -> str:
+         return (
+             f"EvaluationResult(name={self.pattern_name}, score={self.final_score:.3f}, time={self.execution_time:.1f}s)"
+         )
+
+
+ @dataclass
+ class ExperimentResults:
+     """
+     Collection of evaluation results from an optimization experiment.
+
+     Attributes
+     ----------
+     evaluations : list[EvaluationResult]
+         All evaluation results.
+     """
+
+     evaluations: list[EvaluationResult] = field(default_factory=list)
+
+     def __len__(self) -> int:
+         return len(self.evaluations)
+
+     def __iter__(self) -> Iterator[EvaluationResult]:
+         yield from self.evaluations
+
+     def __bool__(self) -> bool:
+         return bool(self.evaluations)
+
+     def add(self, result: EvaluationResult) -> None:
+         """Add an evaluation result."""
+         self.evaluations.append(result)
+
+     def is_cached(
+         self,
+         indexing_params: dict[str, Any],
+         inference_params: dict[str, Any],
+     ) -> float | None:
+         """
+         Check if this configuration was already evaluated.
+
+         Returns
+         -------
+         float or None
+             Final score if cached, None otherwise.
+         """
+         for ev in self.evaluations:
+             if ev.indexing_params == indexing_params and ev.inference_params == inference_params:
+                 return ev.final_score
+         return None
+
+     @property
+     def scores(self) -> list[float]:
+         """All final scores."""
+         return [ev.final_score for ev in self.evaluations]
+
+     def sorted(self, reverse: bool = True) -> list[EvaluationResult]:
+         """
+         Get results sorted by final score.
+
+         Parameters
+         ----------
+         reverse : bool
+             If True (default), best scores first.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Sorted results.
+         """
+         return sorted(self.evaluations, key=lambda x: x.final_score, reverse=reverse)
+
+     def get_best(self, k: int = 1) -> list[EvaluationResult]:
+         """
+         Get k best results.
+
+         Parameters
+         ----------
+         k : int
+             Number of results to return.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Top k results by score.
+         """
+         return self.sorted()[:k]
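ExperimentResults is a thin container; RagitExperiment.run in the first file keeps its own plain list, so wiring the two together is left to the caller. A small sketch of the container on its own, importing from the path shown in this diff; the numbers are invented for illustration:

from ragit.core.experiment.results import EvaluationResult, ExperimentResults

# Illustrative only: a hand-built result, not output from a real run.
collection = ExperimentResults()
collection.add(EvaluationResult(
    pattern_name="Pattern_1",
    indexing_params={"chunk_size": 256, "chunk_overlap": 50, "embedding_model": "default"},
    inference_params={"num_chunks": 2, "llm_model": "default"},
    scores={"answer_correctness": {"mean": 0.8}},
    execution_time=1.2,
    final_score=0.74,
))
print(len(collection), collection.get_best(1)[0].pattern_name)  # 1 Pattern_1
print(collection.is_cached({"chunk_size": 256, "chunk_overlap": 50, "embedding_model": "default"},
                           {"num_chunks": 2, "llm_model": "default"}))  # 0.74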