ragit 0.7__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,507 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit Experiment - Core RAG optimization engine.
+
+ This module provides the main experiment class for optimizing RAG hyperparameters.
+ """
+
+ import time
+ from dataclasses import dataclass, field
+ from itertools import product
+ from typing import Any
+
+ import numpy as np
+ from tqdm import tqdm
+
+ from ragit.config import config
+ from ragit.core.experiment.results import EvaluationResult
+ from ragit.providers import OllamaProvider
+
+
+ @dataclass
+ class RAGConfig:
+     """Configuration for a RAG pattern."""
+
+     name: str
+     chunk_size: int
+     chunk_overlap: int
+     num_chunks: int  # Number of chunks to retrieve
+     embedding_model: str
+     llm_model: str
+
+
+ @dataclass
+ class Document:
+     """A document in the knowledge base."""
+
+     id: str
+     content: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class Chunk:
+     """A document chunk."""
+
+     content: str
+     doc_id: str
+     chunk_index: int
+     embedding: tuple[float, ...] | list[float] | None = None
+
+
+ @dataclass
+ class BenchmarkQuestion:
+     """A benchmark question for evaluation."""
+
+     question: str
+     ground_truth: str
+     relevant_doc_ids: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class EvaluationScores:
+     """Scores from evaluating a RAG response."""
+
+     answer_correctness: float
+     context_relevance: float
+     faithfulness: float
+
+     @property
+     def combined_score(self) -> float:
+         """Combined score (weighted average)."""
+         return 0.4 * self.answer_correctness + 0.3 * self.context_relevance + 0.3 * self.faithfulness
+
+
+ class SimpleVectorStore:
+     """Simple in-memory vector store with pre-normalized embeddings for fast search.
+
+     Note: This class is NOT thread-safe.
+     """
+
+     def __init__(self) -> None:
+         self.chunks: list[Chunk] = []
+         self._embedding_matrix: np.ndarray[Any, np.dtype[np.float64]] | None = None  # Pre-normalized
+
+     def add(self, chunks: list[Chunk]) -> None:
+         """Add chunks to the store and rebuild pre-normalized embedding matrix."""
+         self.chunks.extend(chunks)
+         self._rebuild_matrix()
+
+     def _rebuild_matrix(self) -> None:
+         """Rebuild and pre-normalize the embedding matrix from chunks."""
+         embeddings = [c.embedding for c in self.chunks if c.embedding is not None]
+         if embeddings:
+             matrix = np.array(embeddings, dtype=np.float64)
+             # Pre-normalize for fast cosine similarity
+             norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+             norms[norms == 0] = 1  # Avoid division by zero
+             self._embedding_matrix = matrix / norms
+         else:
+             self._embedding_matrix = None
+
+     def clear(self) -> None:
+         """Clear all chunks."""
+         self.chunks = []
+         self._embedding_matrix = None
+
+     def search(self, query_embedding: tuple[float, ...] | list[float], top_k: int = 5) -> list[tuple[Chunk, float]]:
+         """Search for similar chunks using pre-normalized cosine similarity."""
+         if not self.chunks or self._embedding_matrix is None:
+             return []
+
+         # Normalize query vector
+         query_vec = np.array(query_embedding, dtype=np.float64)
+         query_norm = np.linalg.norm(query_vec)
+         if query_norm == 0:
+             return []
+         query_normalized = query_vec / query_norm
+
+         # Fast cosine similarity: matrix is pre-normalized, just dot product
+         similarities = self._embedding_matrix @ query_normalized
+
+         # Get top_k indices efficiently
+         if len(similarities) <= top_k:
+             top_indices = np.argsort(similarities)[::-1]
+         else:
+             top_indices = np.argpartition(similarities, -top_k)[-top_k:]
+             top_indices = top_indices[np.argsort(similarities[top_indices])[::-1]]
+
+         return [(self.chunks[i], float(similarities[i])) for i in top_indices]
+
+
+ class RagitExperiment:
+     """
+     Ragit Experiment - Automatic RAG Hyperparameter Optimization.
+
+     This class orchestrates the optimization of RAG pipeline hyperparameters
+     by systematically evaluating different configurations.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to use as the knowledge base.
+     benchmark : list[BenchmarkQuestion]
+         Benchmark questions for evaluation.
+     provider : OllamaProvider, optional
+         LLM/Embedding provider. Defaults to OllamaProvider().
+
+     Examples
+     --------
+     >>> documents = [Document(id="doc1", content="...")]
+     >>> benchmark = [BenchmarkQuestion(question="...", ground_truth="...")]
+     >>> experiment = RagitExperiment(documents, benchmark)
+     >>> results = experiment.run()
+     >>> print(results[0])  # Best configuration
+     """
+
+     def __init__(
+         self,
+         documents: list[Document],
+         benchmark: list[BenchmarkQuestion],
+         provider: OllamaProvider | None = None,
+     ):
+         self.documents = documents
+         self.benchmark = benchmark
+         self.provider = provider or OllamaProvider()
+         self.vector_store = SimpleVectorStore()
+         self.results: list[EvaluationResult] = []
+
+     def define_search_space(
+         self,
+         chunk_sizes: list[int] | None = None,
+         chunk_overlaps: list[int] | None = None,
+         num_chunks_options: list[int] | None = None,
+         embedding_models: list[str] | None = None,
+         llm_models: list[str] | None = None,
+     ) -> list[RAGConfig]:
+         """
+         Define the hyperparameter search space.
+
+         Parameters
+         ----------
+         chunk_sizes : list[int], optional
+             Chunk sizes to test. Default: [256, 512]
+         chunk_overlaps : list[int], optional
+             Chunk overlaps to test. Default: [50, 100]
+         num_chunks_options : list[int], optional
+             Number of chunks to retrieve. Default: [2, 3]
+         embedding_models : list[str], optional
+             Embedding models to test. Default: from RAGIT_DEFAULT_EMBEDDING_MODEL env var
+         llm_models : list[str], optional
+             LLM models to test. Default: from RAGIT_DEFAULT_LLM_MODEL env var
+
+         Returns
+         -------
+         list[RAGConfig]
+             List of configurations to evaluate.
+         """
+         chunk_sizes = chunk_sizes or [256, 512]
+         chunk_overlaps = chunk_overlaps or [50, 100]
+         num_chunks_options = num_chunks_options or [2, 3]
+         embedding_models = embedding_models or [config.DEFAULT_EMBEDDING_MODEL]
+         llm_models = llm_models or [config.DEFAULT_LLM_MODEL]
+
+         configs = []
+         pattern_num = 1
+
+         for cs, co, nc, em, lm in product(
+             chunk_sizes, chunk_overlaps, num_chunks_options, embedding_models, llm_models
+         ):
+             # Ensure overlap is less than chunk size
+             if co >= cs:
+                 continue
+
+             configs.append(
+                 RAGConfig(
+                     name=f"Pattern_{pattern_num}",
+                     chunk_size=cs,
+                     chunk_overlap=co,
+                     num_chunks=nc,
+                     embedding_model=em,
+                     llm_model=lm,
+                 )
+             )
+             pattern_num += 1
+
+         return configs
+
+     def _chunk_document(self, doc: Document, chunk_size: int, overlap: int) -> list[Chunk]:
+         """Split document into overlapping chunks."""
+         chunks = []
+         text = doc.content
+         start = 0
+         chunk_idx = 0
+
+         while start < len(text):
+             end = start + chunk_size
+             chunk_text = text[start:end].strip()
+
+             if chunk_text:
+                 chunks.append(
+                     Chunk(
+                         content=chunk_text,
+                         doc_id=doc.id,
+                         chunk_index=chunk_idx,
+                     )
+                 )
+                 chunk_idx += 1
+
+             start = end - overlap
+             if start >= len(text) - overlap:
+                 break
+
+         return chunks
+
+     def _build_index(self, config: RAGConfig) -> None:
+         """Build vector index with given configuration using batch embedding."""
+         self.vector_store.clear()
+         all_chunks: list[Chunk] = []
+
+         # Chunk all documents
+         for doc in self.documents:
+             chunks = self._chunk_document(doc, config.chunk_size, config.chunk_overlap)
+             all_chunks.extend(chunks)
+
+         if not all_chunks:
+             return
+
+         # Batch embed all chunks at once (single API call)
+         texts = [chunk.content for chunk in all_chunks]
+         responses = self.provider.embed_batch(texts, config.embedding_model)
+
+         for chunk, response in zip(all_chunks, responses, strict=True):
+             chunk.embedding = response.embedding
+
+         self.vector_store.add(all_chunks)
+
+     def _retrieve(self, query: str, config: RAGConfig) -> list[Chunk]:
+         """Retrieve relevant chunks for a query."""
+         query_response = self.provider.embed(query, config.embedding_model)
+         results = self.vector_store.search(query_response.embedding, top_k=config.num_chunks)
+         return [chunk for chunk, _ in results]
+
+     def _generate(self, question: str, context: str, config: RAGConfig) -> str:
+         """Generate answer using RAG."""
+         system_prompt = """You are a helpful assistant. Answer questions based ONLY on the provided context.
+ If the context doesn't contain enough information, say so. Be concise and accurate."""
+
+         prompt = f"""Context:
+ {context}
+
+ Question: {question}
+
+ Answer:"""
+
+         response = self.provider.generate(
+             prompt=prompt,
+             model=config.llm_model,
+             system_prompt=system_prompt,
+             temperature=0.7,
+         )
+         return response.text
+
+     def _evaluate_response(
+         self,
+         question: str,
+         generated: str,
+         ground_truth: str,
+         context: str,
+         config: RAGConfig,
+     ) -> EvaluationScores:
+         """Evaluate a RAG response using LLM-as-judge."""
+
+         def extract_score(response: str) -> float:
+             """Extract numeric score from LLM response."""
+             try:
+                 # Collect digits and decimal points from the response
+                 nums = "".join(c for c in response if c.isdigit() or c == ".")
+                 if nums:
+                     score = float(nums.split(".")[0])  # Take integer part
+                     return min(100, max(0, score)) / 100
+             except (ValueError, IndexError):
+                 pass
+             return 0.5
+
+         # Evaluate answer correctness
+         correctness_prompt = f"""Rate how correct this answer is compared to ground truth (0-100):
+
+ Question: {question}
+ Ground Truth: {ground_truth}
+ Generated Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self.provider.generate(correctness_prompt, config.llm_model)
+         correctness = extract_score(resp.text)
+
+         # Evaluate context relevance
+         relevance_prompt = f"""Rate how relevant this context is for answering the question (0-100):
+
+ Question: {question}
+ Context: {context[:1000]}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self.provider.generate(relevance_prompt, config.llm_model)
+         relevance = extract_score(resp.text)
+
+         # Evaluate faithfulness
+         faithfulness_prompt = f"""Rate if this answer is grounded in the context (0-100):
+
+ Context: {context[:1000]}
+ Answer: {generated}
+
+ Respond with ONLY a number 0-100."""
+
+         resp = self.provider.generate(faithfulness_prompt, config.llm_model)
+         faithfulness = extract_score(resp.text)
+
+         return EvaluationScores(
+             answer_correctness=correctness,
+             context_relevance=relevance,
+             faithfulness=faithfulness,
+         )
+
+     def evaluate_config(self, config: RAGConfig, verbose: bool = False) -> EvaluationResult:
+         """
+         Evaluate a single RAG configuration.
+
+         Parameters
+         ----------
+         config : RAGConfig
+             Configuration to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         EvaluationResult
+             Evaluation results for this configuration.
+         """
+         if verbose:
+             print(f"\nEvaluating {config.name}:")
+             print(f" chunk_size={config.chunk_size}, overlap={config.chunk_overlap}, num_chunks={config.num_chunks}")
+
+         start_time = time.time()
+
+         # Build index
+         self._build_index(config)
+
+         # Evaluate on benchmark
+         all_scores = []
+
+         for qa in self.benchmark:
+             # Retrieve
+             chunks = self._retrieve(qa.question, config)
+             context = "\n\n".join(f"[{c.doc_id}]: {c.content}" for c in chunks)
+
+             # Generate
+             answer = self._generate(qa.question, context, config)
+
+             # Evaluate
+             scores = self._evaluate_response(qa.question, answer, qa.ground_truth, context, config)
+             all_scores.append(scores)
+
+         # Aggregate scores across all benchmark questions
+         avg_correctness = np.mean([s.answer_correctness for s in all_scores])
+         avg_relevance = np.mean([s.context_relevance for s in all_scores])
+         avg_faithfulness = np.mean([s.faithfulness for s in all_scores])
+         combined = float(np.mean([s.combined_score for s in all_scores]))
+
+         execution_time = time.time() - start_time
+
+         if verbose:
+             print(
+                 f" Scores: correctness={avg_correctness:.2f}, "
+                 f"relevance={avg_relevance:.2f}, faithfulness={avg_faithfulness:.2f}"
+             )
+             print(f" Combined: {combined:.3f} | Time: {execution_time:.1f}s")
+
+         return EvaluationResult(
+             pattern_name=config.name,
+             indexing_params={
+                 "chunk_size": config.chunk_size,
+                 "chunk_overlap": config.chunk_overlap,
+                 "embedding_model": config.embedding_model,
+             },
+             inference_params={
+                 "num_chunks": config.num_chunks,
+                 "llm_model": config.llm_model,
+             },
+             scores={
+                 "answer_correctness": {"mean": float(avg_correctness)},
+                 "context_relevance": {"mean": float(avg_relevance)},
+                 "faithfulness": {"mean": float(avg_faithfulness)},
+             },
+             execution_time=execution_time,
+             final_score=float(combined),
+         )
+
+     def run(
+         self,
+         configs: list[RAGConfig] | None = None,
+         max_configs: int | None = None,
+         verbose: bool = True,
+     ) -> list[EvaluationResult]:
+         """
+         Run the RAG optimization experiment.
+
+         Parameters
+         ----------
+         configs : list[RAGConfig], optional
+             Configurations to evaluate. If None, uses default search space.
+         max_configs : int, optional
+             Maximum number of configurations to evaluate.
+         verbose : bool
+             Print progress information.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Results sorted by combined score (best first).
+         """
+         if configs is None:
+             configs = self.define_search_space()
+
+         if max_configs:
+             configs = configs[:max_configs]
+
+         if verbose:
+             print("=" * 60)
+             print("RAGIT: RAG Optimization Experiment")
+             print("=" * 60)
+             print(f"Configurations to test: {len(configs)}")
+             print(f"Documents: {len(self.documents)}")
+             print(f"Benchmark questions: {len(self.benchmark)}")
+             print()
+
+         self.results = []
+
+         for cfg in tqdm(configs, desc="Evaluating configs", disable=not verbose):
+             result = self.evaluate_config(cfg, verbose=verbose)
+             self.results.append(result)
+
+         # Sort by combined score (best first)
+         self.results.sort(key=lambda x: x.final_score, reverse=True)
+
+         if verbose:
+             print("\n" + "=" * 60)
+             print("RESULTS (sorted by score)")
+             print("=" * 60)
+             for i, result in enumerate(self.results[:5], 1):
+                 print(f"{i}. {result.pattern_name}: {result.final_score:.3f}")
+                 print(
+                     f" chunk_size={result.indexing_params['chunk_size']}, "
+                     f"num_chunks={result.inference_params['num_chunks']}"
+                 )
+
+         return self.results
+
+     def get_best_config(self) -> EvaluationResult | None:
+         """Get the best configuration from results."""
+         if not self.results:
+             return None
+         return self.results[0]
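
For orientation, here is a minimal usage sketch of the experiment API added above. It assumes the module is importable as ragit.core.experiment.experiment (the diff does not show file names) and that an Ollama server with the configured default models is reachable; the documents, questions, and ground truths are placeholders.

    # Usage sketch. Assumptions: the import path below and a running Ollama
    # instance serving the configured models; neither is confirmed by this diff.
    from ragit.core.experiment.experiment import (
        BenchmarkQuestion,
        Document,
        RagitExperiment,
    )

    documents = [
        Document(id="doc1", content="Ragit optimizes RAG hyperparameters."),
        Document(id="doc2", content="Chunk size and overlap affect retrieval quality."),
    ]
    benchmark = [
        BenchmarkQuestion(
            question="What does Ragit optimize?",
            ground_truth="RAG hyperparameters such as chunk size and retrieval depth.",
            relevant_doc_ids=["doc1"],
        ),
    ]

    experiment = RagitExperiment(documents, benchmark)

    # Keep the grid small: 2 chunk sizes x 1 overlap x 2 retrieval depths = 4 configs;
    # define_search_space() silently drops any combination where overlap >= chunk_size.
    configs = experiment.define_search_space(
        chunk_sizes=[256, 512],
        chunk_overlaps=[50],
        num_chunks_options=[2, 3],
    )

    results = experiment.run(configs=configs, verbose=True)
    best = experiment.get_best_config()
    if best is not None:
        print(best.pattern_name, best.final_score)

Each configuration triggers a full re-index (chunk, batch-embed, store) followed by retrieval, generation, and LLM-as-judge scoring over every benchmark question, so the runtime grows with the product of configurations and questions.
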
@@ -0,0 +1,131 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit experiment results.
+ """
+
+ from collections.abc import Iterator
+ from dataclasses import asdict, dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class EvaluationResult:
+     """
+     Result from evaluating a single RAG configuration.
+
+     Parameters
+     ----------
+     pattern_name : str
+         Name of the RAG pattern (e.g., "Pattern_1").
+     indexing_params : dict[str, Any]
+         Hyperparameters used during indexing (chunk_size, overlap, etc.).
+     inference_params : dict[str, Any]
+         Hyperparameters used during inference (num_chunks, llm_model, etc.).
+     scores : dict[str, dict]
+         Evaluation scores (answer_correctness, context_relevance, faithfulness).
+     execution_time : float
+         Time taken for evaluation in seconds.
+     final_score : float
+         Combined score for optimization ranking.
+     """
+
+     pattern_name: str
+     indexing_params: dict[str, Any]
+     inference_params: dict[str, Any]
+     scores: dict[str, dict[str, float]]
+     execution_time: float
+     final_score: float
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary."""
+         return asdict(self)
+
+     def __repr__(self) -> str:
+         return (
+             f"EvaluationResult(name={self.pattern_name}, score={self.final_score:.3f}, time={self.execution_time:.1f}s)"
+         )
+
+
+ @dataclass
+ class ExperimentResults:
+     """
+     Collection of evaluation results from an optimization experiment.
+
+     Attributes
+     ----------
+     evaluations : list[EvaluationResult]
+         All evaluation results.
+     """
+
+     evaluations: list[EvaluationResult] = field(default_factory=list)
+
+     def __len__(self) -> int:
+         return len(self.evaluations)
+
+     def __iter__(self) -> Iterator[EvaluationResult]:
+         yield from self.evaluations
+
+     def __bool__(self) -> bool:
+         return bool(self.evaluations)
+
+     def add(self, result: EvaluationResult) -> None:
+         """Add an evaluation result."""
+         self.evaluations.append(result)
+
+     def is_cached(
+         self,
+         indexing_params: dict[str, Any],
+         inference_params: dict[str, Any],
+     ) -> float | None:
+         """
+         Check if this configuration was already evaluated.
+
+         Returns
+         -------
+         float or None
+             Final score if cached, None otherwise.
+         """
+         for ev in self.evaluations:
+             if ev.indexing_params == indexing_params and ev.inference_params == inference_params:
+                 return ev.final_score
+         return None
+
+     @property
+     def scores(self) -> list[float]:
+         """All final scores."""
+         return [ev.final_score for ev in self.evaluations]
+
+     def sorted(self, reverse: bool = True) -> list[EvaluationResult]:
+         """
+         Get results sorted by final score.
+
+         Parameters
+         ----------
+         reverse : bool
+             If True (default), best scores first.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Sorted results.
+         """
+         return sorted(self.evaluations, key=lambda x: x.final_score, reverse=reverse)
+
+     def get_best(self, k: int = 1) -> list[EvaluationResult]:
+         """
+         Get k best results.
+
+         Parameters
+         ----------
+         k : int
+             Number of results to return.
+
+         Returns
+         -------
+         list[EvaluationResult]
+             Top k results by score.
+         """
+         return self.sorted()[:k]
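
ExperimentResults is not referenced by the experiment module in the first file (RagitExperiment.run() returns a plain sorted list), but it provides a small container for accumulating results and skipping repeat evaluations. A hedged sketch follows; it assumes both classes live in ragit.core.experiment.results (only EvaluationResult is imported from that path above), and the parameter values and model names are illustrative, not taken from this diff.

    # Sketch of the ExperimentResults container. Values and model names are made up.
    from ragit.core.experiment.results import EvaluationResult, ExperimentResults

    indexing = {"chunk_size": 256, "chunk_overlap": 50, "embedding_model": "nomic-embed-text"}
    inference = {"num_chunks": 3, "llm_model": "llama3"}

    collection = ExperimentResults()
    collection.add(
        EvaluationResult(
            pattern_name="Pattern_1",
            indexing_params=indexing,
            inference_params=inference,
            scores={"answer_correctness": {"mean": 0.82}},
            execution_time=12.4,
            final_score=0.79,
        )
    )

    # is_cached() returns the stored final score for an already-evaluated
    # configuration, or None if that parameter combination has not been seen.
    if collection.is_cached(indexing, inference) is None:
        pass  # evaluate the configuration, then collection.add(...) the result

    print(len(collection), collection.scores)
    print(collection.get_best(k=1)[0])  # uses EvaluationResult.__repr__

Because is_cached() compares the parameter dictionaries by equality, the caller must pass dictionaries with the same keys and values as those stored on the cached result for a hit to occur.
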