ragit-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragit/__init__.py +61 -0
- ragit/core/__init__.py +5 -0
- ragit/core/experiment/__init__.py +22 -0
- ragit/core/experiment/experiment.py +468 -0
- ragit/core/experiment/results.py +132 -0
- ragit/providers/__init__.py +20 -0
- ragit/providers/base.py +146 -0
- ragit/providers/ollama.py +250 -0
- ragit/utils/__init__.py +105 -0
- ragit/version.py +5 -0
- ragit-0.0.1.dist-info/METADATA +83 -0
- ragit-0.0.1.dist-info/RECORD +15 -0
- ragit-0.0.1.dist-info/WHEEL +5 -0
- ragit-0.0.1.dist-info/licenses/LICENSE +201 -0
- ragit-0.0.1.dist-info/top_level.txt +1 -0
ragit/__init__.py
ADDED
@@ -0,0 +1,61 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Ragit - Automatic RAG Pattern Optimization Engine

A tool for automatically finding optimal hyperparameters for RAG
(Retrieval-Augmented Generation) pipelines.

Supports:
- Ollama (local LLM and embeddings)
- Future: Gemini, Claude, OpenAI

Example
-------
>>> from ragit import RagitExperiment, Document, BenchmarkQuestion, OllamaProvider
>>>
>>> docs = [Document(id="doc1", content="Machine learning is...")]
>>> benchmark = [BenchmarkQuestion(question="What is ML?", ground_truth="...")]
>>>
>>> experiment = RagitExperiment(docs, benchmark)
>>> results = experiment.run()
>>> print(results[0])  # Best configuration
"""

import logging
import os

from ragit.version import __version__

# Set up logging
logger = logging.getLogger("ragit")
logger.setLevel(os.getenv("RAGIT_LOG_LEVEL", "INFO"))

if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Public API
from ragit.providers import OllamaProvider
from ragit.core.experiment.experiment import (
    RagitExperiment,
    Document,
    BenchmarkQuestion,
    RAGConfig,
)
from ragit.core.experiment.results import EvaluationResult, ExperimentResults

__all__ = [
    "__version__",
    "RagitExperiment",
    "Document",
    "BenchmarkQuestion",
    "RAGConfig",
    "EvaluationResult",
    "ExperimentResults",
    "OllamaProvider",
]
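
A slightly fuller version of the quickstart from the module docstring above, as a sketch rather than part of the package: it assumes a local Ollama server is reachable with the default models from define_search_space (nomic-embed-text for embeddings, qwen3-vl:235b-instruct-cloud for generation) already pulled, and the document and question text below is placeholder content.

from ragit import RagitExperiment, Document, BenchmarkQuestion

# Placeholder knowledge base and benchmark (illustrative content only).
docs = [
    Document(id="doc1", content="Machine learning is the study of algorithms that improve with data."),
    Document(id="doc2", content="Retrieval-Augmented Generation grounds LLM answers in retrieved context."),
]
benchmark = [
    BenchmarkQuestion(
        question="What is machine learning?",
        ground_truth="The study of algorithms that improve automatically through data.",
        relevant_doc_ids=["doc1"],
    ),
]

# Uses the default OllamaProvider(); requires a reachable Ollama server.
experiment = RagitExperiment(docs, benchmark)
results = experiment.run(max_configs=2)  # limit the grid for a quick smoke test
print(results[0])                        # best EvaluationResult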
ragit/core/experiment/__init__.py
ADDED
@@ -0,0 +1,22 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""Ragit experiment module."""

from ragit.core.experiment.experiment import (
    RagitExperiment,
    Document,
    BenchmarkQuestion,
    RAGConfig,
)
from ragit.core.experiment.results import EvaluationResult, ExperimentResults

__all__ = [
    "RagitExperiment",
    "Document",
    "BenchmarkQuestion",
    "RAGConfig",
    "EvaluationResult",
    "ExperimentResults",
]

ragit/core/experiment/experiment.py
ADDED
@@ -0,0 +1,468 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Ragit Experiment - Core RAG optimization engine.

This module provides the main experiment class for optimizing RAG hyperparameters.
"""

import time
from dataclasses import dataclass, field
from itertools import product
from typing import Optional

import numpy as np
from tqdm import tqdm

from ragit.providers import OllamaProvider
from ragit.core.experiment.results import EvaluationResult


@dataclass
class RAGConfig:
    """Configuration for a RAG pattern."""
    name: str
    chunk_size: int
    chunk_overlap: int
    num_chunks: int  # Number of chunks to retrieve
    embedding_model: str
    llm_model: str


@dataclass
class Document:
    """A document in the knowledge base."""
    id: str
    content: str
    metadata: dict = field(default_factory=dict)


@dataclass
class Chunk:
    """A document chunk."""
    content: str
    doc_id: str
    chunk_index: int
    embedding: Optional[list[float]] = None


@dataclass
class BenchmarkQuestion:
    """A benchmark question for evaluation."""
    question: str
    ground_truth: str
    relevant_doc_ids: list[str] = field(default_factory=list)


@dataclass
class EvaluationScores:
    """Scores from evaluating a RAG response."""
    answer_correctness: float
    context_relevance: float
    faithfulness: float

    @property
    def combined_score(self) -> float:
        """Combined score (weighted average)."""
        return 0.4 * self.answer_correctness + 0.3 * self.context_relevance + 0.3 * self.faithfulness


class SimpleVectorStore:
    """Simple in-memory vector store."""

    def __init__(self):
        self.chunks: list[Chunk] = []

    def add(self, chunks: list[Chunk]):
        """Add chunks to the store."""
        self.chunks.extend(chunks)

    def clear(self):
        """Clear all chunks."""
        self.chunks = []

    def search(self, query_embedding: list[float], top_k: int = 5) -> list[tuple[Chunk, float]]:
        """Search for similar chunks."""
        if not self.chunks:
            return []

        scores = []
        query = np.array(query_embedding)

        for chunk in self.chunks:
            if chunk.embedding:
                chunk_emb = np.array(chunk.embedding)
                # Cosine similarity
                similarity = np.dot(query, chunk_emb) / (np.linalg.norm(query) * np.linalg.norm(chunk_emb))
                scores.append((chunk, float(similarity)))

        # Sort by similarity descending
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]


class RagitExperiment:
    """
    Ragit Experiment - Automatic RAG Hyperparameter Optimization.

    This class orchestrates the optimization of RAG pipeline hyperparameters
    by systematically evaluating different configurations.

    Parameters
    ----------
    documents : list[Document]
        Documents to use as the knowledge base.
    benchmark : list[BenchmarkQuestion]
        Benchmark questions for evaluation.
    provider : OllamaProvider, optional
        LLM/Embedding provider. Defaults to OllamaProvider().

    Examples
    --------
    >>> documents = [Document(id="doc1", content="...")]
    >>> benchmark = [BenchmarkQuestion(question="...", ground_truth="...")]
    >>> experiment = RagitExperiment(documents, benchmark)
    >>> results = experiment.run()
    >>> print(results[0].pattern_name)  # Best configuration
    """

    def __init__(
        self,
        documents: list[Document],
        benchmark: list[BenchmarkQuestion],
        provider: Optional[OllamaProvider] = None,
    ):
        self.documents = documents
        self.benchmark = benchmark
        self.provider = provider or OllamaProvider()
        self.vector_store = SimpleVectorStore()
        self.results: list[EvaluationResult] = []

    def define_search_space(
        self,
        chunk_sizes: list[int] = None,
        chunk_overlaps: list[int] = None,
        num_chunks_options: list[int] = None,
        embedding_models: list[str] = None,
        llm_models: list[str] = None,
    ) -> list[RAGConfig]:
        """
        Define the hyperparameter search space.

        Parameters
        ----------
        chunk_sizes : list[int], optional
            Chunk sizes to test. Default: [256, 512]
        chunk_overlaps : list[int], optional
            Chunk overlaps to test. Default: [50, 100]
        num_chunks_options : list[int], optional
            Number of chunks to retrieve. Default: [2, 3]
        embedding_models : list[str], optional
            Embedding models to test. Default: ["nomic-embed-text"]
        llm_models : list[str], optional
            LLM models to test. Default: ["qwen3-vl:235b-instruct-cloud"]

        Returns
        -------
        list[RAGConfig]
            List of configurations to evaluate.
        """
        chunk_sizes = chunk_sizes or [256, 512]
        chunk_overlaps = chunk_overlaps or [50, 100]
        num_chunks_options = num_chunks_options or [2, 3]
        embedding_models = embedding_models or ["nomic-embed-text"]
        llm_models = llm_models or ["qwen3-vl:235b-instruct-cloud"]

        configs = []
        pattern_num = 1

        for cs, co, nc, em, lm in product(
            chunk_sizes, chunk_overlaps, num_chunks_options, embedding_models, llm_models
        ):
            # Ensure overlap is less than chunk size
            if co >= cs:
                continue

            configs.append(RAGConfig(
                name=f"Pattern_{pattern_num}",
                chunk_size=cs,
                chunk_overlap=co,
                num_chunks=nc,
                embedding_model=em,
                llm_model=lm,
            ))
            pattern_num += 1

        return configs

    def _chunk_document(self, doc: Document, chunk_size: int, overlap: int) -> list[Chunk]:
        """Split document into overlapping chunks."""
        chunks = []
        text = doc.content
        start = 0
        chunk_idx = 0

        while start < len(text):
            end = start + chunk_size
            chunk_text = text[start:end].strip()

            if chunk_text:
                chunks.append(Chunk(
                    content=chunk_text,
                    doc_id=doc.id,
                    chunk_index=chunk_idx,
                ))
                chunk_idx += 1

            start = end - overlap
            if start >= len(text) - overlap:
                break

        return chunks

    def _build_index(self, config: RAGConfig) -> None:
        """Build vector index with given configuration."""
        self.vector_store.clear()
        all_chunks = []

        # Chunk all documents
        for doc in self.documents:
            chunks = self._chunk_document(doc, config.chunk_size, config.chunk_overlap)
            all_chunks.extend(chunks)

        # Embed all chunks
        for chunk in all_chunks:
            response = self.provider.embed(chunk.content, config.embedding_model)
            chunk.embedding = response.embedding

        self.vector_store.add(all_chunks)

    def _retrieve(self, query: str, config: RAGConfig) -> list[Chunk]:
        """Retrieve relevant chunks for a query."""
        query_response = self.provider.embed(query, config.embedding_model)
        results = self.vector_store.search(query_response.embedding, top_k=config.num_chunks)
        return [chunk for chunk, _ in results]

    def _generate(self, question: str, context: str, config: RAGConfig) -> str:
        """Generate answer using RAG."""
        system_prompt = """You are a helpful assistant. Answer questions based ONLY on the provided context.
If the context doesn't contain enough information, say so. Be concise and accurate."""

        prompt = f"""Context:
{context}

Question: {question}

Answer:"""

        response = self.provider.generate(
            prompt=prompt,
            model=config.llm_model,
            system_prompt=system_prompt,
            temperature=0.7,
        )
        return response.text

    def _evaluate_response(
        self,
        question: str,
        generated: str,
        ground_truth: str,
        context: str,
        config: RAGConfig,
    ) -> EvaluationScores:
        """Evaluate a RAG response using LLM-as-judge."""

        def extract_score(response: str) -> float:
            """Extract numeric score from LLM response."""
            try:
                # Find first number in response
                nums = ''.join(c for c in response if c.isdigit() or c == '.')
                if nums:
                    score = float(nums.split('.')[0])  # Take integer part
                    return min(100, max(0, score)) / 100
            except:
                pass
            return 0.5

        # Evaluate answer correctness
        correctness_prompt = f"""Rate how correct this answer is compared to ground truth (0-100):

Question: {question}
Ground Truth: {ground_truth}
Generated Answer: {generated}

Respond with ONLY a number 0-100."""

        resp = self.provider.generate(correctness_prompt, config.llm_model)
        correctness = extract_score(resp.text)

        # Evaluate context relevance
        relevance_prompt = f"""Rate how relevant this context is for answering the question (0-100):

Question: {question}
Context: {context[:1000]}

Respond with ONLY a number 0-100."""

        resp = self.provider.generate(relevance_prompt, config.llm_model)
        relevance = extract_score(resp.text)

        # Evaluate faithfulness
        faithfulness_prompt = f"""Rate if this answer is grounded in the context (0-100):

Context: {context[:1000]}
Answer: {generated}

Respond with ONLY a number 0-100."""

        resp = self.provider.generate(faithfulness_prompt, config.llm_model)
        faithfulness = extract_score(resp.text)

        return EvaluationScores(
            answer_correctness=correctness,
            context_relevance=relevance,
            faithfulness=faithfulness,
        )

    def evaluate_config(self, config: RAGConfig, verbose: bool = False) -> EvaluationResult:
        """
        Evaluate a single RAG configuration.

        Parameters
        ----------
        config : RAGConfig
            Configuration to evaluate.
        verbose : bool
            Print progress information.

        Returns
        -------
        EvaluationResult
            Evaluation results for this configuration.
        """
        if verbose:
            print(f"\nEvaluating {config.name}:")
            print(f"  chunk_size={config.chunk_size}, overlap={config.chunk_overlap}, "
                  f"num_chunks={config.num_chunks}")

        start_time = time.time()

        # Build index
        self._build_index(config)

        # Evaluate on benchmark
        all_scores = []

        for qa in self.benchmark:
            # Retrieve
            chunks = self._retrieve(qa.question, config)
            context = "\n\n".join([f"[{c.doc_id}]: {c.content}" for c in chunks])

            # Generate
            answer = self._generate(qa.question, context, config)

            # Evaluate
            scores = self._evaluate_response(
                qa.question, answer, qa.ground_truth, context, config
            )
            all_scores.append(scores)

        # Aggregate scores
        avg_correctness = np.mean([s.answer_correctness for s in all_scores])
        avg_relevance = np.mean([s.context_relevance for s in all_scores])
        avg_faithfulness = np.mean([s.faithfulness for s in all_scores])
        combined = np.mean([s.combined_score for s in all_scores])

        execution_time = time.time() - start_time

        if verbose:
            print(f"  Scores: correctness={avg_correctness:.2f}, "
                  f"relevance={avg_relevance:.2f}, faithfulness={avg_faithfulness:.2f}")
            print(f"  Combined: {combined:.3f} | Time: {execution_time:.1f}s")

        return EvaluationResult(
            pattern_name=config.name,
            indexing_params={
                "chunk_size": config.chunk_size,
                "chunk_overlap": config.chunk_overlap,
                "embedding_model": config.embedding_model,
            },
            inference_params={
                "num_chunks": config.num_chunks,
                "llm_model": config.llm_model,
            },
            scores={
                "answer_correctness": {"mean": avg_correctness},
                "context_relevance": {"mean": avg_relevance},
                "faithfulness": {"mean": avg_faithfulness},
            },
            execution_time=execution_time,
            final_score=combined,
        )

    def run(
        self,
        configs: list[RAGConfig] = None,
        max_configs: int = None,
        verbose: bool = True,
    ) -> list[EvaluationResult]:
        """
        Run the RAG optimization experiment.

        Parameters
        ----------
        configs : list[RAGConfig], optional
            Configurations to evaluate. If None, uses default search space.
        max_configs : int, optional
            Maximum number of configurations to evaluate.
        verbose : bool
            Print progress information.

        Returns
        -------
        list[EvaluationResult]
            Results sorted by combined score (best first).
        """
        if configs is None:
            configs = self.define_search_space()

        if max_configs:
            configs = configs[:max_configs]

        if verbose:
            print("=" * 60)
            print("RAGIT: RAG Optimization Experiment")
            print("=" * 60)
            print(f"Configurations to test: {len(configs)}")
            print(f"Documents: {len(self.documents)}")
            print(f"Benchmark questions: {len(self.benchmark)}")
            print()

        self.results = []

        for config in tqdm(configs, desc="Evaluating configs", disable=not verbose):
            result = self.evaluate_config(config, verbose=verbose)
            self.results.append(result)

        # Sort by combined score (best first)
        self.results.sort(key=lambda x: x.final_score, reverse=True)

        if verbose:
            print("\n" + "=" * 60)
            print("RESULTS (sorted by score)")
            print("=" * 60)
            for i, result in enumerate(self.results[:5], 1):
                print(f"{i}. {result.pattern_name}: {result.final_score:.3f}")
                print(f"   chunk_size={result.indexing_params['chunk_size']}, "
                      f"num_chunks={result.inference_params['num_chunks']}")

        return self.results

    def get_best_config(self) -> Optional[EvaluationResult]:
        """Get the best configuration from results."""
        if not self.results:
            return None
        return self.results[0]
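
The search space built by define_search_space is the Cartesian product of the supplied lists, with any combination where chunk_overlap >= chunk_size dropped, and run() simply evaluates each RAGConfig and sorts by final_score. A minimal sketch of driving those pieces explicitly (same Ollama assumption as the earlier sketch; the llama3.1 model name is an assumption, substitute any model you have pulled):

# Sketch: build an explicit search space and evaluate only the first config.
configs = experiment.define_search_space(
    chunk_sizes=[256, 512, 1024],
    chunk_overlaps=[0, 64],
    num_chunks_options=[3],
    embedding_models=["nomic-embed-text"],
    llm_models=["llama3.1"],   # assumption: any locally pulled Ollama model
)
# 3 sizes x 2 overlaps x 1 x 1 x 1 = 6 combinations; none filtered since both overlaps < 256.
print(len(configs))

single = experiment.evaluate_config(configs[0], verbose=True)
print(single.final_score, single.indexing_params, single.inference_params)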

ragit/core/experiment/results.py
ADDED
@@ -0,0 +1,132 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Ragit experiment results.
"""

from dataclasses import asdict, dataclass, field
from typing import Any, Optional


@dataclass
class EvaluationResult:
    """
    Result from evaluating a single RAG configuration.

    Parameters
    ----------
    pattern_name : str
        Name of the RAG pattern (e.g., "Pattern_1").
    indexing_params : dict[str, Any]
        Hyperparameters used during indexing (chunk_size, overlap, etc.).
    inference_params : dict[str, Any]
        Hyperparameters used during inference (num_chunks, llm_model, etc.).
    scores : dict[str, dict]
        Evaluation scores (answer_correctness, context_relevance, faithfulness).
    execution_time : float
        Time taken for evaluation in seconds.
    final_score : float
        Combined score for optimization ranking.
    """

    pattern_name: str
    indexing_params: dict[str, Any]
    inference_params: dict[str, Any]
    scores: dict[str, dict]
    execution_time: float
    final_score: float

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)

    def __repr__(self) -> str:
        return (
            f"EvaluationResult(name={self.pattern_name}, "
            f"score={self.final_score:.3f}, "
            f"time={self.execution_time:.1f}s)"
        )


@dataclass
class ExperimentResults:
    """
    Collection of evaluation results from an optimization experiment.

    Attributes
    ----------
    evaluations : list[EvaluationResult]
        All evaluation results.
    """

    evaluations: list[EvaluationResult] = field(default_factory=list)

    def __len__(self) -> int:
        return len(self.evaluations)

    def __iter__(self):
        yield from self.evaluations

    def __bool__(self) -> bool:
        return bool(self.evaluations)

    def add(self, result: EvaluationResult) -> None:
        """Add an evaluation result."""
        self.evaluations.append(result)

    def is_cached(
        self,
        indexing_params: dict[str, Any],
        inference_params: dict[str, Any],
    ) -> Optional[float]:
        """
        Check if this configuration was already evaluated.

        Returns
        -------
        float or None
            Final score if cached, None otherwise.
        """
        for ev in self.evaluations:
            if ev.indexing_params == indexing_params and ev.inference_params == inference_params:
                return ev.final_score
        return None

    @property
    def scores(self) -> list[float]:
        """All final scores."""
        return [ev.final_score for ev in self.evaluations]

    def sorted(self, reverse: bool = True) -> list[EvaluationResult]:
        """
        Get results sorted by final score.

        Parameters
        ----------
        reverse : bool
            If True (default), best scores first.

        Returns
        -------
        list[EvaluationResult]
            Sorted results.
        """
        return sorted(self.evaluations, key=lambda x: x.final_score, reverse=reverse)

    def get_best(self, k: int = 1) -> list[EvaluationResult]:
        """
        Get k best results.

        Parameters
        ----------
        k : int
            Number of results to return.

        Returns
        -------
        list[EvaluationResult]
            Top k results by score.
        """
        return self.sorted()[:k]
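
ExperimentResults is exported but not used by RagitExperiment.run(), which returns a plain sorted list; the container can still be filled manually, and is_cached() does an exact equality check on both parameter dicts. A short sketch, reusing the results list from the run() sketch earlier:

from ragit import ExperimentResults

collection = ExperimentResults()
for r in results:                      # results from experiment.run() above
    collection.add(r)

best = collection.get_best(k=3)        # top 3 by final_score
print(len(collection), bool(collection))
print(collection.scores)

# Skip re-running a configuration that was already evaluated.
cached = collection.is_cached(best[0].indexing_params, best[0].inference_params)
if cached is not None:
    print(f"already evaluated, score={cached:.3f}")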

ragit/providers/__init__.py
ADDED
@@ -0,0 +1,20 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Ragit Providers - LLM and Embedding providers for RAG optimization.

Supported providers:
- Ollama (local)
- Future: Gemini, Claude, OpenAI
"""

from ragit.providers.base import BaseLLMProvider, BaseEmbeddingProvider
from ragit.providers.ollama import OllamaProvider

__all__ = [
    "BaseLLMProvider",
    "BaseEmbeddingProvider",
    "OllamaProvider",
]
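
base.py and ollama.py are not included in this section, so the provider contract is only visible indirectly: experiment.py calls provider.embed(text, model) and reads .embedding from the result, and calls provider.generate(prompt, model, system_prompt=..., temperature=...) (or positionally with two arguments) and reads .text. The stub below is a hypothetical stand-in that satisfies that duck-typed interface for offline smoke tests; the class and dataclass names are illustrative and are not the package's BaseLLMProvider / BaseEmbeddingProvider API.

from dataclasses import dataclass
from typing import Optional


@dataclass
class _Embedding:
    # Hypothetical stand-in for whatever response object ollama.py returns from embed().
    embedding: list[float]


@dataclass
class _Generation:
    # Hypothetical stand-in for whatever response object ollama.py returns from generate().
    text: str


class EchoProvider:
    """Toy offline provider; not part of ragit, only mirrors the calls experiment.py makes."""

    def embed(self, text: str, model: str) -> _Embedding:
        # Deterministic pseudo-embedding from character codes, padded to a fixed length.
        vec = [float(ord(c)) for c in text[:64]] or [1.0]
        vec += [0.0] * (64 - len(vec))
        return _Embedding(embedding=vec)

    def generate(self, prompt: str, model: str,
                 system_prompt: Optional[str] = None,
                 temperature: float = 0.0) -> _Generation:
        # Always answers "50", which extract_score() maps to 0.5.
        return _Generation(text="50")


# experiment = RagitExperiment(docs, benchmark, provider=EchoProvider())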