ragit-0.8-py3-none-any.whl → ragit-0.8.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragit/__init__.py +116 -2
- ragit/assistant.py +577 -0
- ragit/config.py +60 -0
- ragit/core/__init__.py +5 -0
- ragit/core/experiment/__init__.py +22 -0
- ragit/core/experiment/experiment.py +571 -0
- ragit/core/experiment/results.py +131 -0
- ragit/loaders.py +245 -0
- ragit/providers/__init__.py +47 -0
- ragit/providers/base.py +147 -0
- ragit/providers/function_adapter.py +237 -0
- ragit/providers/ollama.py +446 -0
- ragit/providers/sentence_transformers.py +225 -0
- ragit/utils/__init__.py +105 -0
- ragit/version.py +5 -0
- ragit-0.8.2.dist-info/METADATA +166 -0
- ragit-0.8.2.dist-info/RECORD +20 -0
- {ragit-0.8.dist-info → ragit-0.8.2.dist-info}/WHEEL +1 -1
- ragit-0.8.2.dist-info/licenses/LICENSE +201 -0
- {ragit-0.8.dist-info → ragit-0.8.2.dist-info}/top_level.txt +0 -0
- ragit/main.py +0 -354
- ragit-0.8.dist-info/LICENSE +0 -21
- ragit-0.8.dist-info/METADATA +0 -176
- ragit-0.8.dist-info/RECORD +0 -7
ragit/config.py
ADDED
@@ -0,0 +1,60 @@
+#
+# Copyright RODMENA LIMITED 2025
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+Ragit configuration management.
+
+Loads configuration from environment variables and .env files.
+
+Note: As of v0.8.0, ragit no longer has default LLM or embedding models.
+Users must explicitly configure providers.
+"""
+
+import os
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Load .env file from current working directory or project root
+_env_path = Path.cwd() / ".env"
+if _env_path.exists():
+    load_dotenv(_env_path)
+else:
+    # Try to find .env in parent directories
+    for parent in Path.cwd().parents:
+        _env_path = parent / ".env"
+        if _env_path.exists():
+            load_dotenv(_env_path)
+            break
+
+
+class Config:
+    """Ragit configuration loaded from environment variables.
+
+    Note: As of v0.8.0, DEFAULT_LLM_MODEL and DEFAULT_EMBEDDING_MODEL are
+    no longer used as defaults. They are only read from environment variables
+    for backwards compatibility with user configurations.
+    """
+
+    # Ollama LLM API Configuration (used when explicitly using OllamaProvider)
+    OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    OLLAMA_API_KEY: str | None = os.getenv("OLLAMA_API_KEY")
+    OLLAMA_TIMEOUT: int = int(os.getenv("OLLAMA_TIMEOUT", "120"))
+
+    # Ollama Embedding API Configuration
+    OLLAMA_EMBEDDING_URL: str = os.getenv(
+        "OLLAMA_EMBEDDING_URL", os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    )
+
+    # Model settings (only used if explicitly requested, no defaults)
+    # These can still be set via environment variables for convenience
+    DEFAULT_LLM_MODEL: str | None = os.getenv("RAGIT_DEFAULT_LLM_MODEL")
+    DEFAULT_EMBEDDING_MODEL: str | None = os.getenv("RAGIT_DEFAULT_EMBEDDING_MODEL")
+
+    # Logging
+    LOG_LEVEL: str = os.getenv("RAGIT_LOG_LEVEL", "INFO")
+
+
+# Singleton instance
+config = Config()
ragit/core/experiment/__init__.py
ADDED
@@ -0,0 +1,22 @@
+#
+# Copyright RODMENA LIMITED 2025
+# SPDX-License-Identifier: Apache-2.0
+#
+"""Ragit experiment module."""
+
+from ragit.core.experiment.experiment import (
+    BenchmarkQuestion,
+    Document,
+    RAGConfig,
+    RagitExperiment,
+)
+from ragit.core.experiment.results import EvaluationResult, ExperimentResults
+
+__all__ = [
+    "RagitExperiment",
+    "Document",
+    "BenchmarkQuestion",
+    "RAGConfig",
+    "EvaluationResult",
+    "ExperimentResults",
+]
ragit/core/experiment/experiment.py
ADDED
@@ -0,0 +1,571 @@
+#
+# Copyright RODMENA LIMITED 2025
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+Ragit Experiment - Core RAG optimization engine.
+
+This module provides the main experiment class for optimizing RAG hyperparameters.
+"""
+
+import time
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from itertools import product
+from typing import Any
+
+import numpy as np
+from tqdm import tqdm
+
+from ragit.core.experiment.results import EvaluationResult
+from ragit.providers.base import BaseEmbeddingProvider, BaseLLMProvider
+from ragit.providers.function_adapter import FunctionProvider
+
+
+@dataclass
+class RAGConfig:
+    """Configuration for a RAG pattern."""
+
+    name: str
+    chunk_size: int
+    chunk_overlap: int
+    num_chunks: int  # Number of chunks to retrieve
+    embedding_model: str
+    llm_model: str
+
+
+@dataclass
+class Document:
+    """A document in the knowledge base."""
+
+    id: str
+    content: str
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class Chunk:
+    """A document chunk."""
+
+    content: str
+    doc_id: str
+    chunk_index: int
+    embedding: tuple[float, ...] | list[float] | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class BenchmarkQuestion:
+    """A benchmark question for evaluation."""
+
+    question: str
+    ground_truth: str
+    relevant_doc_ids: list[str] = field(default_factory=list)
+
+
+@dataclass
+class EvaluationScores:
+    """Scores from evaluating a RAG response."""
+
+    answer_correctness: float
+    context_relevance: float
+    faithfulness: float
+
+    @property
+    def combined_score(self) -> float:
+        """Combined score (weighted average)."""
+        return 0.4 * self.answer_correctness + 0.3 * self.context_relevance + 0.3 * self.faithfulness
+
+
+class SimpleVectorStore:
+    """Simple in-memory vector store with pre-normalized embeddings for fast search.
+
+    Note: This class is NOT thread-safe.
+    """
+
+    def __init__(self) -> None:
+        self.chunks: list[Chunk] = []
+        self._embedding_matrix: np.ndarray[Any, np.dtype[np.float64]] | None = None  # Pre-normalized
+
+    def add(self, chunks: list[Chunk]) -> None:
+        """Add chunks to the store and rebuild pre-normalized embedding matrix."""
+        self.chunks.extend(chunks)
+        self._rebuild_matrix()
+
+    def _rebuild_matrix(self) -> None:
+        """Rebuild and pre-normalize the embedding matrix from chunks."""
+        embeddings = [c.embedding for c in self.chunks if c.embedding is not None]
+        if embeddings:
+            matrix = np.array(embeddings, dtype=np.float64)
+            # Pre-normalize for fast cosine similarity
+            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+            norms[norms == 0] = 1  # Avoid division by zero
+            self._embedding_matrix = matrix / norms
+        else:
+            self._embedding_matrix = None
+
+    def clear(self) -> None:
+        """Clear all chunks."""
+        self.chunks = []
+        self._embedding_matrix = None
+
+    def search(self, query_embedding: tuple[float, ...] | list[float], top_k: int = 5) -> list[tuple[Chunk, float]]:
+        """Search for similar chunks using pre-normalized cosine similarity."""
+        if not self.chunks or self._embedding_matrix is None:
+            return []
+
+        # Normalize query vector
+        query_vec = np.array(query_embedding, dtype=np.float64)
+        query_norm = np.linalg.norm(query_vec)
+        if query_norm == 0:
+            return []
+        query_normalized = query_vec / query_norm
+
+        # Fast cosine similarity: matrix is pre-normalized, just dot product
+        similarities = self._embedding_matrix @ query_normalized
+
+        # Get top_k indices efficiently
+        if len(similarities) <= top_k:
+            top_indices = np.argsort(similarities)[::-1]
+        else:
+            top_indices = np.argpartition(similarities, -top_k)[-top_k:]
+            top_indices = top_indices[np.argsort(similarities[top_indices])[::-1]]
+
+        return [(self.chunks[i], float(similarities[i])) for i in top_indices]
+
+
+class RagitExperiment:
+    """
+    Ragit Experiment - Automatic RAG Hyperparameter Optimization.
+
+    This class orchestrates the optimization of RAG pipeline hyperparameters
+    by systematically evaluating different configurations.
+
+    Parameters
+    ----------
+    documents : list[Document]
+        Documents to use as the knowledge base.
+    benchmark : list[BenchmarkQuestion]
+        Benchmark questions for evaluation.
+    embed_fn : Callable[[str], list[float]], optional
+        Function that takes text and returns an embedding vector.
+    generate_fn : Callable, optional
+        Function for text generation.
+    provider : BaseEmbeddingProvider, optional
+        Provider for embeddings and LLM. If embed_fn is provided, this is
+        ignored for embeddings but can be used for LLM.
+
+    Raises
+    ------
+    ValueError
+        If neither embed_fn nor provider is provided.
+
+    Examples
+    --------
+    >>> # With custom functions
+    >>> experiment = RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)
+    >>>
+    >>> # With explicit provider
+    >>> from ragit.providers import OllamaProvider
+    >>> experiment = RagitExperiment(docs, benchmark, provider=OllamaProvider())
+    >>>
+    >>> results = experiment.run()
+    >>> print(results[0].config)  # Best configuration
+    """
+
+    def __init__(
+        self,
+        documents: list[Document],
+        benchmark: list[BenchmarkQuestion],
+        embed_fn: Callable[[str], list[float]] | None = None,
+        generate_fn: Callable[..., str] | None = None,
+        provider: BaseEmbeddingProvider | BaseLLMProvider | None = None,
+    ):
+        self.documents = documents
+        self.benchmark = benchmark
+        self.vector_store = SimpleVectorStore()
+        self.results: list[EvaluationResult] = []
+
+        # Resolve provider from functions or explicit provider
+        self._embedding_provider: BaseEmbeddingProvider
+        self._llm_provider: BaseLLMProvider | None = None
+
+        if embed_fn is not None:
+            # Create FunctionProvider from provided functions
+            function_provider = FunctionProvider(
+                embed_fn=embed_fn,
+                generate_fn=generate_fn,
+            )
+            self._embedding_provider = function_provider
+            if generate_fn is not None:
+                self._llm_provider = function_provider
+            elif provider is not None and isinstance(provider, BaseLLMProvider):
+                self._llm_provider = provider
+        elif provider is not None:
+            if not isinstance(provider, BaseEmbeddingProvider):
+                raise ValueError(
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
+                )
+            self._embedding_provider = provider
+            if isinstance(provider, BaseLLMProvider):
+                self._llm_provider = provider
+        else:
+            raise ValueError(
+                "Must provide embed_fn or provider for embeddings. "
+                "Examples:\n"
+                "  RagitExperiment(docs, benchmark, embed_fn=my_embed, generate_fn=my_llm)\n"
+                "  RagitExperiment(docs, benchmark, provider=OllamaProvider())"
+            )
+
+        # LLM is required for evaluation
+        if self._llm_provider is None:
+            raise ValueError(
+                "RagitExperiment requires LLM for evaluation. Provide generate_fn or a provider with LLM support."
+            )
+
+    @property
+    def provider(self) -> BaseEmbeddingProvider:
+        """Return the embedding provider (for backwards compatibility)."""
+        return self._embedding_provider
+
+    def define_search_space(
+        self,
+        chunk_sizes: list[int] | None = None,
+        chunk_overlaps: list[int] | None = None,
+        num_chunks_options: list[int] | None = None,
+        embedding_models: list[str] | None = None,
+        llm_models: list[str] | None = None,
+    ) -> list[RAGConfig]:
+        """
+        Define the hyperparameter search space.
+
+        Parameters
+        ----------
+        chunk_sizes : list[int], optional
+            Chunk sizes to test. Default: [256, 512]
+        chunk_overlaps : list[int], optional
+            Chunk overlaps to test. Default: [50, 100]
+        num_chunks_options : list[int], optional
+            Number of chunks to retrieve. Default: [2, 3]
+        embedding_models : list[str], optional
+            Embedding models to test. Default: ["default"]
+        llm_models : list[str], optional
+            LLM models to test. Default: ["default"]
+
+        Returns
+        -------
+        list[RAGConfig]
+            List of configurations to evaluate.
+        """
+        chunk_sizes = chunk_sizes or [256, 512]
+        chunk_overlaps = chunk_overlaps or [50, 100]
+        num_chunks_options = num_chunks_options or [2, 3]
+        embedding_models = embedding_models or ["default"]
+        llm_models = llm_models or ["default"]
+
+        configs = []
+        pattern_num = 1
+
+        for cs, co, nc, em, lm in product(
+            chunk_sizes, chunk_overlaps, num_chunks_options, embedding_models, llm_models
+        ):
+            # Ensure overlap is less than chunk size
+            if co >= cs:
+                continue
+
+            configs.append(
+                RAGConfig(
+                    name=f"Pattern_{pattern_num}",
+                    chunk_size=cs,
+                    chunk_overlap=co,
+                    num_chunks=nc,
+                    embedding_model=em,
+                    llm_model=lm,
+                )
+            )
+            pattern_num += 1
+
+        return configs
+
+    def _chunk_document(self, doc: Document, chunk_size: int, overlap: int) -> list[Chunk]:
+        """Split document into overlapping chunks."""
+        chunks = []
+        text = doc.content
+        start = 0
+        chunk_idx = 0
+
+        while start < len(text):
+            end = start + chunk_size
+            chunk_text = text[start:end].strip()
+
+            if chunk_text:
+                chunks.append(
+                    Chunk(
+                        content=chunk_text,
+                        doc_id=doc.id,
+                        chunk_index=chunk_idx,
+                    )
+                )
+                chunk_idx += 1
+
+            start = end - overlap
+            if start >= len(text) - overlap:
+                break
+
+        return chunks
+
+    def _build_index(self, config: RAGConfig) -> None:
+        """Build vector index with given configuration using batch embedding."""
+        self.vector_store.clear()
+        all_chunks: list[Chunk] = []
+
+        # Chunk all documents
+        for doc in self.documents:
+            chunks = self._chunk_document(doc, config.chunk_size, config.chunk_overlap)
+            all_chunks.extend(chunks)
+
+        if not all_chunks:
+            return
+
+        # Batch embed all chunks at once (single API call)
+        texts = [chunk.content for chunk in all_chunks]
+        responses = self._embedding_provider.embed_batch(texts, config.embedding_model)
+
+        for chunk, response in zip(all_chunks, responses, strict=True):
+            chunk.embedding = response.embedding
+
+        self.vector_store.add(all_chunks)
+
+    def _retrieve(self, query: str, config: RAGConfig) -> list[Chunk]:
+        """Retrieve relevant chunks for a query."""
+        query_response = self._embedding_provider.embed(query, config.embedding_model)
+        results = self.vector_store.search(query_response.embedding, top_k=config.num_chunks)
+        return [chunk for chunk, _ in results]
+
+    def _generate(self, question: str, context: str, config: RAGConfig) -> str:
+        """Generate answer using RAG."""
+        if self._llm_provider is None:
+            raise ValueError("LLM provider is required for generation")
+
+        system_prompt = """You are a helpful assistant. Answer questions based ONLY on the provided context.
+If the context doesn't contain enough information, say so. Be concise and accurate."""
+
+        prompt = f"""Context:
+{context}
+
+Question: {question}
+
+Answer:"""
+
+        response = self._llm_provider.generate(
+            prompt=prompt,
+            model=config.llm_model,
+            system_prompt=system_prompt,
+            temperature=0.7,
+        )
+        return response.text
+
+    def _evaluate_response(
+        self,
+        question: str,
+        generated: str,
+        ground_truth: str,
+        context: str,
+        config: RAGConfig,
+    ) -> EvaluationScores:
+        """Evaluate a RAG response using LLM-as-judge."""
+        if self._llm_provider is None:
+            raise ValueError("LLM provider is required for evaluation")
+
+        def extract_score(response: str) -> float:
+            """Extract numeric score from LLM response."""
+            try:
+                # Find first number in response
+                nums = "".join(c for c in response if c.isdigit() or c == ".")
+                if nums:
+                    score = float(nums.split(".")[0])  # Take integer part
+                    return min(100, max(0, score)) / 100
+            except (ValueError, IndexError):
+                pass
+            return 0.5
+
+        # Evaluate answer correctness
+        correctness_prompt = f"""Rate how correct this answer is compared to ground truth (0-100):
+
+Question: {question}
+Ground Truth: {ground_truth}
+Generated Answer: {generated}
+
+Respond with ONLY a number 0-100."""
+
+        resp = self._llm_provider.generate(correctness_prompt, config.llm_model)
+        correctness = extract_score(resp.text)
+
+        # Evaluate context relevance
+        relevance_prompt = f"""Rate how relevant this context is for answering the question (0-100):
+
+Question: {question}
+Context: {context[:1000]}
+
+Respond with ONLY a number 0-100."""
+
+        resp = self._llm_provider.generate(relevance_prompt, config.llm_model)
+        relevance = extract_score(resp.text)
+
+        # Evaluate faithfulness
+        faithfulness_prompt = f"""Rate if this answer is grounded in the context (0-100):
+
+Context: {context[:1000]}
+Answer: {generated}
+
+Respond with ONLY a number 0-100."""
+
+        resp = self._llm_provider.generate(faithfulness_prompt, config.llm_model)
+        faithfulness = extract_score(resp.text)
+
+        return EvaluationScores(
+            answer_correctness=correctness,
+            context_relevance=relevance,
+            faithfulness=faithfulness,
+        )
+
+    def evaluate_config(self, config: RAGConfig, verbose: bool = False) -> EvaluationResult:
+        """
+        Evaluate a single RAG configuration.
+
+        Parameters
+        ----------
+        config : RAGConfig
+            Configuration to evaluate.
+        verbose : bool
+            Print progress information.
+
+        Returns
+        -------
+        EvaluationResult
+            Evaluation results for this configuration.
+        """
+        if verbose:
+            print(f"\nEvaluating {config.name}:")
+            print(f"  chunk_size={config.chunk_size}, overlap={config.chunk_overlap}, num_chunks={config.num_chunks}")
+
+        start_time = time.time()
+
+        # Build index
+        self._build_index(config)
+
+        # Evaluate on benchmark
+        all_scores = []
+
+        for qa in self.benchmark:
+            # Retrieve
+            chunks = self._retrieve(qa.question, config)
+            context = "\n\n".join(f"[{c.doc_id}]: {c.content}" for c in chunks)
+
+            # Generate
+            answer = self._generate(qa.question, context, config)
+
+            # Evaluate
+            scores = self._evaluate_response(qa.question, answer, qa.ground_truth, context, config)
+            all_scores.append(scores)
+
+        # Aggregate scores (use generators for memory efficiency)
+        avg_correctness = np.mean([s.answer_correctness for s in all_scores])
+        avg_relevance = np.mean([s.context_relevance for s in all_scores])
+        avg_faithfulness = np.mean([s.faithfulness for s in all_scores])
+        combined = float(np.mean([s.combined_score for s in all_scores]))
+
+        execution_time = time.time() - start_time
+
+        if verbose:
+            print(
+                f"  Scores: correctness={avg_correctness:.2f}, "
+                f"relevance={avg_relevance:.2f}, faithfulness={avg_faithfulness:.2f}"
+            )
+            print(f"  Combined: {combined:.3f} | Time: {execution_time:.1f}s")
+
+        return EvaluationResult(
+            pattern_name=config.name,
+            indexing_params={
+                "chunk_size": config.chunk_size,
+                "chunk_overlap": config.chunk_overlap,
+                "embedding_model": config.embedding_model,
+            },
+            inference_params={
+                "num_chunks": config.num_chunks,
+                "llm_model": config.llm_model,
+            },
+            scores={
+                "answer_correctness": {"mean": float(avg_correctness)},
+                "context_relevance": {"mean": float(avg_relevance)},
+                "faithfulness": {"mean": float(avg_faithfulness)},
+            },
+            execution_time=execution_time,
+            final_score=float(combined),
+        )
+
+    def run(
+        self,
+        configs: list[RAGConfig] | None = None,
+        max_configs: int | None = None,
+        verbose: bool = True,
+    ) -> list[EvaluationResult]:
+        """
+        Run the RAG optimization experiment.
+
+        Parameters
+        ----------
+        configs : list[RAGConfig], optional
+            Configurations to evaluate. If None, uses default search space.
+        max_configs : int, optional
+            Maximum number of configurations to evaluate.
+        verbose : bool
+            Print progress information.
+
+        Returns
+        -------
+        list[EvaluationResult]
+            Results sorted by combined score (best first).
+        """
+        if configs is None:
+            configs = self.define_search_space()
+
+        if max_configs:
+            configs = configs[:max_configs]
+
+        if verbose:
+            print("=" * 60)
+            print("RAGIT: RAG Optimization Experiment")
+            print("=" * 60)
+            print(f"Configurations to test: {len(configs)}")
+            print(f"Documents: {len(self.documents)}")
+            print(f"Benchmark questions: {len(self.benchmark)}")
+            print()
+
+        self.results = []
+
+        for cfg in tqdm(configs, desc="Evaluating configs", disable=not verbose):
+            result = self.evaluate_config(cfg, verbose=verbose)
+            self.results.append(result)
+
+        # Sort by combined score (best first)
+        self.results.sort(key=lambda x: x.final_score, reverse=True)
+
+        if verbose:
+            print("\n" + "=" * 60)
+            print("RESULTS (sorted by score)")
+            print("=" * 60)
+            for i, result in enumerate(self.results[:5], 1):
+                print(f"{i}. {result.pattern_name}: {result.final_score:.3f}")
+                print(
+                    f"   chunk_size={result.indexing_params['chunk_size']}, "
+                    f"num_chunks={result.inference_params['num_chunks']}"
+                )
+
+        return self.results
+
+    def get_best_config(self) -> EvaluationResult | None:
+        """Get the best configuration from results."""
+        if not self.results:
+            return None
+        return self.results[0]