ragmint 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragmint might be problematic; see the registry listing for details.
- {ragmint-0.4.0/src/ragmint.egg-info → ragmint-0.4.2}/PKG-INFO +19 -10
- {ragmint-0.4.0 → ragmint-0.4.2}/README.md +18 -8
- {ragmint-0.4.0 → ragmint-0.4.2}/pyproject.toml +1 -4
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/autotuner.py +33 -15
- ragmint-0.4.2/src/ragmint/explainer.py +88 -0
- ragmint-0.4.2/src/ragmint/leaderboard.py +51 -0
- ragmint-0.4.2/src/ragmint/tests/test_explainer.py +36 -0
- ragmint-0.4.2/src/ragmint/tests/test_leaderboard.py +92 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tuner.py +23 -2
- {ragmint-0.4.0 → ragmint-0.4.2/src/ragmint.egg-info}/PKG-INFO +19 -10
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/SOURCES.txt +0 -1
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/requires.txt +0 -1
- ragmint-0.4.0/src/ragmint/explainer.py +0 -63
- ragmint-0.4.0/src/ragmint/leaderboard.py +0 -45
- ragmint-0.4.0/src/ragmint/tests/test_explainer.py +0 -20
- ragmint-0.4.0/src/ragmint/tests/test_explainer_integration.py +0 -18
- ragmint-0.4.0/src/ragmint/tests/test_leaderboard.py +0 -39
- {ragmint-0.4.0 → ragmint-0.4.2}/LICENSE +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/MANIFEST.in +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/setup.cfg +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__main__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/chunking.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/embeddings.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/evaluation.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/pipeline.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/reranker.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/retriever.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/validation_qa.json +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/config_adapter.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/langchain_prebuilder.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/search.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/conftest.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_autotuner.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_config_adapter.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_embeddings.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_langchain_prebuilder.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_pipeline.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_retriever.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_search.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_tuner.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/__init__.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/caching.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/data_loader.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/logger.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/metrics.py +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/dependency_links.txt +0 -0
- {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/top_level.txt +0 -0
{ragmint-0.4.0/src/ragmint.egg-info → ragmint-0.4.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragmint
-Version: 0.4.0
+Version: 0.4.2
 Summary: A modular framework for evaluating and optimizing RAG pipelines.
 Author-email: Andre Oliveira <oandreoliveira@outlook.com>
 License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
 Requires-Dist: openai>=1.0.0
 Requires-Dist: google-generativeai>=0.8.0
 Requires-Dist: anthropic>=0.25.0
-Requires-Dist: supabase>=2.4.0
 Requires-Dist: pytest
 Requires-Dist: langchain>=0.2.5
 Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
 ```python
 from ragmint.leaderboard import Leaderboard

-
-
-
+# Initialize local leaderboard
+leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+# Retrieve top 5 runs
+print("\n🏅 Top 5 Experiments:")
+for result in leaderboard.top_results(limit=5):
+    print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
 ```

 ---

 ## 🧠 Explainability with Gemini / Claude

-Compare
+Compare RAG configurations and receive **natural language insights** on why one performs better.

 ```python
+from ragmint.autotuner import AutoRAGTuner
 from ragmint.explainer import explain_results

-
-
+tuner = AutoRAGTuner(docs_path="data/docs/")
+best, results = tuner.auto_tune(
+    validation_set='data/docs/validation_qa.json',
+    metric="faithfulness",
+    trials=5,
+    search_type='bayesian'
+)

-
-print(
+analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+print(analysis)
 ```

 > Set your API keys in a `.env` file or via environment variables:
{ragmint-0.4.0 → ragmint-0.4.2}/README.md
@@ -229,25 +229,35 @@ Track and visualize your best experiments across runs.
 ```python
 from ragmint.leaderboard import Leaderboard

-
-
-
+# Initialize local leaderboard
+leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+# Retrieve top 5 runs
+print("\n🏅 Top 5 Experiments:")
+for result in leaderboard.top_results(limit=5):
+    print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
 ```

 ---

 ## 🧠 Explainability with Gemini / Claude

-Compare
+Compare RAG configurations and receive **natural language insights** on why one performs better.

 ```python
+from ragmint.autotuner import AutoRAGTuner
 from ragmint.explainer import explain_results

-
-
+tuner = AutoRAGTuner(docs_path="data/docs/")
+best, results = tuner.auto_tune(
+    validation_set='data/docs/validation_qa.json',
+    metric="faithfulness",
+    trials=5,
+    search_type='bayesian'
+)

-
-print(
+analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+print(analysis)
 ```

 > Set your API keys in a `.env` file or via environment variables:
{ragmint-0.4.0 → ragmint-0.4.2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "ragmint"
-version = "0.4.0"
+version = "0.4.2"
 description = "A modular framework for evaluating and optimizing RAG pipelines."
 readme = "README.md"
 license = { text = "Apache License 2.0" }
@@ -40,9 +40,6 @@ dependencies = [
     "google-generativeai>=0.8.0",
     "anthropic>=0.25.0",

-    # Integration / storage
-    "supabase>=2.4.0",
-
     # Testing
     "pytest",

{ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/autotuner.py
@@ -63,7 +63,8 @@ class AutoRAGTuner:
     def suggest_chunk_sizes(
         self,
         model_name: Optional[str] = None,
-        num_pairs: Optional[int] = None
+        num_pairs: Optional[int] = None,
+        step: int = 10
     ) -> List[Tuple[int, int]]:
         if num_pairs is None:
             raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
@@ -74,21 +75,27 @@ class AutoRAGTuner:

         model = SentenceTransformer(model_name)
         max_tokens = getattr(model, "max_seq_length", 256)
-
         approx_words = max(1, int(max_tokens * 0.75))
         avg_len = self.corpus_stats.get("avg_len", 400)

-
-
-
-
-
-
-
-
+        max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+        # Safe chunk and overlap ranges
+        chunk_sizes = list(range(50, max_chunk + 1, step))
+        overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+        if not overlaps:
+            overlaps = [max(1, max_chunk // 4)]
+
+        candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+        # Randomly sample requested number of pairs
+        if num_pairs >= len(candidates):
+            sampled = candidates
+        else:
+            sampled = random.sample(candidates, num_pairs)

-        logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {
-        return
+        logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+        return sampled

     # -----------------------------
     # Recommendation Logic
@@ -130,7 +137,16 @@ class AutoRAGTuner:
             logging.warning(f"⚠️ Using default embedding model: {embedding_model}")

         # Suggest chunk sizes
-
+        # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+        chunk_candidates = self.suggest_chunk_sizes(
+            model_name=embedding_model,
+            num_pairs=num_chunk_pairs
+        )
+
+        # Safety check
+        if not chunk_candidates:
+            raise RuntimeError("No chunk candidates generated.")
+
         # Pick the first pair as default recommendation
         chunk_size, overlap = chunk_candidates[0]

@@ -176,6 +192,8 @@ class AutoRAGTuner:
         """
         rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)

+        chunk_candidates = rec["chunk_candidates"]
+
         logging.info("🚀 Launching full AutoRAG optimization with RAGMint")

         tuner = RAGMint(
@@ -183,8 +201,8 @@ class AutoRAGTuner:
             retrievers=[rec["retriever"]],
             embeddings=[rec["embedding_model"]],
             rerankers=["mmr"],
-            chunk_sizes=[
-            overlaps=[
+            chunk_sizes=[c[0] for c in chunk_candidates],
+            overlaps=[c[1] for c in chunk_candidates],
             strategies=[rec["strategy"]],
         )

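The new `suggest_chunk_sizes` signature adds a `step` parameter and returns a random sample of valid `(chunk_size, overlap)` pairs, which `auto_tune` then feeds into the RAGMint search space. Below is a minimal sketch of exercising the updated method directly; the docs path and the SentenceTransformer model name are illustrative assumptions, not values from this diff.

```python
from ragmint.autotuner import AutoRAGTuner

# Hypothetical corpus folder; any directory of text documents works the same way.
tuner = AutoRAGTuner(docs_path="data/docs/")

# Ask for 5 (chunk_size, overlap) candidates, stepping both ranges by 25 words.
pairs = tuner.suggest_chunk_sizes(
    model_name="all-MiniLM-L6-v2",  # assumed embedding model name
    num_pairs=5,
    step=25,
)

for chunk_size, overlap in pairs:
    print(f"chunk_size={chunk_size}, overlap={overlap}")
```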
ragmint-0.4.2/src/ragmint/explainer.py (new)
@@ -0,0 +1,88 @@
+"""
+Interpretability Layer
+----------------------
+Uses Gemini or Anthropic Claude to explain why a particular RAG configuration
+performed best, considering both optimizer results and corpus characteristics.
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+
+# Load .env if available
+load_dotenv()
+
+def explain_results(best_result: dict, all_results: list, corpus_stats: dict = None,
+                    model: str = "gemini-2.5-flash-lite") -> str:
+    """
+    Generate a detailed natural-language explanation for RAG optimization results.
+
+    Parameters:
+    - best_result: dict containing the best configuration and metrics.
+    - all_results: list of all trial results with metrics and configs.
+    - corpus_stats: optional dict with corpus info (size, avg_len, num_docs).
+    - model: LLM model name (Gemini or Claude).
+
+    Returns:
+    A natural-language explanation string.
+    """
+
+    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+    google_key = os.getenv("GOOGLE_API_KEY")
+
+    # Build dynamic context
+    corpus_info = json.dumps(corpus_stats or {}, indent=2)
+    best_json = json.dumps(best_result, indent=2)
+    all_json = json.dumps(list(all_results)[:10], indent=2)  # cap for safety
+
+    prompt = f"""
+    You are an expert AI researcher specializing in Retrieval-Augmented Generation (RAG) optimization.
+
+    A RAG auto-tuner was run on a corpus with these characteristics:
+    {corpus_info}
+
+    The tuner evaluated multiple configurations and metrics. Below are:
+    - The BEST configuration:
+    {best_json}
+
+    - A sample of ALL evaluated configurations:
+    {all_json}
+
+    Please:
+    1. Explain WHY this best configuration likely performs better than others.
+    2. Highlight trade-offs between accuracy, latency, and resource usage.
+    3. Suggest potential improvements (different chunking, embedding, retriever, etc.).
+    4. Provide a concise summary of which setup you recommend for this corpus.
+    Keep it structured, under 300 words, and easy to read.
+    """
+
+    # --- 1️⃣ Anthropic Claude first ---
+    if anthropic_key:
+        try:
+            from anthropic import Anthropic
+            client = Anthropic(api_key=anthropic_key)
+            response = client.messages.create(
+                model="claude-3-opus-20240229",
+                max_tokens=500,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            return response.content[0].text
+        except Exception as e:
+            return f"[Claude unavailable] {e}"
+
+    # --- 2️⃣ Gemini fallback ---
+    elif google_key:
+        try:
+            import google.generativeai as genai
+            genai.configure(api_key=google_key)
+            response = genai.GenerativeModel(model).generate_content(prompt)
+            return response.text
+        except Exception as e:
+            return f"[Gemini unavailable] {e}"
+
+    # --- 3️⃣ Fallback message ---
+    else:
+        return (
+            "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
+            "to enable interpretability via Claude or Gemini."
+        )
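Note that `explain_results` changes shape in this release: instead of comparing two configuration dicts, it now takes the best result, the list of all trial results, and optional corpus statistics. A minimal sketch of calling it directly follows; the dictionaries here are illustrative stand-ins for real tuner output, not values from this diff.

```python
from ragmint.explainer import explain_results

# Illustrative results; in practice these come from AutoRAGTuner.auto_tune().
best = {"retriever": "FAISS", "embedding_model": "all-MiniLM", "faithfulness": 0.91}
all_results = [
    best,
    {"retriever": "BM25", "embedding_model": "all-MiniLM", "faithfulness": 0.84},
]
corpus_stats = {"num_docs": 20, "avg_len": 400, "size": 10_000}

# Requires ANTHROPIC_API_KEY or GOOGLE_API_KEY; otherwise the fallback message is returned.
print(explain_results(best, all_results, corpus_stats=corpus_stats))
```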
ragmint-0.4.2/src/ragmint/leaderboard.py (new)
@@ -0,0 +1,51 @@
+import os
+import json
+from datetime import datetime
+from typing import Dict, Any, List, Optional
+
+
+class Leaderboard:
+    def __init__(self, storage_path: Optional[str] = "leaderboard.jsonl"):
+        self.storage_path = storage_path
+        os.makedirs(os.path.dirname(self.storage_path) or ".", exist_ok=True)
+
+        if not os.path.exists(self.storage_path):
+            open(self.storage_path, "w", encoding="utf-8").close()
+
+    def upload(
+        self,
+        run_id: str,
+        best_config: Dict[str, Any],
+        best_score: float,
+        all_results: List[Dict[str, Any]],
+        documents: List[str],
+        model: str,
+        corpus_stats: Optional[Dict[str, Any]] = None,
+    ):
+        """Persist a full experiment run to local leaderboard."""
+        data = {
+            "run_id": run_id,
+            "timestamp": datetime.utcnow().isoformat(),
+            "best_config": best_config,
+            "best_score": best_score,
+            "all_results": all_results,
+            "documents": [os.path.basename(d) for d in documents],
+            "model": model,
+            "corpus_stats": corpus_stats or {},
+        }
+
+        with open(self.storage_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(data) + "\n")
+
+        return data
+
+    def all_results(self) -> List[Dict[str, Any]]:
+        if not os.path.exists(self.storage_path):
+            return []
+        with open(self.storage_path, "r", encoding="utf-8") as f:
+            return [json.loads(line) for line in f if line.strip()]
+
+    def top_results(self, limit: int = 10) -> List[Dict[str, Any]]:
+        """Return top experiments by score."""
+        results = self.all_results()
+        return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]
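The Supabase-backed leaderboard is replaced here by a purely local JSONL store, so no credentials are required. A small sketch of the new API, assuming a writable working directory; the field values are illustrative.

```python
from ragmint.leaderboard import Leaderboard

lb = Leaderboard(storage_path="leaderboard.jsonl")

# Each upload appends one JSON line with the full run record.
lb.upload(
    run_id="run_demo",
    best_config={"retriever": "FAISS", "chunk_size": 400},
    best_score=0.88,
    all_results=[{"retriever": "FAISS", "faithfulness": 0.88}],
    documents=["docs/a.txt"],
    model="all-MiniLM",
    corpus_stats={"num_docs": 1},
)

for row in lb.top_results(limit=3):
    print(row["run_id"], row["best_score"])
```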
ragmint-0.4.2/src/ragmint/tests/test_explainer.py (new)
@@ -0,0 +1,36 @@
+import pytest
+import sys
+import types
+from ragmint.explainer import explain_results
+
+
+def test_explain_results_with_claude(monkeypatch):
+    """Claude explanation should use Anthropic API path when ANTHROPIC_API_KEY is set."""
+    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key")
+    monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
+
+    # Create a fake anthropic module with the required interface
+    mock_anthropic = types.ModuleType("anthropic")
+
+    class MockContent:
+        text = "Claude: The best configuration performs well due to optimized chunk size."
+
+    class MockMessages:
+        def create(self, *args, **kwargs):
+            return type("MockResponse", (), {"content": [MockContent()]})()
+
+    class MockClient:
+        def __init__(self, api_key):
+            self.messages = MockMessages()
+
+    mock_anthropic.Anthropic = MockClient
+    sys.modules["anthropic"] = mock_anthropic  # Inject fake module
+
+    best = {"retriever": "Chroma", "metric": 0.9}
+    all_results = [{"retriever": "FAISS", "metric": 0.85}]
+    corpus_stats = {"size": 10000, "avg_len": 400, "num_docs": 20}
+
+    result = explain_results(best, all_results, corpus_stats, model="claude-3-opus-20240229")
+
+    assert isinstance(result, str)
+    assert "Claude" in result or "claude" in result
ragmint-0.4.2/src/ragmint/tests/test_leaderboard.py (new)
@@ -0,0 +1,92 @@
+import os
+import json
+import tempfile
+import pytest
+from datetime import datetime
+from ragmint.leaderboard import Leaderboard
+
+
+@pytest.fixture
+def temp_leaderboard():
+    """Create a temporary leaderboard file for testing."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "leaderboard.jsonl")
+        lb = Leaderboard(storage_path=path)
+        yield lb, path
+
+
+def test_upload_and_persistence(temp_leaderboard):
+    lb, path = temp_leaderboard
+
+    # --- Mock experiment data ---
+    run_id = "run_001"
+    best_config = {"retriever": "FAISS", "embedding_model": "all-MiniLM"}
+    best_score = 0.92
+    all_results = [
+        {"retriever": "FAISS", "score": 0.92},
+        {"retriever": "BM25", "score": 0.85},
+    ]
+    documents = ["docs/a.txt", "docs/b.txt"]
+    model = "gemini"
+    corpus_stats = {"size": 20000, "avg_len": 400, "num_docs": 10}
+
+    # --- Upload ---
+    record = lb.upload(
+        run_id=run_id,
+        best_config=best_config,
+        best_score=best_score,
+        all_results=all_results,
+        documents=documents,
+        model=model,
+        corpus_stats=corpus_stats,
+    )
+
+    # --- Validate returned record ---
+    assert record["run_id"] == run_id
+    assert record["model"] == "gemini"
+    assert "timestamp" in record
+    assert record["best_score"] == 0.92
+    assert all(doc in record["documents"] for doc in ["a.txt", "b.txt"])
+
+    # --- File should contain JSON line ---
+    with open(path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+    assert len(lines) == 1
+    parsed = json.loads(lines[0])
+    assert parsed["run_id"] == run_id
+
+
+def test_top_results_ordering(temp_leaderboard):
+    lb, _ = temp_leaderboard
+
+    # Upload multiple runs with varying scores
+    for i, score in enumerate([0.8, 0.95, 0.7]):
+        lb.upload(
+            run_id=f"run_{i}",
+            best_config={"retriever": "FAISS"},
+            best_score=score,
+            all_results=[],
+            documents=["file.txt"],
+            model="claude",
+        )
+
+    # --- Get top results ---
+    top = lb.top_results(limit=2)
+    assert len(top) == 2
+
+    # --- Ensure ordering descending by score ---
+    assert top[0]["best_score"] >= top[1]["best_score"]
+    assert top[0]["best_score"] == 0.95
+
+
+def test_all_results_reads_all_entries(temp_leaderboard):
+    lb, _ = temp_leaderboard
+
+    # Add two runs
+    lb.upload("run_a", {}, 0.5, [], ["doc1.txt"], "gemini")
+    lb.upload("run_b", {}, 0.7, [], ["doc2.txt"], "claude")
+
+    results = lb.all_results()
+    assert len(results) == 2
+    run_ids = {r["run_id"] for r in results}
+    assert {"run_a", "run_b"} <= run_ids
{ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tuner.py
@@ -1,5 +1,4 @@
 import os
-import json
 import logging
 from typing import Any, Dict, List, Tuple
 from time import perf_counter
@@ -11,6 +10,8 @@ from .core.reranker import Reranker
 from .core.evaluation import Evaluator
 from .optimization.search import GridSearch, RandomSearch, BayesianSearch
 from .utils.data_loader import load_validation_set
+from .leaderboard import Leaderboard
+from uuid import uuid4

 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

@@ -151,7 +152,7 @@ class RAGMint:
         """Run optimization search over retrievers, embeddings, rerankers, and chunking."""
         validation = load_validation_set(validation_set or "default")

-        #
+        # search space
         search_space = {
             "retriever": self.retrievers,
             "embedding_model": self.embeddings,
@@ -186,4 +187,24 @@ class RAGMint:
         best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
         logging.info(f"🏆 Best configuration: {best}")

+        # Save to leaderboard
+        run_id = f"run_{uuid4().hex[:8]}"
+        leaderboard = Leaderboard()
+
+        corpus_stats = {
+            "num_docs": len(self.documents),
+            "avg_len": sum(len(d.split()) for d in self.documents) / max(1, len(self.documents)),
+            "corpus_size": sum(len(d) for d in self.documents),
+        }
+
+        leaderboard.upload(
+            run_id=run_id,
+            best_config=best,
+            best_score=best.get(metric, 0.0),
+            all_results=results,
+            documents=os.listdir(self.docs_path),
+            model=best.get("embedding_model", "unknown"),
+            corpus_stats=corpus_stats,
+        )
+
         return best, results
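With this change, `RAGMint.optimize()` persists every run to `leaderboard.jsonl` in the current working directory (the `Leaderboard()` default), in addition to returning `best, results`. A short sketch of inspecting that file afterwards, assuming at least one optimization run has completed there:

```python
import json

# Each line is one run record written by RAGMint.optimize() via Leaderboard.upload().
with open("leaderboard.jsonl", "r", encoding="utf-8") as f:
    runs = [json.loads(line) for line in f if line.strip()]

for run in sorted(runs, key=lambda r: r.get("best_score", 0.0), reverse=True):
    print(run["run_id"], run["best_score"], run["corpus_stats"].get("num_docs"))
```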
{ragmint-0.4.0 → ragmint-0.4.2/src/ragmint.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragmint
-Version: 0.4.0
+Version: 0.4.2
 Summary: A modular framework for evaluating and optimizing RAG pipelines.
 Author-email: Andre Oliveira <oandreoliveira@outlook.com>
 License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
 Requires-Dist: openai>=1.0.0
 Requires-Dist: google-generativeai>=0.8.0
 Requires-Dist: anthropic>=0.25.0
-Requires-Dist: supabase>=2.4.0
 Requires-Dist: pytest
 Requires-Dist: langchain>=0.2.5
 Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
 ```python
 from ragmint.leaderboard import Leaderboard

-
-
-
+# Initialize local leaderboard
+leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+# Retrieve top 5 runs
+print("\n🏅 Top 5 Experiments:")
+for result in leaderboard.top_results(limit=5):
+    print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
 ```

 ---

 ## 🧠 Explainability with Gemini / Claude

-Compare
+Compare RAG configurations and receive **natural language insights** on why one performs better.

 ```python
+from ragmint.autotuner import AutoRAGTuner
 from ragmint.explainer import explain_results

-
-
+tuner = AutoRAGTuner(docs_path="data/docs/")
+best, results = tuner.auto_tune(
+    validation_set='data/docs/validation_qa.json',
+    metric="faithfulness",
+    trials=5,
+    search_type='bayesian'
+)

-
-print(
+analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+print(analysis)
 ```

 > Set your API keys in a `.env` file or via environment variables:
{ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/SOURCES.txt
@@ -33,7 +33,6 @@ src/ragmint/tests/test_autotuner.py
 src/ragmint/tests/test_config_adapter.py
 src/ragmint/tests/test_embeddings.py
 src/ragmint/tests/test_explainer.py
-src/ragmint/tests/test_explainer_integration.py
 src/ragmint/tests/test_integration_autotuner_ragmint.py
 src/ragmint/tests/test_langchain_prebuilder.py
 src/ragmint/tests/test_leaderboard.py
ragmint-0.4.0/src/ragmint/explainer.py (removed)
@@ -1,63 +0,0 @@
-"""
-Interpretability Layer
-----------------------
-Uses Gemini or Anthropic Claude to explain why one RAG configuration
-outperforms another. Falls back gracefully if no API key is provided.
-"""
-
-import os
-import json
-from dotenv import load_dotenv
-
-# Load environment variables from .env file if available
-load_dotenv()
-
-def explain_results(results_a: dict, results_b: dict, model: str = "gemini-2.5-flash-lite") -> str:
-    """
-    Generate a natural-language explanation comparing two RAG experiment results.
-    Priority:
-    1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
-    2. Google Gemini (if GOOGLE_API_KEY is set)
-    3. Fallback text message
-    """
-    prompt = f"""
-    You are an AI evaluation expert.
-    Compare these two RAG experiment results and explain why one performs better.
-    Metrics A: {json.dumps(results_a, indent=2)}
-    Metrics B: {json.dumps(results_b, indent=2)}
-    Provide a concise, human-friendly explanation and practical improvement tips.
-    """
-
-    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-    google_key = os.getenv("GOOGLE_API_KEY")  # fixed var name
-
-    # 1️⃣ Try Anthropic Claude first
-    if anthropic_key:
-        try:
-            from anthropic import Anthropic
-            client = Anthropic(api_key=anthropic_key)
-            response = client.messages.create(
-                model="claude-3-opus-20240229",
-                max_tokens=300,
-                messages=[{"role": "user", "content": prompt}],
-            )
-            return response.content[0].text
-        except Exception as e:
-            return f"[Claude unavailable] {e}"
-
-    # 2️⃣ Fallback to Google Gemini
-    elif google_key:
-        try:
-            import google.generativeai as genai
-            genai.configure(api_key=google_key)
-            response = genai.GenerativeModel(model).generate_content(prompt)
-            return response.text
-        except Exception as e:
-            return f"[Gemini unavailable] {e}"
-
-    # 3️⃣ Fallback if neither key is available
-    else:
-        return (
-            "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
-            "to enable interpretability via Claude or Gemini."
-        )
ragmint-0.4.0/src/ragmint/leaderboard.py (removed)
@@ -1,45 +0,0 @@
-import os
-import json
-from datetime import datetime
-from typing import Dict, Any, Optional
-from supabase import create_client
-
-class Leaderboard:
-    def __init__(self, storage_path: Optional[str] = None):
-        self.storage_path = storage_path
-        url = os.getenv("SUPABASE_URL")
-        key = os.getenv("SUPABASE_KEY")
-        self.client = None
-        if url and key:
-            self.client = create_client(url, key)
-        elif not storage_path:
-            raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")
-
-    def upload(self, run_id: str, config: Dict[str, Any], score: float):
-        data = {
-            "run_id": run_id,
-            "config": config,
-            "score": score,
-            "timestamp": datetime.utcnow().isoformat(),
-        }
-        if self.client:
-            return self.client.table("experiments").insert(data).execute()
-        else:
-            os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
-            with open(self.storage_path, "a", encoding="utf-8") as f:
-                f.write(json.dumps(data) + "\n")
-            return data
-
-    def top_results(self, limit: int = 10):
-        if self.client:
-            return (
-                self.client.table("experiments")
-                .select("*")
-                .order("score", desc=True)
-                .limit(limit)
-                .execute()
-            )
-        else:
-            with open(self.storage_path, "r", encoding="utf-8") as f:
-                lines = [json.loads(line) for line in f]
-            return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
ragmint-0.4.0/src/ragmint/tests/test_explainer.py (removed)
@@ -1,20 +0,0 @@
-import pytest
-from ragmint.explainer import explain_results
-
-
-def test_explain_results_gemini():
-    """Gemini explanation should contain model-specific phrasing."""
-    config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-    config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-    result = explain_results(config_a, config_b, model="gemini")
-    assert isinstance(result, str)
-    assert "Gemini" in result or "gemini" in result
-
-
-def test_explain_results_claude():
-    """Claude explanation should contain model-specific phrasing."""
-    config_a = {"retriever": "FAISS"}
-    config_b = {"retriever": "Chroma"}
-    result = explain_results(config_a, config_b, model="claude")
-    assert isinstance(result, str)
-    assert "Claude" in result or "claude" in result
ragmint-0.4.0/src/ragmint/tests/test_explainer_integration.py (removed)
@@ -1,18 +0,0 @@
-import os
-import pytest
-from ragmint.explainer import explain_results
-
-
-@pytest.mark.integration
-def test_real_gemini_explanation():
-    """Run real Gemini call if GOOGLE_API_KEY is set."""
-    if not os.getenv("GEMINI_API_KEY"):
-        pytest.skip("GEMINI_API_KEY not set")
-
-    config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-    config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-
-    result = explain_results(config_a, config_b, model="gemini-1.5-pro")
-    assert isinstance(result, str)
-    assert len(result) > 0
-    print("\n[Gemini explanation]:", result[:200], "...")
ragmint-0.4.0/src/ragmint/tests/test_leaderboard.py (removed)
@@ -1,39 +0,0 @@
-import json
-import tempfile
-from pathlib import Path
-from ragmint.leaderboard import Leaderboard
-
-
-def test_leaderboard_add_and_top(tmp_path):
-    """Ensure local leaderboard persistence works without Supabase."""
-    file_path = tmp_path / "leaderboard.jsonl"
-    lb = Leaderboard(storage_path=str(file_path))
-
-    # Add two runs
-    lb.upload("run1", {"retriever": "FAISS"}, 0.91)
-    lb.upload("run2", {"retriever": "Chroma"}, 0.85)
-
-    # Verify file content
-    assert file_path.exists()
-    with open(file_path, "r", encoding="utf-8") as f:
-        lines = [json.loads(line) for line in f]
-    assert len(lines) == 2
-
-    # Get top results
-    top = lb.top_results(limit=1)
-    assert isinstance(top, list)
-    assert len(top) == 1
-    assert "score" in top[0]
-
-
-def test_leaderboard_append_existing(tmp_path):
-    """Ensure multiple uploads append properly."""
-    file_path = tmp_path / "leaderboard.jsonl"
-    lb = Leaderboard(storage_path=str(file_path))
-
-    for i in range(3):
-        lb.upload(f"run{i}", {"retriever": "BM25"}, 0.8 + i * 0.05)
-
-    top = lb.top_results(limit=2)
-    assert len(top) == 2
-    assert top[0]["score"] >= top[1]["score"]
The remaining 37 files are unchanged between 0.4.0 and 0.4.2 (the +0 -0 entries in the file list above).