ragmint 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragmint-0.1.0/src/ragmint.egg-info → ragmint-0.2.0}/PKG-INFO +124 -30
- ragmint-0.2.0/README.md +284 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/pyproject.toml +4 -2
- ragmint-0.2.0/src/ragmint/autotuner.py +33 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/evaluation.py +11 -0
- ragmint-0.2.0/src/ragmint/explainer.py +61 -0
- ragmint-0.2.0/src/ragmint/leaderboard.py +45 -0
- ragmint-0.2.0/src/ragmint/tests/conftest.py +16 -0
- ragmint-0.2.0/src/ragmint/tests/test_autotuner.py +42 -0
- ragmint-0.2.0/src/ragmint/tests/test_explainer.py +20 -0
- ragmint-0.2.0/src/ragmint/tests/test_explainer_integration.py +18 -0
- ragmint-0.2.0/src/ragmint/tests/test_integration_autotuner_ragmint.py +60 -0
- ragmint-0.2.0/src/ragmint/tests/test_leaderboard.py +39 -0
- ragmint-0.2.0/src/ragmint/tests/test_tuner.py +71 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/tuner.py +1 -1
- ragmint-0.2.0/src/ragmint/utils/data_loader.py +65 -0
- {ragmint-0.1.0 → ragmint-0.2.0/src/ragmint.egg-info}/PKG-INFO +124 -30
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint.egg-info/SOURCES.txt +9 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint.egg-info/requires.txt +2 -0
- ragmint-0.1.0/README.md +0 -192
- ragmint-0.1.0/src/ragmint/tests/test_tuner.py +0 -38
- ragmint-0.1.0/src/ragmint/utils/data_loader.py +0 -35
- {ragmint-0.1.0 → ragmint-0.2.0}/LICENSE +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/setup.cfg +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/__main__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/chunking.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/embeddings.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/pipeline.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/reranker.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/retriever.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/experiments/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/optimization/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/optimization/search.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/tests/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/tests/test_pipeline.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/tests/test_retriever.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/tests/test_search.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/utils/__init__.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/utils/caching.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/utils/logger.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/utils/metrics.py +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint.egg-info/dependency_links.txt +0 -0
- {ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint.egg-info/top_level.txt +0 -0
{ragmint-0.1.0/src/ragmint.egg-info → ragmint-0.2.0}/PKG-INFO
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragmint
-Version: 0.1.0
+Version: 0.2.0
 Summary: A modular framework for evaluating and optimizing RAG pipelines.
 Author-email: Andre Oliveira <oandreoliveira@outlook.com>
 License: Apache License 2.0
@@ -22,6 +22,8 @@ Requires-Dist: faiss-cpu; sys_platform != "darwin"
 Requires-Dist: optuna>=3.0
 Requires-Dist: pytest
 Requires-Dist: colorama
+Requires-Dist: google-generativeai>=0.8.0
+Requires-Dist: supabase>=2.4.0
 Dynamic: license-file
 
 # Ragmint
@@ -36,17 +38,19 @@ Dynamic: license-file
 
 **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
 
-It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**.
+It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
 
 ---
 
 ## ✨ Features
 
 - ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
+- 🤖 **Auto-RAG Tuner** — dynamically recommends retriever–embedding pairs based on corpus size
+- 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
+- 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
 - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
 - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
 - 🧩 **Embeddings** — OpenAI, HuggingFace
-- 🧠 **Rerankers** — MMR, CrossEncoder (extensible via plugin interface)
 - 💾 **Caching, experiment tracking, and reproducibility** out of the box
 - 🧰 **Clean modular structure** for easy integration in research and production setups
 
@@ -103,47 +107,133 @@ print(result)
 
 ---
 
+## 🧪 Dataset Options
+
+Ragmint can automatically load evaluation datasets for your RAG pipeline:
+
+| Mode | Example | Description |
+|------|----------|-------------|
+| 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
+| 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
+| 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
+
+### Example
+
+```python
+from ragmint.tuner import RAGMint
+
+ragmint = RAGMint(
+    docs_path="data/docs/",
+    retrievers=["faiss", "chroma"],
+    embeddings=["text-embedding-3-small"],
+    rerankers=["mmr"],
+)
+
+# Use built-in default
+ragmint.optimize(validation_set=None)
+
+# Use Hugging Face benchmark
+ragmint.optimize(validation_set="squad")
+
+# Use your own dataset
+ragmint.optimize(validation_set="data/custom_qa.json")
+```
+
+---
+
+## 🧠 Auto-RAG Tuner
+
+The **AutoRAGTuner** automatically recommends retriever–embedding combinations
+based on corpus size and average document length.
+
+```python
+from ragmint.autotuner import AutoRAGTuner
+
+corpus_stats = {"size": 5000, "avg_len": 250}
+tuner = AutoRAGTuner(corpus_stats)
+recommendation = tuner.recommend()
+print(recommendation)
+# Example output: {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+```
+
+---
+
+## 🏆 Leaderboard Tracking
+
+Track and visualize your best experiments across runs.
+
+```python
+from ragmint.leaderboard import Leaderboard
+
+lb = Leaderboard("experiments/leaderboard.json")
+lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
+lb.show_top(3)
+```
+
+---
+
+## 🧠 Explainability with Gemini / Claude
+
+Compare two RAG configurations and receive natural language insights
+on **why** one performs better.
+
+```python
+from ragmint.explainer import explain_results
+
+config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
+config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+
+explanation = explain_results(config_a, config_b, model="gemini")
+print(explanation)
+```
+
+> Set your API keys in a `.env` file or via environment variables:
+> ```
+> export GOOGLE_API_KEY="your_gemini_key"
+> export ANTHROPIC_API_KEY="your_claude_key"
+> ```
+
+---
+
 ## 🧩 Folder Structure
 
 ```
 ragmint/
 ├── core/
-│   ├── pipeline.py
-│   ├── retriever.py
-│   ├── reranker.py
-│
-
-├──
-├──
-├──
-├──
-├──
-
+│   ├── pipeline.py
+│   ├── retriever.py
+│   ├── reranker.py
+│   ├── embedding.py
+│   └── evaluation.py
+├── autotuner.py
+├── explainer.py
+├── leaderboard.py
+├── tuner.py
+├── utils/
+├── configs/
+├── experiments/
+├── tests/
+└── main.py
 ```
 
 ---
 
 ## 🧪 Running Tests
 
-To verify your setup:
-
 ```bash
 pytest -v
 ```
 
-
-
+To include integration tests with Gemini or Claude APIs:
 ```bash
-pytest
+pytest -m integration
 ```
 
-All tests are designed for **Pytest** and run with lightweight mock data.
-
 ---
 
 ## ⚙️ Configuration via `pyproject.toml`
 
-Your `pyproject.toml`
+Your `pyproject.toml` includes all required dependencies:
 
 ```toml
 [project]
@@ -158,6 +248,8 @@ dependencies = [
     "pytest",
     "openai",
     "tqdm",
+    "google-generativeai",
+    "google-genai",
 ]
 ```
 
@@ -165,10 +257,10 @@
 
 ## 📊 Example Experiment Workflow
 
-1. Define your retriever and reranker
-2. Launch
-3.
-4.
+1. Define your retriever, embedding, and reranker setup
+2. Launch optimization (Grid, Random, Bayesian) or AutoTune
+3. Compare performance with explainability
+4. Persist results to leaderboard for later inspection
 
 ---
 
@@ -181,7 +273,7 @@ flowchart TD
     C --> D[Reranker]
     D --> E[Generator]
     E --> F[Evaluation]
-    F --> G[Optuna
+    F --> G[Optuna / AutoRAGTuner]
     G -->|Best Params| B
 ```
 
@@ -191,8 +283,9 @@
 
 ```
 [INFO] Starting Bayesian optimization with Optuna
-[INFO] Trial 7 finished:
+[INFO] Trial 7 finished: faithfulness=0.83, latency=0.42s
 [INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
+[INFO] AutoRAGTuner: Suggested retriever=Chroma for medium corpus
 ```
 
 ---
@@ -200,8 +293,9 @@
 ## 🧠 Why Ragmint?
 
 - Built for **RAG researchers**, **AI engineers**, and **LLM ops**
-- Works with **LangChain**, **LlamaIndex**, or standalone
-- Designed for **extensibility** — plug in your own
+- Works with **LangChain**, **LlamaIndex**, or standalone setups
+- Designed for **extensibility** — plug in your own retrievers, models, or metrics
+- Integrated **explainability and leaderboard** modules for research and production
 
 ---
 
ragmint-0.2.0/README.md
ADDED

@@ -0,0 +1,284 @@
+# Ragmint
+
+
+
+
+
+
+
+
+
+**Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
+
+It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
+
+---
+
+## ✨ Features
+
+- ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
+- 🤖 **Auto-RAG Tuner** — dynamically recommends retriever–embedding pairs based on corpus size
+- 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
+- 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
+- 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
+- ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
+- 🧩 **Embeddings** — OpenAI, HuggingFace
+- 💾 **Caching, experiment tracking, and reproducibility** out of the box
+- 🧰 **Clean modular structure** for easy integration in research and production setups
+
+---
+
+## 🚀 Quick Start
+
+### 1️⃣ Installation
+
+```bash
+git clone https://github.com/andyolivers/ragmint.git
+cd ragmint
+pip install -e .
+```
+
+> The `-e` flag installs Ragmint in editable (development) mode.
+> Requires **Python ≥ 3.9**.
+
+---
+
+### 2️⃣ Run a RAG Optimization Experiment
+
+```bash
+python ragmint/main.py --config configs/default.yaml --search bayesian
+```
+
+Example `configs/default.yaml`:
+```yaml
+retriever: faiss
+embedding_model: text-embedding-3-small
+reranker:
+  mode: mmr
+  lambda_param: 0.5
+optimization:
+  search_method: bayesian
+  n_trials: 20
+```
+
+---
+
+### 3️⃣ Manual Pipeline Usage
+
+```python
+from ragmint.core.pipeline import RAGPipeline
+
+pipeline = RAGPipeline({
+    "embedding_model": "text-embedding-3-small",
+    "retriever": "faiss",
+})
+
+result = pipeline.run("What is retrieval-augmented generation?")
+print(result)
+```
+
+---
+
+## 🧪 Dataset Options
+
+Ragmint can automatically load evaluation datasets for your RAG pipeline:
+
+| Mode | Example | Description |
+|------|----------|-------------|
+| 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
+| 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
+| 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
+
+### Example
+
+```python
+from ragmint.tuner import RAGMint
+
+ragmint = RAGMint(
+    docs_path="data/docs/",
+    retrievers=["faiss", "chroma"],
+    embeddings=["text-embedding-3-small"],
+    rerankers=["mmr"],
+)
+
+# Use built-in default
+ragmint.optimize(validation_set=None)
+
+# Use Hugging Face benchmark
+ragmint.optimize(validation_set="squad")
+
+# Use your own dataset
+ragmint.optimize(validation_set="data/custom_qa.json")
+```
+
+---
+
+## 🧠 Auto-RAG Tuner
+
+The **AutoRAGTuner** automatically recommends retriever–embedding combinations
+based on corpus size and average document length.
+
+```python
+from ragmint.autotuner import AutoRAGTuner
+
+corpus_stats = {"size": 5000, "avg_len": 250}
+tuner = AutoRAGTuner(corpus_stats)
+recommendation = tuner.recommend()
+print(recommendation)
+# Example output: {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+```
+
+---
+
+## 🏆 Leaderboard Tracking
+
+Track and visualize your best experiments across runs.
+
+```python
+from ragmint.leaderboard import Leaderboard
+
+lb = Leaderboard("experiments/leaderboard.json")
+lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
+lb.show_top(3)
+```
+
+---
+
+## 🧠 Explainability with Gemini / Claude
+
+Compare two RAG configurations and receive natural language insights
+on **why** one performs better.
+
+```python
+from ragmint.explainer import explain_results
+
+config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
+config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+
+explanation = explain_results(config_a, config_b, model="gemini")
+print(explanation)
+```
+
+> Set your API keys in a `.env` file or via environment variables:
+> ```
+> export GOOGLE_API_KEY="your_gemini_key"
+> export ANTHROPIC_API_KEY="your_claude_key"
+> ```
+
+---
+
+## 🧩 Folder Structure
+
+```
+ragmint/
+├── core/
+│   ├── pipeline.py
+│   ├── retriever.py
+│   ├── reranker.py
+│   ├── embedding.py
+│   └── evaluation.py
+├── autotuner.py
+├── explainer.py
+├── leaderboard.py
+├── tuner.py
+├── utils/
+├── configs/
+├── experiments/
+├── tests/
+└── main.py
+```
+
+---
+
+## 🧪 Running Tests
+
+```bash
+pytest -v
+```
+
+To include integration tests with Gemini or Claude APIs:
+```bash
+pytest -m integration
+```
+
+---
+
+## ⚙️ Configuration via `pyproject.toml`
+
+Your `pyproject.toml` includes all required dependencies:
+
+```toml
+[project]
+name = "ragmint"
+version = "0.1.0"
+dependencies = [
+    "numpy",
+    "optuna",
+    "scikit-learn",
+    "faiss-cpu",
+    "chromadb",
+    "pytest",
+    "openai",
+    "tqdm",
+    "google-generativeai",
+    "google-genai",
+]
+```
+
+---
+
+## 📊 Example Experiment Workflow
+
+1. Define your retriever, embedding, and reranker setup
+2. Launch optimization (Grid, Random, Bayesian) or AutoTune
+3. Compare performance with explainability
+4. Persist results to leaderboard for later inspection
+
+---
+
+## 🧬 Architecture Overview
+
+```mermaid
+flowchart TD
+    A[Query] --> B[Embedder]
+    B --> C[Retriever]
+    C --> D[Reranker]
+    D --> E[Generator]
+    E --> F[Evaluation]
+    F --> G[Optuna / AutoRAGTuner]
+    G -->|Best Params| B
+```
+
+---
+
+## 📘 Example Output
+
+```
+[INFO] Starting Bayesian optimization with Optuna
+[INFO] Trial 7 finished: faithfulness=0.83, latency=0.42s
+[INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
+[INFO] AutoRAGTuner: Suggested retriever=Chroma for medium corpus
+```
+
+---
+
+## 🧠 Why Ragmint?
+
+- Built for **RAG researchers**, **AI engineers**, and **LLM ops**
+- Works with **LangChain**, **LlamaIndex**, or standalone setups
+- Designed for **extensibility** — plug in your own retrievers, models, or metrics
+- Integrated **explainability and leaderboard** modules for research and production
+
+---
+
+## ⚖️ License
+
+Licensed under the **Apache License 2.0** — free for personal, research, and commercial use.
+
+---
+
+## 👤 Author
+
+**André Oliveira**
+[andyolivers.com](https://andyolivers.com)
+Data Scientist | AI Engineer
{ragmint-0.1.0 → ragmint-0.2.0}/pyproject.toml
CHANGED

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ragmint"
-version = "0.1.0"
+version = "0.2.0"
 description = "A modular framework for evaluating and optimizing RAG pipelines."
 readme = "README.md"
 license = { text = "Apache License 2.0" }
@@ -24,7 +24,9 @@ dependencies = [
     "faiss-cpu; sys_platform != 'darwin'",
     "optuna>=3.0",
     "pytest",
-    "colorama"
+    "colorama",
+    "google-generativeai>=0.8.0",
+    "supabase>=2.4.0"
 ]
 
 [project.urls]
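Since 0.2.0 declares two new runtime dependencies, a quick import check after upgrading confirms they resolved. A minimal sketch (the print text is illustrative; the module names are the ones these distributions install):

```python
# Smoke test for the dependencies new in ragmint 0.2.0
import google.generativeai  # installed by google-generativeai>=0.8.0
import supabase             # installed by supabase>=2.4.0

print("ragmint 0.2.0 runtime dependencies import cleanly")
```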
ragmint-0.2.0/src/ragmint/autotuner.py
ADDED

@@ -0,0 +1,33 @@
+"""
+Auto-RAG Tuner
+--------------
+Recommends retriever–embedding pairs dynamically based on corpus size
+and dataset characteristics. Integrates seamlessly with RAGMint evaluator.
+"""
+
+from .core.evaluation import evaluate_config
+
+
+class AutoRAGTuner:
+    def __init__(self, corpus_stats: dict):
+        """
+        corpus_stats: dict
+        Example: {'size': 12000, 'avg_len': 240}
+        """
+        self.corpus_stats = corpus_stats
+
+    def recommend(self):
+        size = self.corpus_stats.get("size", 0)
+        avg_len = self.corpus_stats.get("avg_len", 0)
+
+        if size < 1000:
+            return {"retriever": "BM25", "embedding_model": "OpenAI"}
+        elif size < 10000:
+            return {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+        else:
+            return {"retriever": "FAISS", "embedding_model": "InstructorXL"}
+
+    def auto_tune(self, validation_data):
+        config = self.recommend()
+        results = evaluate_config(config, validation_data)
+        return {"recommended": config, "results": results}
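Taken together, the new `autotuner.py` can be driven end to end in a few lines. A minimal sketch, assuming hand-written validation samples shaped the way `evaluate_config` reads them (`query`/`answer`/`context` keys); all values here are illustrative:

```python
from ragmint.autotuner import AutoRAGTuner

# recommend() branches at corpus sizes of 1,000 and 10,000 documents
tuner = AutoRAGTuner({"size": 12000, "avg_len": 240})

# Hypothetical validation samples in the shape evaluate_config expects
validation_data = [
    {"query": "What is RAG?",
     "answer": "Retrieval-augmented generation.",
     "context": "RAG combines retrieval with text generation."},
]

outcome = tuner.auto_tune(validation_data)
print(outcome["recommended"])  # size >= 10000 -> {'retriever': 'FAISS', 'embedding_model': 'InstructorXL'}
print(outcome["results"])      # per-sample output of Evaluator.evaluate
```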
{ragmint-0.1.0 → ragmint-0.2.0}/src/ragmint/core/evaluation.py
CHANGED

@@ -25,3 +25,14 @@ class Evaluator:
 
     def _similarity(self, a: str, b: str) -> float:
         return SequenceMatcher(None, a, b).ratio()
+
+def evaluate_config(config, validation_data):
+    evaluator = Evaluator()
+    results = []
+    for sample in validation_data:
+        query = sample.get("query", "")
+        answer = sample.get("answer", "")
+        context = sample.get("context", "")
+        results.append(evaluator.evaluate(query, answer, context))
+    return results
+
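The `evaluate_config` helper added above walks a validation set and delegates each sample to `Evaluator.evaluate`; note that in the body shown, the `config` argument is accepted but never consumed. A minimal sketch of calling it directly (sample values are illustrative):

```python
from ragmint.core.evaluation import evaluate_config

samples = [
    {"query": "Capital of France?",
     "answer": "Paris.",
     "context": "Paris is the capital of France."},
]

# Returns one score entry per sample, in input order
scores = evaluate_config({"retriever": "faiss"}, samples)
print(scores)
```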
ragmint-0.2.0/src/ragmint/explainer.py
ADDED

@@ -0,0 +1,61 @@
+"""
+Interpretability Layer
+----------------------
+Uses Gemini or Anthropic Claude to explain why one RAG configuration
+outperforms another. Falls back gracefully if no API key is provided.
+"""
+
+import os
+import json
+
+
+def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-pro") -> str:
+    """
+    Generate a natural-language explanation comparing two RAG experiment results.
+    Priority:
+    1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
+    2. Google Gemini (if GOOGLE_API_KEY is set)
+    3. Fallback text message
+    """
+    prompt = f"""
+    You are an AI evaluation expert.
+    Compare these two RAG experiment results and explain why one performs better.
+    Metrics A: {json.dumps(results_a, indent=2)}
+    Metrics B: {json.dumps(results_b, indent=2)}
+    Provide a concise, human-friendly explanation and practical improvement tips.
+    """
+
+    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+    google_key = os.getenv("GEMINI_API_KEY")
+
+
+    # 1️⃣ Try Anthropic Claude first
+    if anthropic_key:
+        try:
+            from anthropic import Anthropic
+            client = Anthropic(api_key=anthropic_key)
+            response = client.messages.create(
+                model="claude-3-opus-20240229",
+                max_tokens=300,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            return response.content[0].text
+        except Exception as e:
+            return f"[Claude unavailable] {e}"
+
+    # 2️⃣ Fallback to Google Gemini
+    elif google_key:
+        try:
+            import google.generativeai as genai
+            genai.configure(api_key=google_key)
+            response = genai.GenerativeModel(model).generate_content(prompt)
+            return response.text
+        except Exception as e:
+            return f"[Gemini unavailable] {e}"
+
+    # 3️⃣ Fallback if neither key is available
+    else:
+        return (
+            "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
+            "to enable interpretability via Claude or Gemini."
+        )
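One detail worth noting before trying `explainer.py`: the Gemini branch reads `GEMINI_API_KEY`, while the docstring, the fallback message, and the README all refer to `GOOGLE_API_KEY`, so export the name the code actually checks. A minimal usage sketch (metric dicts are illustrative; with no key set, the function returns the fallback string instead of raising):

```python
from ragmint.explainer import explain_results

# Illustrative metrics in the dict shape the prompt serializes via json.dumps
metrics_a = {"retriever": "faiss", "faithfulness": 0.83, "latency": 0.42}
metrics_b = {"retriever": "chroma", "faithfulness": 0.79, "latency": 0.31}

# Claude is tried first (ANTHROPIC_API_KEY), then Gemini (GEMINI_API_KEY)
print(explain_results(metrics_a, metrics_b, model="gemini-1.5-pro"))
```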