ragmint 0.2.3__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragmint-0.2.3 → ragmint-0.3.1}/PKG-INFO +161 -32
- ragmint-0.2.3/src/ragmint.egg-info/PKG-INFO → ragmint-0.3.1/README.md +138 -53
- {ragmint-0.2.3 → ragmint-0.3.1}/pyproject.toml +38 -12
- ragmint-0.3.1/src/ragmint/autotuner.py +138 -0
- ragmint-0.3.1/src/ragmint/core/chunking.py +86 -0
- ragmint-0.3.1/src/ragmint/core/embeddings.py +55 -0
- ragmint-0.3.1/src/ragmint/core/pipeline.py +62 -0
- ragmint-0.3.1/src/ragmint/core/retriever.py +165 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/explainer.py +5 -3
- ragmint-0.3.1/src/ragmint/integrations/config_adapter.py +96 -0
- ragmint-0.3.1/src/ragmint/integrations/langchain_prebuilder.py +99 -0
- ragmint-0.3.1/src/ragmint/tests/test_autotuner.py +51 -0
- ragmint-0.3.1/src/ragmint/tests/test_config_adapter.py +39 -0
- ragmint-0.3.1/src/ragmint/tests/test_embeddings.py +46 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_explainer_integration.py +1 -1
- ragmint-0.3.1/src/ragmint/tests/test_integration_autotuner_ragmint.py +47 -0
- ragmint-0.3.1/src/ragmint/tests/test_langchain_prebuilder.py +82 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_pipeline.py +3 -2
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_retriever.py +3 -2
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_tuner.py +1 -1
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tuner.py +87 -21
- ragmint-0.3.1/src/ragmint/utils/__init__.py +0 -0
- ragmint-0.2.3/README.md → ragmint-0.3.1/src/ragmint.egg-info/PKG-INFO +182 -25
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/SOURCES.txt +6 -0
- ragmint-0.3.1/src/ragmint.egg-info/requires.txt +35 -0
- ragmint-0.2.3/src/ragmint/autotuner.py +0 -33
- ragmint-0.2.3/src/ragmint/core/chunking.py +0 -22
- ragmint-0.2.3/src/ragmint/core/embeddings.py +0 -19
- ragmint-0.2.3/src/ragmint/core/pipeline.py +0 -38
- ragmint-0.2.3/src/ragmint/core/retriever.py +0 -33
- ragmint-0.2.3/src/ragmint/tests/test_autotuner.py +0 -42
- ragmint-0.2.3/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -60
- ragmint-0.2.3/src/ragmint.egg-info/requires.txt +0 -15
- {ragmint-0.2.3 → ragmint-0.3.1}/LICENSE +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/MANIFEST.in +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/setup.cfg +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/__main__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/evaluation.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/reranker.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/experiments/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/experiments/validation_qa.json +0 -0
- {ragmint-0.2.3/src/ragmint/optimization → ragmint-0.3.1/src/ragmint/integrations}/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/leaderboard.py +0 -0
- {ragmint-0.2.3/src/ragmint/tests → ragmint-0.3.1/src/ragmint/optimization}/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/optimization/search.py +0 -0
- {ragmint-0.2.3/src/ragmint/utils → ragmint-0.3.1/src/ragmint/tests}/__init__.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/conftest.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_explainer.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_leaderboard.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_search.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/caching.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/data_loader.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/logger.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/metrics.py +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/dependency_links.txt +0 -0
- {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/top_level.txt +0 -0
|
@@ -1,29 +1,45 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragmint
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: A modular framework for evaluating and optimizing RAG pipelines.
|
|
5
5
|
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
|
|
6
6
|
License: Apache License 2.0
|
|
7
7
|
Project-URL: Homepage, https://github.com/andyolivers/ragmint
|
|
8
8
|
Project-URL: Documentation, https://andyolivers.com
|
|
9
9
|
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
|
|
10
|
-
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
|
|
10
|
+
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation,chunking,autotuning
|
|
11
11
|
Requires-Python: >=3.9
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
-
Requires-Dist: numpy>=1.23
|
|
14
|
+
Requires-Dist: numpy<2.0.0
|
|
15
15
|
Requires-Dist: pandas>=2.0
|
|
16
16
|
Requires-Dist: scikit-learn>=1.3
|
|
17
|
-
Requires-Dist: openai>=1.0
|
|
18
|
-
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: pyyaml
|
|
20
|
-
Requires-Dist: chromadb>=0.4
|
|
17
|
+
Requires-Dist: sentence-transformers>=2.2.2
|
|
18
|
+
Requires-Dist: chromadb>=0.3.1
|
|
21
19
|
Requires-Dist: faiss-cpu; sys_platform != "darwin"
|
|
20
|
+
Requires-Dist: faiss-cpu==1.7.4; sys_platform == "darwin"
|
|
21
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
22
22
|
Requires-Dist: optuna>=3.0
|
|
23
|
-
Requires-Dist: pytest
|
|
23
|
+
Requires-Dist: tqdm
|
|
24
24
|
Requires-Dist: colorama
|
|
25
|
+
Requires-Dist: pyyaml
|
|
26
|
+
Requires-Dist: python-dotenv
|
|
27
|
+
Requires-Dist: openai>=1.0.0
|
|
25
28
|
Requires-Dist: google-generativeai>=0.8.0
|
|
29
|
+
Requires-Dist: anthropic>=0.25.0
|
|
26
30
|
Requires-Dist: supabase>=2.4.0
|
|
31
|
+
Requires-Dist: pytest
|
|
32
|
+
Requires-Dist: langchain>=0.2.5
|
|
33
|
+
Requires-Dist: langchain-community>=0.2.5
|
|
34
|
+
Requires-Dist: langchain-text-splitters>=0.2.1
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: black; extra == "dev"
|
|
37
|
+
Requires-Dist: flake8; extra == "dev"
|
|
38
|
+
Requires-Dist: isort; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
27
43
|
Dynamic: license-file
|
|
28
44
|
|
|
29
45
|
# Ragmint
|
|
@@ -38,7 +54,7 @@ Dynamic: license-file
|
|
|
38
54
|
|
|
39
55
|
**Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
|
|
40
56
|
|
|
41
|
-
It provides a complete toolkit for **retriever selection**, **embedding model tuning**,
|
|
57
|
+
It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
|
|
42
58
|
|
|
43
59
|
---
|
|
44
60
|
|
|
@@ -49,10 +65,13 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
|
|
|
49
65
|
- 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
|
|
50
66
|
- 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
|
|
51
67
|
- 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
|
|
52
|
-
- ⚙️ **Retrievers** — FAISS, Chroma,
|
|
53
|
-
- 🧩 **Embeddings** —
|
|
68
|
+
- ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
|
|
69
|
+
- 🧩 **Embeddings** — Hugging Face
|
|
54
70
|
- 💾 **Caching, experiment tracking, and reproducibility** out of the box
|
|
55
71
|
- 🧰 **Clean modular structure** for easy integration in research and production setups
|
|
72
|
+
- 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
|
|
73
|
+
- 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
|
|
74
|
+
- ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
|
|
56
75
|
|
|
57
76
|
---
|
|
58
77
|
|
|
@@ -81,6 +100,8 @@ Example `configs/default.yaml`:
|
|
|
81
100
|
```yaml
|
|
82
101
|
retriever: faiss
|
|
83
102
|
embedding_model: text-embedding-3-small
|
|
103
|
+
chunk_size: 500
|
|
104
|
+
overlap: 100
|
|
84
105
|
reranker:
|
|
85
106
|
mode: mmr
|
|
86
107
|
lambda_param: 0.5
|
|
@@ -94,15 +115,94 @@ optimization:
|
|
|
94
115
|
### 3️⃣ Manual Pipeline Usage
|
|
95
116
|
|
|
96
117
|
```python
|
|
97
|
-
from ragmint.
|
|
118
|
+
from ragmint.prebuilder import PreBuilder
|
|
119
|
+
from ragmint.tuner import RAGMint
|
|
120
|
+
|
|
121
|
+
# Prebuild pipeline (chunking, embeddings, vector store)
|
|
122
|
+
prebuilder = PreBuilder(
|
|
123
|
+
docs_path="data/docs/",
|
|
124
|
+
config_path="configs/default.yaml"
|
|
125
|
+
)
|
|
126
|
+
pipeline = prebuilder.build_pipeline()
|
|
127
|
+
|
|
128
|
+
# Initialize RAGMint with prebuilt components
|
|
129
|
+
rag = RAGMint(pipeline=pipeline)
|
|
130
|
+
|
|
131
|
+
# Run optimization
|
|
132
|
+
best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
|
|
133
|
+
print("Best configuration:", best)
|
|
98
134
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
135
|
+
```
|
|
136
|
+
---
|
|
137
|
+
# 🧩 Embeddings and Retrievers
|
|
138
|
+
|
|
139
|
+
**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
## 🧩 Chunking System
|
|
143
|
+
|
|
144
|
+
* **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
|
|
145
|
+
* **Supports default values** if not provided in configuration.
|
|
146
|
+
* **Optimized** for downstream **retrieval and embeddings**.
|
|
147
|
+
* **Enables adaptive chunking strategies** in future releases.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
## 🧩 Langchain Config Adapter
|
|
151
|
+
|
|
152
|
+
* **Ensures consistent configuration** across pipeline components.
|
|
153
|
+
* **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
|
|
154
|
+
* **Adds default chunk parameters** when missing.
|
|
155
|
+
* **Validates retriever backends** and **raises clear errors** for unsupported options.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
## 🧩 Langchain Prebuilder
|
|
103
159
|
|
|
104
|
-
|
|
105
|
-
|
|
160
|
+
**Automates pipeline preparation:**
|
|
161
|
+
1. Reads documents
|
|
162
|
+
2. Applies chunking
|
|
163
|
+
3. Creates embeddings
|
|
164
|
+
4. Initializes retriever / vector store
|
|
165
|
+
5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## 🔤 Available Embeddings (Hugging Face)
|
|
170
|
+
|
|
171
|
+
You can select from the following models:
|
|
172
|
+
|
|
173
|
+
* `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
|
|
174
|
+
* `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
|
|
175
|
+
* `BAAI/bge-base-en-v1.5` — **multilingual**, dense embeddings
|
|
176
|
+
* `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
### Configuration Example
|
|
181
|
+
|
|
182
|
+
Use the following format in your config file to specify the embedding model:
|
|
183
|
+
|
|
184
|
+
```yaml
|
|
185
|
+
embedding_model: sentence-transformers/all-MiniLM-L6-v2
|
|
186
|
+
```
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## 🔍 Available Retrievers
|
|
190
|
+
|
|
191
|
+
**Ragmint** integrates multiple **retrieval backends** to suit different needs:
|
|
192
|
+
|
|
193
|
+
| Retriever | Description |
|
|
194
|
+
| :--- | :--- |
|
|
195
|
+
| **FAISS** | Fast vector similarity search; efficient for dense embeddings |
|
|
196
|
+
| **Chroma** | Persistent vector DB; works well for incremental updates |
|
|
197
|
+
| **scikit-learn (NearestNeighbors)** | Lightweight, zero-dependency local retriever |
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
### Configuration Example
|
|
201
|
+
|
|
202
|
+
To specify the retriever in your configuration file, use the following format:
|
|
203
|
+
|
|
204
|
+
```yaml
|
|
205
|
+
retriever: faiss
|
|
106
206
|
```
|
|
107
207
|
|
|
108
208
|
---
|
|
@@ -174,8 +274,7 @@ lb.show_top(3)
|
|
|
174
274
|
|
|
175
275
|
## 🧠 Explainability with Gemini / Claude
|
|
176
276
|
|
|
177
|
-
Compare two RAG configurations and receive natural language insights
|
|
178
|
-
on **why** one performs better.
|
|
277
|
+
Compare two RAG configurations and receive **natural language insights** on why one performs better.
|
|
179
278
|
|
|
180
279
|
```python
|
|
181
280
|
from ragmint.explainer import explain_results
|
|
@@ -189,7 +288,7 @@ print(explanation)
|
|
|
189
288
|
|
|
190
289
|
> Set your API keys in a `.env` file or via environment variables:
|
|
191
290
|
> ```
|
|
192
|
-
> export
|
|
291
|
+
> export GEMINI_API_KEY="your_gemini_key"
|
|
193
292
|
> export ANTHROPIC_API_KEY="your_claude_key"
|
|
194
293
|
> ```
|
|
195
294
|
|
|
@@ -203,8 +302,12 @@ ragmint/
|
|
|
203
302
|
│ ├── pipeline.py
|
|
204
303
|
│ ├── retriever.py
|
|
205
304
|
│ ├── reranker.py
|
|
206
|
-
│ ├──
|
|
305
|
+
│ ├── embeddings.py
|
|
306
|
+
│ ├── chunking.py
|
|
207
307
|
│ └── evaluation.py
|
|
308
|
+
├── integrations/
|
|
309
|
+
│ ├── config_adapter.py
|
|
310
|
+
│ └── langchain_prebuilder.py
|
|
208
311
|
├── autotuner.py
|
|
209
312
|
├── explainer.py
|
|
210
313
|
├── leaderboard.py
|
|
@@ -240,16 +343,42 @@ Your `pyproject.toml` includes all required dependencies:
|
|
|
240
343
|
name = "ragmint"
|
|
241
344
|
version = "0.1.0"
|
|
242
345
|
dependencies = [
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
346
|
+
# Core ML + Embeddings
|
|
347
|
+
"numpy<2.0.0",
|
|
348
|
+
"pandas>=2.0",
|
|
349
|
+
"scikit-learn>=1.3",
|
|
350
|
+
"sentence-transformers>=2.2.2",
|
|
351
|
+
|
|
352
|
+
# Retrieval backends
|
|
353
|
+
"chromadb>=0.4",
|
|
354
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
355
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
356
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
357
|
+
|
|
358
|
+
# Optimization & evaluation
|
|
359
|
+
"optuna>=3.0",
|
|
360
|
+
"tqdm",
|
|
361
|
+
"colorama",
|
|
362
|
+
|
|
363
|
+
# RAG evaluation and data utils
|
|
364
|
+
"pyyaml",
|
|
365
|
+
"python-dotenv",
|
|
366
|
+
|
|
367
|
+
# Explainability and LLM APIs
|
|
368
|
+
"openai>=1.0.0",
|
|
369
|
+
"google-generativeai>=0.8.0",
|
|
370
|
+
"anthropic>=0.25.0",
|
|
371
|
+
|
|
372
|
+
# Integration / storage
|
|
373
|
+
"supabase>=2.4.0",
|
|
374
|
+
|
|
375
|
+
# Testing
|
|
376
|
+
"pytest",
|
|
377
|
+
|
|
378
|
+
# LangChain integration layer
|
|
379
|
+
"langchain>=0.2.5",
|
|
380
|
+
"langchain-community>=0.2.5",
|
|
381
|
+
"langchain-text-splitters>=0.2.1"
|
|
253
382
|
]
|
|
254
383
|
```
|
|
255
384
|
|
|
@@ -1,31 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: ragmint
|
|
3
|
-
Version: 0.2.3
|
|
4
|
-
Summary: A modular framework for evaluating and optimizing RAG pipelines.
|
|
5
|
-
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
|
|
6
|
-
License: Apache License 2.0
|
|
7
|
-
Project-URL: Homepage, https://github.com/andyolivers/ragmint
|
|
8
|
-
Project-URL: Documentation, https://andyolivers.com
|
|
9
|
-
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
|
|
10
|
-
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
|
|
11
|
-
Requires-Python: >=3.9
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
License-File: LICENSE
|
|
14
|
-
Requires-Dist: numpy>=1.23
|
|
15
|
-
Requires-Dist: pandas>=2.0
|
|
16
|
-
Requires-Dist: scikit-learn>=1.3
|
|
17
|
-
Requires-Dist: openai>=1.0
|
|
18
|
-
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: pyyaml
|
|
20
|
-
Requires-Dist: chromadb>=0.4
|
|
21
|
-
Requires-Dist: faiss-cpu; sys_platform != "darwin"
|
|
22
|
-
Requires-Dist: optuna>=3.0
|
|
23
|
-
Requires-Dist: pytest
|
|
24
|
-
Requires-Dist: colorama
|
|
25
|
-
Requires-Dist: google-generativeai>=0.8.0
|
|
26
|
-
Requires-Dist: supabase>=2.4.0
|
|
27
|
-
Dynamic: license-file
|
|
28
|
-
|
|
29
1
|
# Ragmint
|
|
30
2
|
|
|
31
3
|

|
|
@@ -38,7 +10,7 @@ Dynamic: license-file
|
|
|
38
10
|
|
|
39
11
|
**Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
|
|
40
12
|
|
|
41
|
-
It provides a complete toolkit for **retriever selection**, **embedding model tuning**,
|
|
13
|
+
It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
|
|
42
14
|
|
|
43
15
|
---
|
|
44
16
|
|
|
@@ -49,10 +21,13 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
|
|
|
49
21
|
- 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
|
|
50
22
|
- 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
|
|
51
23
|
- 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
|
|
52
|
-
- ⚙️ **Retrievers** — FAISS, Chroma,
|
|
53
|
-
- 🧩 **Embeddings** —
|
|
24
|
+
- ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
|
|
25
|
+
- 🧩 **Embeddings** — Hugging Face
|
|
54
26
|
- 💾 **Caching, experiment tracking, and reproducibility** out of the box
|
|
55
27
|
- 🧰 **Clean modular structure** for easy integration in research and production setups
|
|
28
|
+
- 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
|
|
29
|
+
- 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
|
|
30
|
+
- ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
|
|
56
31
|
|
|
57
32
|
---
|
|
58
33
|
|
|
@@ -81,6 +56,8 @@ Example `configs/default.yaml`:
|
|
|
81
56
|
```yaml
|
|
82
57
|
retriever: faiss
|
|
83
58
|
embedding_model: text-embedding-3-small
|
|
59
|
+
chunk_size: 500
|
|
60
|
+
overlap: 100
|
|
84
61
|
reranker:
|
|
85
62
|
mode: mmr
|
|
86
63
|
lambda_param: 0.5
|
|
@@ -94,15 +71,94 @@ optimization:
|
|
|
94
71
|
### 3️⃣ Manual Pipeline Usage
|
|
95
72
|
|
|
96
73
|
```python
|
|
97
|
-
from ragmint.
|
|
74
|
+
from ragmint.prebuilder import PreBuilder
|
|
75
|
+
from ragmint.tuner import RAGMint
|
|
76
|
+
|
|
77
|
+
# Prebuild pipeline (chunking, embeddings, vector store)
|
|
78
|
+
prebuilder = PreBuilder(
|
|
79
|
+
docs_path="data/docs/",
|
|
80
|
+
config_path="configs/default.yaml"
|
|
81
|
+
)
|
|
82
|
+
pipeline = prebuilder.build_pipeline()
|
|
83
|
+
|
|
84
|
+
# Initialize RAGMint with prebuilt components
|
|
85
|
+
rag = RAGMint(pipeline=pipeline)
|
|
86
|
+
|
|
87
|
+
# Run optimization
|
|
88
|
+
best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
|
|
89
|
+
print("Best configuration:", best)
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
---
|
|
93
|
+
# 🧩 Embeddings and Retrievers
|
|
94
|
+
|
|
95
|
+
**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
## 🧩 Chunking System
|
|
99
|
+
|
|
100
|
+
* **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
|
|
101
|
+
* **Supports default values** if not provided in configuration.
|
|
102
|
+
* **Optimized** for downstream **retrieval and embeddings**.
|
|
103
|
+
* **Enables adaptive chunking strategies** in future releases.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
## 🧩 Langchain Config Adapter
|
|
107
|
+
|
|
108
|
+
* **Ensures consistent configuration** across pipeline components.
|
|
109
|
+
* **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
|
|
110
|
+
* **Adds default chunk parameters** when missing.
|
|
111
|
+
* **Validates retriever backends** and **raises clear errors** for unsupported options.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
## 🧩 Langchain Prebuilder
|
|
115
|
+
|
|
116
|
+
**Automates pipeline preparation:**
|
|
117
|
+
1. Reads documents
|
|
118
|
+
2. Applies chunking
|
|
119
|
+
3. Creates embeddings
|
|
120
|
+
4. Initializes retriever / vector store
|
|
121
|
+
5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
|
|
98
122
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 🔤 Available Embeddings (Hugging Face)
|
|
126
|
+
|
|
127
|
+
You can select from the following models:
|
|
128
|
+
|
|
129
|
+
* `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
|
|
130
|
+
* `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
|
|
131
|
+
* `BAAI/bge-base-en-v1.5` — **multilingual**, dense embeddings
|
|
132
|
+
* `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
### Configuration Example
|
|
137
|
+
|
|
138
|
+
Use the following format in your config file to specify the embedding model:
|
|
139
|
+
|
|
140
|
+
```yaml
|
|
141
|
+
embedding_model: sentence-transformers/all-MiniLM-L6-v2
|
|
142
|
+
```
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 🔍 Available Retrievers
|
|
146
|
+
|
|
147
|
+
**Ragmint** integrates multiple **retrieval backends** to suit different needs:
|
|
148
|
+
|
|
149
|
+
| Retriever | Description |
|
|
150
|
+
| :--- | :--- |
|
|
151
|
+
| **FAISS** | Fast vector similarity search; efficient for dense embeddings |
|
|
152
|
+
| **Chroma** | Persistent vector DB; works well for incremental updates |
|
|
153
|
+
| **scikit-learn (NearestNeighbors)** | Lightweight, zero-dependency local retriever |
|
|
103
154
|
|
|
104
|
-
|
|
105
|
-
|
|
155
|
+
|
|
156
|
+
### Configuration Example
|
|
157
|
+
|
|
158
|
+
To specify the retriever in your configuration file, use the following format:
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
retriever: faiss
|
|
106
162
|
```
|
|
107
163
|
|
|
108
164
|
---
|
|
@@ -174,8 +230,7 @@ lb.show_top(3)
|
|
|
174
230
|
|
|
175
231
|
## 🧠 Explainability with Gemini / Claude
|
|
176
232
|
|
|
177
|
-
Compare two RAG configurations and receive natural language insights
|
|
178
|
-
on **why** one performs better.
|
|
233
|
+
Compare two RAG configurations and receive **natural language insights** on why one performs better.
|
|
179
234
|
|
|
180
235
|
```python
|
|
181
236
|
from ragmint.explainer import explain_results
|
|
@@ -189,7 +244,7 @@ print(explanation)
|
|
|
189
244
|
|
|
190
245
|
> Set your API keys in a `.env` file or via environment variables:
|
|
191
246
|
> ```
|
|
192
|
-
> export
|
|
247
|
+
> export GEMINI_API_KEY="your_gemini_key"
|
|
193
248
|
> export ANTHROPIC_API_KEY="your_claude_key"
|
|
194
249
|
> ```
|
|
195
250
|
|
|
@@ -203,8 +258,12 @@ ragmint/
|
|
|
203
258
|
│ ├── pipeline.py
|
|
204
259
|
│ ├── retriever.py
|
|
205
260
|
│ ├── reranker.py
|
|
206
|
-
│ ├──
|
|
261
|
+
│ ├── embeddings.py
|
|
262
|
+
│ ├── chunking.py
|
|
207
263
|
│ └── evaluation.py
|
|
264
|
+
├── integrations/
|
|
265
|
+
│ ├── config_adapter.py
|
|
266
|
+
│ └── langchain_prebuilder.py
|
|
208
267
|
├── autotuner.py
|
|
209
268
|
├── explainer.py
|
|
210
269
|
├── leaderboard.py
|
|
@@ -240,16 +299,42 @@ Your `pyproject.toml` includes all required dependencies:
|
|
|
240
299
|
name = "ragmint"
|
|
241
300
|
version = "0.1.0"
|
|
242
301
|
dependencies = [
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
302
|
+
# Core ML + Embeddings
|
|
303
|
+
"numpy<2.0.0",
|
|
304
|
+
"pandas>=2.0",
|
|
305
|
+
"scikit-learn>=1.3",
|
|
306
|
+
"sentence-transformers>=2.2.2",
|
|
307
|
+
|
|
308
|
+
# Retrieval backends
|
|
309
|
+
"chromadb>=0.4",
|
|
310
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
311
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
312
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
313
|
+
|
|
314
|
+
# Optimization & evaluation
|
|
315
|
+
"optuna>=3.0",
|
|
316
|
+
"tqdm",
|
|
317
|
+
"colorama",
|
|
318
|
+
|
|
319
|
+
# RAG evaluation and data utils
|
|
320
|
+
"pyyaml",
|
|
321
|
+
"python-dotenv",
|
|
322
|
+
|
|
323
|
+
# Explainability and LLM APIs
|
|
324
|
+
"openai>=1.0.0",
|
|
325
|
+
"google-generativeai>=0.8.0",
|
|
326
|
+
"anthropic>=0.25.0",
|
|
327
|
+
|
|
328
|
+
# Integration / storage
|
|
329
|
+
"supabase>=2.4.0",
|
|
330
|
+
|
|
331
|
+
# Testing
|
|
332
|
+
"pytest",
|
|
333
|
+
|
|
334
|
+
# LangChain integration layer
|
|
335
|
+
"langchain>=0.2.5",
|
|
336
|
+
"langchain-community>=0.2.5",
|
|
337
|
+
"langchain-text-splitters>=0.2.1"
|
|
253
338
|
]
|
|
254
339
|
```
|
|
255
340
|
|
|
@@ -309,4 +394,4 @@ Licensed under the **Apache License 2.0** — free for personal, research, and c
|
|
|
309
394
|
|
|
310
395
|
**André Oliveira**
|
|
311
396
|
[andyolivers.com](https://andyolivers.com)
|
|
312
|
-
Data Scientist | AI Engineer
|
|
397
|
+
Data Scientist | AI Engineer
|
|
@@ -4,31 +4,58 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ragmint"
|
|
7
|
-
version = "0.2.3"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "A modular framework for evaluating and optimizing RAG pipelines."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "Apache License 2.0" }
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
|
|
13
13
|
]
|
|
14
|
-
keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
|
|
14
|
+
keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation", "chunking", "autotuning"]
|
|
15
15
|
requires-python = ">=3.9"
|
|
16
16
|
dependencies = [
|
|
17
|
-
|
|
17
|
+
# Core ML + Embeddings
|
|
18
|
+
"numpy<2.0.0",
|
|
18
19
|
"pandas>=2.0",
|
|
19
20
|
"scikit-learn>=1.3",
|
|
20
|
-
"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"chromadb>=0.4",
|
|
24
|
-
"faiss-cpu; sys_platform != 'darwin'",
|
|
21
|
+
"sentence-transformers>=2.2.2",
|
|
22
|
+
|
|
23
|
+
# Retrieval backends
|
|
24
|
+
"chromadb>=0.3.1",
|
|
25
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
26
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
27
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
28
|
+
|
|
29
|
+
# Optimization & evaluation
|
|
25
30
|
"optuna>=3.0",
|
|
26
|
-
"
|
|
31
|
+
"tqdm",
|
|
27
32
|
"colorama",
|
|
33
|
+
|
|
34
|
+
# RAG evaluation and data utils
|
|
35
|
+
"pyyaml",
|
|
36
|
+
"python-dotenv",
|
|
37
|
+
|
|
38
|
+
# Explainability and LLM APIs
|
|
39
|
+
"openai>=1.0.0",
|
|
28
40
|
"google-generativeai>=0.8.0",
|
|
29
|
-
"
|
|
41
|
+
"anthropic>=0.25.0",
|
|
42
|
+
|
|
43
|
+
# Integration / storage
|
|
44
|
+
"supabase>=2.4.0",
|
|
45
|
+
|
|
46
|
+
# Testing
|
|
47
|
+
"pytest",
|
|
48
|
+
|
|
49
|
+
# LangChain integration layer
|
|
50
|
+
"langchain>=0.2.5",
|
|
51
|
+
"langchain-community>=0.2.5",
|
|
52
|
+
"langchain-text-splitters>=0.2.1"
|
|
30
53
|
]
|
|
31
54
|
|
|
55
|
+
[project.optional-dependencies]
|
|
56
|
+
dev = ["black", "flake8", "isort", "pytest-cov"]
|
|
57
|
+
docs = ["mkdocs", "mkdocs-material"]
|
|
58
|
+
|
|
32
59
|
[project.urls]
|
|
33
60
|
Homepage = "https://github.com/andyolivers/ragmint"
|
|
34
61
|
Documentation = "https://andyolivers.com"
|
|
@@ -49,5 +76,4 @@ ragmint = ["experiments/*.json"]
|
|
|
49
76
|
|
|
50
77
|
[tool.pytest.ini_options]
|
|
51
78
|
testpaths = ["tests"]
|
|
52
|
-
addopts = "-v"
|
|
53
|
-
|
|
79
|
+
addopts = "-v --tb=short"
|