ragmint 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragmint-0.3.0 → ragmint-0.3.1}/PKG-INFO +99 -30
- ragmint-0.3.0/src/ragmint.egg-info/PKG-INFO → ragmint-0.3.1/README.md +77 -52
- {ragmint-0.3.0 → ragmint-0.3.1}/pyproject.toml +36 -12
- ragmint-0.3.1/src/ragmint/autotuner.py +138 -0
- ragmint-0.3.1/src/ragmint/core/chunking.py +86 -0
- ragmint-0.3.1/src/ragmint/core/pipeline.py +62 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/retriever.py +50 -33
- ragmint-0.3.1/src/ragmint/integrations/config_adapter.py +96 -0
- ragmint-0.3.1/src/ragmint/integrations/langchain_prebuilder.py +99 -0
- ragmint-0.3.1/src/ragmint/tests/test_autotuner.py +51 -0
- ragmint-0.3.1/src/ragmint/tests/test_config_adapter.py +39 -0
- ragmint-0.3.1/src/ragmint/tests/test_integration_autotuner_ragmint.py +47 -0
- ragmint-0.3.1/src/ragmint/tests/test_langchain_prebuilder.py +82 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tuner.py +27 -3
- ragmint-0.3.1/src/ragmint/utils/__init__.py +0 -0
- ragmint-0.3.0/README.md → ragmint-0.3.1/src/ragmint.egg-info/PKG-INFO +121 -22
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/SOURCES.txt +5 -0
- ragmint-0.3.1/src/ragmint.egg-info/requires.txt +35 -0
- ragmint-0.3.0/src/ragmint/autotuner.py +0 -33
- ragmint-0.3.0/src/ragmint/core/chunking.py +0 -22
- ragmint-0.3.0/src/ragmint/core/pipeline.py +0 -37
- ragmint-0.3.0/src/ragmint/tests/test_autotuner.py +0 -42
- ragmint-0.3.0/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -60
- ragmint-0.3.0/src/ragmint.egg-info/requires.txt +0 -17
- {ragmint-0.3.0 → ragmint-0.3.1}/LICENSE +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/MANIFEST.in +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/setup.cfg +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/__main__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/embeddings.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/evaluation.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/reranker.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/experiments/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/experiments/validation_qa.json +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/explainer.py +0 -0
- {ragmint-0.3.0/src/ragmint/optimization → ragmint-0.3.1/src/ragmint/integrations}/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/leaderboard.py +0 -0
- {ragmint-0.3.0/src/ragmint/tests → ragmint-0.3.1/src/ragmint/optimization}/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/optimization/search.py +0 -0
- {ragmint-0.3.0/src/ragmint/utils → ragmint-0.3.1/src/ragmint/tests}/__init__.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/conftest.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_embeddings.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_explainer.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_explainer_integration.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_leaderboard.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_pipeline.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_retriever.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_search.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_tuner.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/caching.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/data_loader.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/logger.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/metrics.py +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/dependency_links.txt +0 -0
- {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/top_level.txt +0 -0
|
@@ -1,31 +1,45 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragmint
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: A modular framework for evaluating and optimizing RAG pipelines.
|
|
5
5
|
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
|
|
6
6
|
License: Apache License 2.0
|
|
7
7
|
Project-URL: Homepage, https://github.com/andyolivers/ragmint
|
|
8
8
|
Project-URL: Documentation, https://andyolivers.com
|
|
9
9
|
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
|
|
10
|
-
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
|
|
10
|
+
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation,chunking,autotuning
|
|
11
11
|
Requires-Python: >=3.9
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: numpy<2.0.0
|
|
15
15
|
Requires-Dist: pandas>=2.0
|
|
16
16
|
Requires-Dist: scikit-learn>=1.3
|
|
17
|
-
Requires-Dist: openai>=1.0
|
|
18
|
-
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: pyyaml
|
|
20
|
-
Requires-Dist: chromadb>=0.4
|
|
17
|
+
Requires-Dist: sentence-transformers>=2.2.2
|
|
18
|
+
Requires-Dist: chromadb>=0.3.1
|
|
21
19
|
Requires-Dist: faiss-cpu; sys_platform != "darwin"
|
|
20
|
+
Requires-Dist: faiss-cpu==1.7.4; sys_platform == "darwin"
|
|
21
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
22
22
|
Requires-Dist: optuna>=3.0
|
|
23
|
-
Requires-Dist: pytest
|
|
23
|
+
Requires-Dist: tqdm
|
|
24
24
|
Requires-Dist: colorama
|
|
25
|
+
Requires-Dist: pyyaml
|
|
26
|
+
Requires-Dist: python-dotenv
|
|
27
|
+
Requires-Dist: openai>=1.0.0
|
|
25
28
|
Requires-Dist: google-generativeai>=0.8.0
|
|
29
|
+
Requires-Dist: anthropic>=0.25.0
|
|
26
30
|
Requires-Dist: supabase>=2.4.0
|
|
27
|
-
Requires-Dist: python-dotenv
|
|
28
|
-
Requires-Dist: sentence-transformers
|
|
31
|
+
Requires-Dist: pytest
|
|
32
|
+
Requires-Dist: langchain>=0.2.5
|
|
33
|
+
Requires-Dist: langchain-community>=0.2.5
|
|
34
|
+
Requires-Dist: langchain-text-splitters>=0.2.1
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: black; extra == "dev"
|
|
37
|
+
Requires-Dist: flake8; extra == "dev"
|
|
38
|
+
Requires-Dist: isort; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
29
43
|
Dynamic: license-file
|
|
30
44
|
|
|
31
45
|
# Ragmint
|
|
@@ -40,7 +54,7 @@ Dynamic: license-file
|
|
|
40
54
|
|
|
41
55
|
**Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
|
|
42
56
|
|
|
43
|
-
It provides a complete toolkit for **retriever selection**, **embedding model tuning**,
|
|
57
|
+
It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
|
|
44
58
|
|
|
45
59
|
---
|
|
46
60
|
|
|
@@ -55,6 +69,9 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
|
|
|
55
69
|
- 🧩 **Embeddings** — Hugging Face
|
|
56
70
|
- 💾 **Caching, experiment tracking, and reproducibility** out of the box
|
|
57
71
|
- 🧰 **Clean modular structure** for easy integration in research and production setups
|
|
72
|
+
- 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
|
|
73
|
+
- 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
|
|
74
|
+
- ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
|
|
58
75
|
|
|
59
76
|
---
|
|
60
77
|
|
|
@@ -83,6 +100,8 @@ Example `configs/default.yaml`:
|
|
|
83
100
|
```yaml
|
|
84
101
|
retriever: faiss
|
|
85
102
|
embedding_model: text-embedding-3-small
|
|
103
|
+
chunk_size: 500
|
|
104
|
+
overlap: 100
|
|
86
105
|
reranker:
|
|
87
106
|
mode: mmr
|
|
88
107
|
lambda_param: 0.5
|
|
@@ -96,33 +115,58 @@ optimization:
|
|
|
96
115
|
### 3️⃣ Manual Pipeline Usage
|
|
97
116
|
|
|
98
117
|
```python
|
|
118
|
+
from ragmint.prebuilder import PreBuilder
|
|
99
119
|
from ragmint.tuner import RAGMint
|
|
100
120
|
|
|
101
|
-
#
|
|
102
|
-
|
|
121
|
+
# Prebuild pipeline (chunking, embeddings, vector store)
|
|
122
|
+
prebuilder = PreBuilder(
|
|
103
123
|
docs_path="data/docs/",
|
|
104
|
-
|
|
105
|
-
embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
|
|
106
|
-
rerankers=["mmr"]
|
|
124
|
+
config_path="configs/default.yaml"
|
|
107
125
|
)
|
|
126
|
+
pipeline = prebuilder.build_pipeline()
|
|
108
127
|
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
validation_set=None,
|
|
112
|
-
metric="faithfulness",
|
|
113
|
-
trials=3
|
|
114
|
-
)
|
|
128
|
+
# Initialize RAGMint with prebuilt components
|
|
129
|
+
rag = RAGMint(pipeline=pipeline)
|
|
115
130
|
|
|
131
|
+
# Run optimization
|
|
132
|
+
best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
|
|
116
133
|
print("Best configuration:", best)
|
|
134
|
+
|
|
117
135
|
```
|
|
118
136
|
---
|
|
119
137
|
# 🧩 Embeddings and Retrievers
|
|
120
138
|
|
|
121
139
|
**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
|
|
122
140
|
|
|
141
|
+
---
|
|
142
|
+
## 🧩 Chunking System
|
|
143
|
+
|
|
144
|
+
* **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
|
|
145
|
+
* **Supports default values** if not provided in configuration.
|
|
146
|
+
* **Optimized** for downstream **retrieval and embeddings**.
|
|
147
|
+
* **Enables adaptive chunking strategies** in future releases.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
## 🧩 Langchain Config Adapter
|
|
151
|
+
|
|
152
|
+
* **Ensures consistent configuration** across pipeline components.
|
|
153
|
+
* **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
|
|
154
|
+
* **Adds default chunk parameters** when missing.
|
|
155
|
+
* **Validates retriever backends** and **raises clear errors** for unsupported options.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
## 🧩 Langchain Prebuilder
|
|
159
|
+
|
|
160
|
+
**Automates pipeline preparation:**
|
|
161
|
+
1. Reads documents
|
|
162
|
+
2. Applies chunking
|
|
163
|
+
3. Creates embeddings
|
|
164
|
+
4. Initializes retriever / vector store
|
|
165
|
+
5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
|
|
166
|
+
|
|
123
167
|
---
|
|
124
168
|
|
|
125
|
-
## 🔤 Available Embeddings (Hugging Face
|
|
169
|
+
## 🔤 Available Embeddings (Hugging Face)
|
|
126
170
|
|
|
127
171
|
You can select from the following models:
|
|
128
172
|
|
|
@@ -258,8 +302,12 @@ ragmint/
|
|
|
258
302
|
│ ├── pipeline.py
|
|
259
303
|
│ ├── retriever.py
|
|
260
304
|
│ ├── reranker.py
|
|
261
|
-
│ ├──
|
|
305
|
+
│ ├── embeddings.py
|
|
306
|
+
│ ├── chunking.py
|
|
262
307
|
│ └── evaluation.py
|
|
308
|
+
├── integrations/
|
|
309
|
+
│ ├── config_adapter.py
|
|
310
|
+
│ └── langchain_prebuilder.py
|
|
263
311
|
├── autotuner.py
|
|
264
312
|
├── explainer.py
|
|
265
313
|
├── leaderboard.py
|
|
@@ -295,21 +343,42 @@ Your `pyproject.toml` includes all required dependencies:
|
|
|
295
343
|
name = "ragmint"
|
|
296
344
|
version = "0.1.0"
|
|
297
345
|
dependencies = [
|
|
346
|
+
# Core ML + Embeddings
|
|
298
347
|
"numpy<2.0.0",
|
|
299
348
|
"pandas>=2.0",
|
|
300
349
|
"scikit-learn>=1.3",
|
|
301
|
-
"
|
|
302
|
-
|
|
303
|
-
|
|
350
|
+
"sentence-transformers>=2.2.2",
|
|
351
|
+
|
|
352
|
+
# Retrieval backends
|
|
304
353
|
"chromadb>=0.4",
|
|
305
|
-
"faiss-cpu; sys_platform != 'darwin'",
|
|
354
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
355
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
356
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
357
|
+
|
|
358
|
+
# Optimization & evaluation
|
|
306
359
|
"optuna>=3.0",
|
|
307
|
-
"
|
|
360
|
+
"tqdm",
|
|
308
361
|
"colorama",
|
|
362
|
+
|
|
363
|
+
# RAG evaluation and data utils
|
|
364
|
+
"pyyaml",
|
|
365
|
+
"python-dotenv",
|
|
366
|
+
|
|
367
|
+
# Explainability and LLM APIs
|
|
368
|
+
"openai>=1.0.0",
|
|
309
369
|
"google-generativeai>=0.8.0",
|
|
370
|
+
"anthropic>=0.25.0",
|
|
371
|
+
|
|
372
|
+
# Integration / storage
|
|
310
373
|
"supabase>=2.4.0",
|
|
311
|
-
|
|
312
|
-
|
|
374
|
+
|
|
375
|
+
# Testing
|
|
376
|
+
"pytest",
|
|
377
|
+
|
|
378
|
+
# LangChain integration layer
|
|
379
|
+
"langchain>=0.2.5",
|
|
380
|
+
"langchain-community>=0.2.5",
|
|
381
|
+
"langchain-text-splitters>=0.2.1"
|
|
313
382
|
]
|
|
314
383
|
```
|
|
315
384
|
|
|
@@ -1,33 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: ragmint
|
|
3
|
-
Version: 0.3.0
|
|
4
|
-
Summary: A modular framework for evaluating and optimizing RAG pipelines.
|
|
5
|
-
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
|
|
6
|
-
License: Apache License 2.0
|
|
7
|
-
Project-URL: Homepage, https://github.com/andyolivers/ragmint
|
|
8
|
-
Project-URL: Documentation, https://andyolivers.com
|
|
9
|
-
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
|
|
10
|
-
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
|
|
11
|
-
Requires-Python: >=3.9
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
License-File: LICENSE
|
|
14
|
-
Requires-Dist: numpy<2.0.0
|
|
15
|
-
Requires-Dist: pandas>=2.0
|
|
16
|
-
Requires-Dist: scikit-learn>=1.3
|
|
17
|
-
Requires-Dist: openai>=1.0
|
|
18
|
-
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: pyyaml
|
|
20
|
-
Requires-Dist: chromadb>=0.4
|
|
21
|
-
Requires-Dist: faiss-cpu; sys_platform != "darwin"
|
|
22
|
-
Requires-Dist: optuna>=3.0
|
|
23
|
-
Requires-Dist: pytest
|
|
24
|
-
Requires-Dist: colorama
|
|
25
|
-
Requires-Dist: google-generativeai>=0.8.0
|
|
26
|
-
Requires-Dist: supabase>=2.4.0
|
|
27
|
-
Requires-Dist: python-dotenv
|
|
28
|
-
Requires-Dist: sentence-transformers
|
|
29
|
-
Dynamic: license-file
|
|
30
|
-
|
|
31
1
|
# Ragmint
|
|
32
2
|
|
|
33
3
|

|
|
@@ -40,7 +10,7 @@ Dynamic: license-file
|
|
|
40
10
|
|
|
41
11
|
**Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
|
|
42
12
|
|
|
43
|
-
It provides a complete toolkit for **retriever selection**, **embedding model tuning**,
|
|
13
|
+
It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
|
|
44
14
|
|
|
45
15
|
---
|
|
46
16
|
|
|
@@ -55,6 +25,9 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
|
|
|
55
25
|
- 🧩 **Embeddings** — Hugging Face
|
|
56
26
|
- 💾 **Caching, experiment tracking, and reproducibility** out of the box
|
|
57
27
|
- 🧰 **Clean modular structure** for easy integration in research and production setups
|
|
28
|
+
- 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
|
|
29
|
+
- 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
|
|
30
|
+
- ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
|
|
58
31
|
|
|
59
32
|
---
|
|
60
33
|
|
|
@@ -83,6 +56,8 @@ Example `configs/default.yaml`:
|
|
|
83
56
|
```yaml
|
|
84
57
|
retriever: faiss
|
|
85
58
|
embedding_model: text-embedding-3-small
|
|
59
|
+
chunk_size: 500
|
|
60
|
+
overlap: 100
|
|
86
61
|
reranker:
|
|
87
62
|
mode: mmr
|
|
88
63
|
lambda_param: 0.5
|
|
@@ -96,24 +71,23 @@ optimization:
|
|
|
96
71
|
### 3️⃣ Manual Pipeline Usage
|
|
97
72
|
|
|
98
73
|
```python
|
|
74
|
+
from ragmint.prebuilder import PreBuilder
|
|
99
75
|
from ragmint.tuner import RAGMint
|
|
100
76
|
|
|
101
|
-
#
|
|
102
|
-
|
|
77
|
+
# Prebuild pipeline (chunking, embeddings, vector store)
|
|
78
|
+
prebuilder = PreBuilder(
|
|
103
79
|
docs_path="data/docs/",
|
|
104
|
-
|
|
105
|
-
embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
|
|
106
|
-
rerankers=["mmr"]
|
|
80
|
+
config_path="configs/default.yaml"
|
|
107
81
|
)
|
|
82
|
+
pipeline = prebuilder.build_pipeline()
|
|
108
83
|
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
validation_set=None,
|
|
112
|
-
metric="faithfulness",
|
|
113
|
-
trials=3
|
|
114
|
-
)
|
|
84
|
+
# Initialize RAGMint with prebuilt components
|
|
85
|
+
rag = RAGMint(pipeline=pipeline)
|
|
115
86
|
|
|
87
|
+
# Run optimization
|
|
88
|
+
best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
|
|
116
89
|
print("Best configuration:", best)
|
|
90
|
+
|
|
117
91
|
```
|
|
118
92
|
---
|
|
119
93
|
# 🧩 Embeddings and Retrievers
|
|
@@ -121,8 +95,34 @@ print("Best configuration:", best)
|
|
|
121
95
|
**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
|
|
122
96
|
|
|
123
97
|
---
|
|
98
|
+
## 🧩 Chunking System
|
|
99
|
+
|
|
100
|
+
* **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
|
|
101
|
+
* **Supports default values** if not provided in configuration.
|
|
102
|
+
* **Optimized** for downstream **retrieval and embeddings**.
|
|
103
|
+
* **Enables adaptive chunking strategies** in future releases.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
## 🧩 Langchain Config Adapter
|
|
107
|
+
|
|
108
|
+
* **Ensures consistent configuration** across pipeline components.
|
|
109
|
+
* **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
|
|
110
|
+
* **Adds default chunk parameters** when missing.
|
|
111
|
+
* **Validates retriever backends** and **raises clear errors** for unsupported options.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
## 🧩 Langchain Prebuilder
|
|
124
115
|
|
|
125
|
-
|
|
116
|
+
**Automates pipeline preparation:**
|
|
117
|
+
1. Reads documents
|
|
118
|
+
2. Applies chunking
|
|
119
|
+
3. Creates embeddings
|
|
120
|
+
4. Initializes retriever / vector store
|
|
121
|
+
5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 🔤 Available Embeddings (Hugging Face)
|
|
126
126
|
|
|
127
127
|
You can select from the following models:
|
|
128
128
|
|
|
@@ -258,8 +258,12 @@ ragmint/
|
|
|
258
258
|
│ ├── pipeline.py
|
|
259
259
|
│ ├── retriever.py
|
|
260
260
|
│ ├── reranker.py
|
|
261
|
-
│ ├──
|
|
261
|
+
│ ├── embeddings.py
|
|
262
|
+
│ ├── chunking.py
|
|
262
263
|
│ └── evaluation.py
|
|
264
|
+
├── integrations/
|
|
265
|
+
│ ├── config_adapter.py
|
|
266
|
+
│ └── langchain_prebuilder.py
|
|
263
267
|
├── autotuner.py
|
|
264
268
|
├── explainer.py
|
|
265
269
|
├── leaderboard.py
|
|
@@ -295,21 +299,42 @@ Your `pyproject.toml` includes all required dependencies:
|
|
|
295
299
|
name = "ragmint"
|
|
296
300
|
version = "0.1.0"
|
|
297
301
|
dependencies = [
|
|
302
|
+
# Core ML + Embeddings
|
|
298
303
|
"numpy<2.0.0",
|
|
299
304
|
"pandas>=2.0",
|
|
300
305
|
"scikit-learn>=1.3",
|
|
301
|
-
"
|
|
302
|
-
|
|
303
|
-
|
|
306
|
+
"sentence-transformers>=2.2.2",
|
|
307
|
+
|
|
308
|
+
# Retrieval backends
|
|
304
309
|
"chromadb>=0.4",
|
|
305
|
-
"faiss-cpu; sys_platform != 'darwin'",
|
|
310
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
311
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
312
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
313
|
+
|
|
314
|
+
# Optimization & evaluation
|
|
306
315
|
"optuna>=3.0",
|
|
307
|
-
"
|
|
316
|
+
"tqdm",
|
|
308
317
|
"colorama",
|
|
318
|
+
|
|
319
|
+
# RAG evaluation and data utils
|
|
320
|
+
"pyyaml",
|
|
321
|
+
"python-dotenv",
|
|
322
|
+
|
|
323
|
+
# Explainability and LLM APIs
|
|
324
|
+
"openai>=1.0.0",
|
|
309
325
|
"google-generativeai>=0.8.0",
|
|
326
|
+
"anthropic>=0.25.0",
|
|
327
|
+
|
|
328
|
+
# Integration / storage
|
|
310
329
|
"supabase>=2.4.0",
|
|
311
|
-
|
|
312
|
-
|
|
330
|
+
|
|
331
|
+
# Testing
|
|
332
|
+
"pytest",
|
|
333
|
+
|
|
334
|
+
# LangChain integration layer
|
|
335
|
+
"langchain>=0.2.5",
|
|
336
|
+
"langchain-community>=0.2.5",
|
|
337
|
+
"langchain-text-splitters>=0.2.1"
|
|
313
338
|
]
|
|
314
339
|
```
|
|
315
340
|
|
|
@@ -369,4 +394,4 @@ Licensed under the **Apache License 2.0** — free for personal, research, and c
|
|
|
369
394
|
|
|
370
395
|
**André Oliveira**
|
|
371
396
|
[andyolivers.com](https://andyolivers.com)
|
|
372
|
-
Data Scientist | AI Engineer
|
|
397
|
+
Data Scientist | AI Engineer
|
|
@@ -4,33 +4,58 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ragmint"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "A modular framework for evaluating and optimizing RAG pipelines."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "Apache License 2.0" }
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
|
|
13
13
|
]
|
|
14
|
-
keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
|
|
14
|
+
keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation", "chunking", "autotuning"]
|
|
15
15
|
requires-python = ">=3.9"
|
|
16
16
|
dependencies = [
|
|
17
|
+
# Core ML + Embeddings
|
|
17
18
|
"numpy<2.0.0",
|
|
18
19
|
"pandas>=2.0",
|
|
19
20
|
"scikit-learn>=1.3",
|
|
20
|
-
"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"chromadb>=0.4",
|
|
24
|
-
"faiss-cpu; sys_platform != 'darwin'",
|
|
21
|
+
"sentence-transformers>=2.2.2",
|
|
22
|
+
|
|
23
|
+
# Retrieval backends
|
|
24
|
+
"chromadb>=0.3.1",
|
|
25
|
+
"faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
|
|
26
|
+
"faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
|
|
27
|
+
"rank-bm25>=0.2.2", # For BM25 retriever
|
|
28
|
+
|
|
29
|
+
# Optimization & evaluation
|
|
25
30
|
"optuna>=3.0",
|
|
26
|
-
"pytest",
|
|
31
|
+
"tqdm",
|
|
27
32
|
"colorama",
|
|
33
|
+
|
|
34
|
+
# RAG evaluation and data utils
|
|
35
|
+
"pyyaml",
|
|
36
|
+
"python-dotenv",
|
|
37
|
+
|
|
38
|
+
# Explainability and LLM APIs
|
|
39
|
+
"openai>=1.0.0",
|
|
28
40
|
"google-generativeai>=0.8.0",
|
|
41
|
+
"anthropic>=0.25.0",
|
|
42
|
+
|
|
43
|
+
# Integration / storage
|
|
29
44
|
"supabase>=2.4.0",
|
|
30
|
-
|
|
31
|
-
|
|
45
|
+
|
|
46
|
+
# Testing
|
|
47
|
+
"pytest",
|
|
48
|
+
|
|
49
|
+
# LangChain integration layer
|
|
50
|
+
"langchain>=0.2.5",
|
|
51
|
+
"langchain-community>=0.2.5",
|
|
52
|
+
"langchain-text-splitters>=0.2.1"
|
|
32
53
|
]
|
|
33
54
|
|
|
55
|
+
[project.optional-dependencies]
|
|
56
|
+
dev = ["black", "flake8", "isort", "pytest-cov"]
|
|
57
|
+
docs = ["mkdocs", "mkdocs-material"]
|
|
58
|
+
|
|
34
59
|
[project.urls]
|
|
35
60
|
Homepage = "https://github.com/andyolivers/ragmint"
|
|
36
61
|
Documentation = "https://andyolivers.com"
|
|
@@ -51,5 +76,4 @@ ragmint = ["experiments/*.json"]
|
|
|
51
76
|
|
|
52
77
|
[tool.pytest.ini_options]
|
|
53
78
|
testpaths = ["tests"]
|
|
54
|
-
addopts = "-v"
|
|
55
|
-
|
|
79
|
+
addopts = "-v --tb=short"
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Auto-RAG Tuner
|
|
3
|
+
--------------
|
|
4
|
+
Automatically recommends and optimizes RAG configurations based on corpus statistics.
|
|
5
|
+
Integrates with RAGMint to perform full end-to-end tuning.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import logging
|
|
10
|
+
from statistics import mean
|
|
11
|
+
from typing import Dict, Any, Tuple, List
|
|
12
|
+
|
|
13
|
+
from .tuner import RAGMint
|
|
14
|
+
from .core.evaluation import evaluate_config
|
|
15
|
+
|
|
16
|
+
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AutoRAGTuner:
|
|
20
|
+
def __init__(self, docs_path: str):
|
|
21
|
+
"""
|
|
22
|
+
AutoRAGTuner automatically analyzes a corpus and runs an optimized RAG tuning pipeline.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
docs_path (str): Path to the directory containing documents (.txt, .md, .rst)
|
|
26
|
+
"""
|
|
27
|
+
self.docs_path = docs_path
|
|
28
|
+
self.corpus_stats = self._analyze_corpus()
|
|
29
|
+
|
|
30
|
+
# -----------------------------
|
|
31
|
+
# Corpus Analysis
|
|
32
|
+
# -----------------------------
|
|
33
|
+
def _analyze_corpus(self) -> Dict[str, Any]:
|
|
34
|
+
"""Compute corpus size, average length, and number of documents."""
|
|
35
|
+
docs = []
|
|
36
|
+
total_chars = 0
|
|
37
|
+
num_docs = 0
|
|
38
|
+
|
|
39
|
+
if not os.path.exists(self.docs_path):
|
|
40
|
+
logging.warning(f"⚠️ Corpus path not found: {self.docs_path}")
|
|
41
|
+
return {"size": 0, "avg_len": 0, "num_docs": 0}
|
|
42
|
+
|
|
43
|
+
for file in os.listdir(self.docs_path):
|
|
44
|
+
if file.endswith((".txt", ".md", ".rst")):
|
|
45
|
+
with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
|
|
46
|
+
content = f.read()
|
|
47
|
+
docs.append(content)
|
|
48
|
+
total_chars += len(content)
|
|
49
|
+
num_docs += 1
|
|
50
|
+
|
|
51
|
+
avg_len = int(mean([len(d) for d in docs])) if docs else 0
|
|
52
|
+
stats = {"size": total_chars, "avg_len": avg_len, "num_docs": num_docs}
|
|
53
|
+
logging.info(f"📊 Corpus stats: {stats}")
|
|
54
|
+
return stats
|
|
55
|
+
|
|
56
|
+
# -----------------------------
|
|
57
|
+
# Recommendation Logic
|
|
58
|
+
# -----------------------------
|
|
59
|
+
def recommend(self) -> Dict[str, Any]:
|
|
60
|
+
"""Recommend retriever, embedding, and chunking based on corpus stats."""
|
|
61
|
+
size = self.corpus_stats.get("size", 0)
|
|
62
|
+
avg_len = self.corpus_stats.get("avg_len", 0)
|
|
63
|
+
num_docs = self.corpus_stats.get("num_docs", 0)
|
|
64
|
+
|
|
65
|
+
# Heuristic-based tuning
|
|
66
|
+
# Determine chunking heuristics first
|
|
67
|
+
if avg_len < 200:
|
|
68
|
+
chunk_size, overlap = 300, 50
|
|
69
|
+
elif avg_len < 500:
|
|
70
|
+
chunk_size, overlap = 500, 100
|
|
71
|
+
else:
|
|
72
|
+
chunk_size, overlap = 800, 150
|
|
73
|
+
|
|
74
|
+
# Determine retriever–embedding based on corpus size
|
|
75
|
+
if size <= 2000:
|
|
76
|
+
retriever = "BM25"
|
|
77
|
+
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
|
|
78
|
+
elif size <= 10000:
|
|
79
|
+
retriever = "Chroma"
|
|
80
|
+
embedding_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
|
81
|
+
else:
|
|
82
|
+
retriever = "FAISS"
|
|
83
|
+
embedding_model = "sentence-transformers/all-mpnet-base-v2"
|
|
84
|
+
|
|
85
|
+
strategy = "fixed" if avg_len < 400 else "sentence"
|
|
86
|
+
|
|
87
|
+
recommendation = {
|
|
88
|
+
"retriever": retriever,
|
|
89
|
+
"embedding_model": embedding_model,
|
|
90
|
+
"chunk_size": chunk_size,
|
|
91
|
+
"overlap": overlap,
|
|
92
|
+
"strategy": strategy,
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
logging.info(f"🔮 AutoRAG Recommendation: {recommendation}")
|
|
96
|
+
return recommendation
|
|
97
|
+
|
|
98
|
+
# -----------------------------
|
|
99
|
+
# Full Auto-Tuning
|
|
100
|
+
# -----------------------------
|
|
101
|
+
def auto_tune(
|
|
102
|
+
self,
|
|
103
|
+
validation_set: str = None,
|
|
104
|
+
metric: str = "faithfulness",
|
|
105
|
+
trials: int = 5,
|
|
106
|
+
search_type: str = "random",
|
|
107
|
+
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
|
|
108
|
+
"""
|
|
109
|
+
Run a full automatic optimization using RAGMint.
|
|
110
|
+
|
|
111
|
+
Automatically:
|
|
112
|
+
- Recommends initial config (retriever, embedding, chunking)
|
|
113
|
+
- Launches RAGMint optimization trials
|
|
114
|
+
- Returns best configuration and results
|
|
115
|
+
"""
|
|
116
|
+
rec = self.recommend()
|
|
117
|
+
|
|
118
|
+
logging.info("🚀 Launching full AutoRAG optimization with RAGMint")
|
|
119
|
+
|
|
120
|
+
tuner = RAGMint(
|
|
121
|
+
docs_path=self.docs_path,
|
|
122
|
+
retrievers=[rec["retriever"]],
|
|
123
|
+
embeddings=[rec["embedding_model"]],
|
|
124
|
+
rerankers=["mmr"],
|
|
125
|
+
chunk_sizes=[rec["chunk_size"]],
|
|
126
|
+
overlaps=[rec["overlap"]],
|
|
127
|
+
strategies=[rec["strategy"]],
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
best, results = tuner.optimize(
|
|
131
|
+
validation_set=validation_set,
|
|
132
|
+
metric=metric,
|
|
133
|
+
trials=trials,
|
|
134
|
+
search_type=search_type,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
logging.info(f"🏁 AutoRAG tuning complete. Best: {best}")
|
|
138
|
+
return best, results
|