ragmint 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {ragmint-0.3.0 → ragmint-0.3.1}/PKG-INFO +99 -30
  2. ragmint-0.3.0/src/ragmint.egg-info/PKG-INFO → ragmint-0.3.1/README.md +77 -52
  3. {ragmint-0.3.0 → ragmint-0.3.1}/pyproject.toml +36 -12
  4. ragmint-0.3.1/src/ragmint/autotuner.py +138 -0
  5. ragmint-0.3.1/src/ragmint/core/chunking.py +86 -0
  6. ragmint-0.3.1/src/ragmint/core/pipeline.py +62 -0
  7. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/retriever.py +50 -33
  8. ragmint-0.3.1/src/ragmint/integrations/config_adapter.py +96 -0
  9. ragmint-0.3.1/src/ragmint/integrations/langchain_prebuilder.py +99 -0
  10. ragmint-0.3.1/src/ragmint/tests/test_autotuner.py +51 -0
  11. ragmint-0.3.1/src/ragmint/tests/test_config_adapter.py +39 -0
  12. ragmint-0.3.1/src/ragmint/tests/test_integration_autotuner_ragmint.py +47 -0
  13. ragmint-0.3.1/src/ragmint/tests/test_langchain_prebuilder.py +82 -0
  14. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tuner.py +27 -3
  15. ragmint-0.3.1/src/ragmint/utils/__init__.py +0 -0
  16. ragmint-0.3.0/README.md → ragmint-0.3.1/src/ragmint.egg-info/PKG-INFO +121 -22
  17. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/SOURCES.txt +5 -0
  18. ragmint-0.3.1/src/ragmint.egg-info/requires.txt +35 -0
  19. ragmint-0.3.0/src/ragmint/autotuner.py +0 -33
  20. ragmint-0.3.0/src/ragmint/core/chunking.py +0 -22
  21. ragmint-0.3.0/src/ragmint/core/pipeline.py +0 -37
  22. ragmint-0.3.0/src/ragmint/tests/test_autotuner.py +0 -42
  23. ragmint-0.3.0/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -60
  24. ragmint-0.3.0/src/ragmint.egg-info/requires.txt +0 -17
  25. {ragmint-0.3.0 → ragmint-0.3.1}/LICENSE +0 -0
  26. {ragmint-0.3.0 → ragmint-0.3.1}/MANIFEST.in +0 -0
  27. {ragmint-0.3.0 → ragmint-0.3.1}/setup.cfg +0 -0
  28. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/__init__.py +0 -0
  29. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/__main__.py +0 -0
  30. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/__init__.py +0 -0
  31. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/embeddings.py +0 -0
  32. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/evaluation.py +0 -0
  33. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/core/reranker.py +0 -0
  34. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/experiments/__init__.py +0 -0
  35. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/experiments/validation_qa.json +0 -0
  36. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/explainer.py +0 -0
  37. {ragmint-0.3.0/src/ragmint/optimization → ragmint-0.3.1/src/ragmint/integrations}/__init__.py +0 -0
  38. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/leaderboard.py +0 -0
  39. {ragmint-0.3.0/src/ragmint/tests → ragmint-0.3.1/src/ragmint/optimization}/__init__.py +0 -0
  40. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/optimization/search.py +0 -0
  41. {ragmint-0.3.0/src/ragmint/utils → ragmint-0.3.1/src/ragmint/tests}/__init__.py +0 -0
  42. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/conftest.py +0 -0
  43. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_embeddings.py +0 -0
  44. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_explainer.py +0 -0
  45. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_explainer_integration.py +0 -0
  46. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_leaderboard.py +0 -0
  47. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_pipeline.py +0 -0
  48. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_retriever.py +0 -0
  49. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_search.py +0 -0
  50. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/tests/test_tuner.py +0 -0
  51. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/caching.py +0 -0
  52. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/data_loader.py +0 -0
  53. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/logger.py +0 -0
  54. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint/utils/metrics.py +0 -0
  55. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/dependency_links.txt +0 -0
  56. {ragmint-0.3.0 → ragmint-0.3.1}/src/ragmint.egg-info/top_level.txt +0 -0
@@ -1,31 +1,45 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragmint
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
5
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
6
  License: Apache License 2.0
7
7
  Project-URL: Homepage, https://github.com/andyolivers/ragmint
8
8
  Project-URL: Documentation, https://andyolivers.com
9
9
  Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
10
- Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
10
+ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation,chunking,autotuning
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numpy<2.0.0
15
15
  Requires-Dist: pandas>=2.0
16
16
  Requires-Dist: scikit-learn>=1.3
17
- Requires-Dist: openai>=1.0
18
- Requires-Dist: tqdm
19
- Requires-Dist: pyyaml
20
- Requires-Dist: chromadb>=0.4
17
+ Requires-Dist: sentence-transformers>=2.2.2
18
+ Requires-Dist: chromadb>=0.3.1
21
19
  Requires-Dist: faiss-cpu; sys_platform != "darwin"
20
+ Requires-Dist: faiss-cpu==1.7.4; sys_platform == "darwin"
21
+ Requires-Dist: rank-bm25>=0.2.2
22
22
  Requires-Dist: optuna>=3.0
23
- Requires-Dist: pytest
23
+ Requires-Dist: tqdm
24
24
  Requires-Dist: colorama
25
+ Requires-Dist: pyyaml
26
+ Requires-Dist: python-dotenv
27
+ Requires-Dist: openai>=1.0.0
25
28
  Requires-Dist: google-generativeai>=0.8.0
29
+ Requires-Dist: anthropic>=0.25.0
26
30
  Requires-Dist: supabase>=2.4.0
27
- Requires-Dist: python-dotenv
28
- Requires-Dist: sentence-transformers
31
+ Requires-Dist: pytest
32
+ Requires-Dist: langchain>=0.2.5
33
+ Requires-Dist: langchain-community>=0.2.5
34
+ Requires-Dist: langchain-text-splitters>=0.2.1
35
+ Provides-Extra: dev
36
+ Requires-Dist: black; extra == "dev"
37
+ Requires-Dist: flake8; extra == "dev"
38
+ Requires-Dist: isort; extra == "dev"
39
+ Requires-Dist: pytest-cov; extra == "dev"
40
+ Provides-Extra: docs
41
+ Requires-Dist: mkdocs; extra == "docs"
42
+ Requires-Dist: mkdocs-material; extra == "docs"
29
43
  Dynamic: license-file
30
44
 
31
45
  # Ragmint
@@ -40,7 +54,7 @@ Dynamic: license-file
40
54
 
41
55
  **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
42
56
 
43
- It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
57
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
44
58
 
45
59
  ---
46
60
 
@@ -55,6 +69,9 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
55
69
  - 🧩 **Embeddings** — Hugging Face
56
70
  - 💾 **Caching, experiment tracking, and reproducibility** out of the box
57
71
  - 🧰 **Clean modular structure** for easy integration in research and production setups
72
+ - 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
73
+ - 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
74
+ - ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
58
75
 
59
76
  ---
60
77
 
@@ -83,6 +100,8 @@ Example `configs/default.yaml`:
83
100
  ```yaml
84
101
  retriever: faiss
85
102
  embedding_model: text-embedding-3-small
103
+ chunk_size: 500
104
+ overlap: 100
86
105
  reranker:
87
106
  mode: mmr
88
107
  lambda_param: 0.5
@@ -96,33 +115,58 @@ optimization:
96
115
  ### 3️⃣ Manual Pipeline Usage
97
116
 
98
117
  ```python
118
+ from ragmint.prebuilder import PreBuilder
99
119
  from ragmint.tuner import RAGMint
100
120
 
101
- # Initialize RAGMint with available components
102
- rag = RAGMint(
121
+ # Prebuild pipeline (chunking, embeddings, vector store)
122
+ prebuilder = PreBuilder(
103
123
  docs_path="data/docs/",
104
- retrievers=["faiss", "chroma", "sklearn"],
105
- embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
106
- rerankers=["mmr"]
124
+ config_path="configs/default.yaml"
107
125
  )
126
+ pipeline = prebuilder.build_pipeline()
108
127
 
109
- # Run optimization over 3 trials using the default validation set
110
- best, results = rag.optimize(
111
- validation_set=None,
112
- metric="faithfulness",
113
- trials=3
114
- )
128
+ # Initialize RAGMint with prebuilt components
129
+ rag = RAGMint(pipeline=pipeline)
115
130
 
131
+ # Run optimization
132
+ best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
116
133
  print("Best configuration:", best)
134
+
117
135
  ```
118
136
  ---
119
137
  # 🧩 Embeddings and Retrievers
120
138
 
121
139
  **Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
122
140
 
141
+ ---
142
+ ## 🧩 Chunking System
143
+
144
+ * **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
145
+ * **Supports default values** if not provided in configuration.
146
+ * **Optimized** for downstream **retrieval and embeddings**.
147
+ * **Enables adaptive chunking strategies** in future releases.
148
+
149
+ ---
150
+ ## 🧩 Langchain Config Adapter
151
+
152
+ * **Ensures consistent configuration** across pipeline components.
153
+ * **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
154
+ * **Adds default chunk parameters** when missing.
155
+ * **Validates retriever backends** and **raises clear errors** for unsupported options.
156
+
157
+ ---
158
+ ## 🧩 Langchain Prebuilder
159
+
160
+ **Automates pipeline preparation:**
161
+ 1. Reads documents
162
+ 2. Applies chunking
163
+ 3. Creates embeddings
164
+ 4. Initializes retriever / vector store
165
+ 5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
166
+
123
167
  ---
124
168
 
125
- ## 🔤 Available Embeddings (Hugging Face / OpenAI)
169
+ ## 🔤 Available Embeddings (Hugging Face)
126
170
 
127
171
  You can select from the following models:
128
172
 
@@ -258,8 +302,12 @@ ragmint/
258
302
  │ ├── pipeline.py
259
303
  │ ├── retriever.py
260
304
  │ ├── reranker.py
261
- │ ├── embedding.py
305
+ │ ├── embeddings.py
306
+ │ ├── chunking.py
262
307
  │ └── evaluation.py
308
+ ├── integrations/
309
+ │ ├── config_adapter.py
310
+ │ └── langchain_prebuilder.py
263
311
  ├── autotuner.py
264
312
  ├── explainer.py
265
313
  ├── leaderboard.py
@@ -295,21 +343,42 @@ Your `pyproject.toml` includes all required dependencies:
295
343
  name = "ragmint"
296
344
  version = "0.1.0"
297
345
  dependencies = [
346
+ # Core ML + Embeddings
298
347
  "numpy<2.0.0",
299
348
  "pandas>=2.0",
300
349
  "scikit-learn>=1.3",
301
- "openai>=1.0",
302
- "tqdm",
303
- "pyyaml",
350
+ "sentence-transformers>=2.2.2",
351
+
352
+ # Retrieval backends
304
353
  "chromadb>=0.4",
305
- "faiss-cpu; sys_platform != 'darwin'",
354
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
355
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
356
+ "rank-bm25>=0.2.2", # For BM25 retriever
357
+
358
+ # Optimization & evaluation
306
359
  "optuna>=3.0",
307
- "pytest",
360
+ "tqdm",
308
361
  "colorama",
362
+
363
+ # RAG evaluation and data utils
364
+ "pyyaml",
365
+ "python-dotenv",
366
+
367
+ # Explainability and LLM APIs
368
+ "openai>=1.0.0",
309
369
  "google-generativeai>=0.8.0",
370
+ "anthropic>=0.25.0",
371
+
372
+ # Integration / storage
310
373
  "supabase>=2.4.0",
311
- "python-dotenv",
312
- "sentence-transformers"
374
+
375
+ # Testing
376
+ "pytest",
377
+
378
+ # LangChain integration layer
379
+ "langchain>=0.2.5",
380
+ "langchain-community>=0.2.5",
381
+ "langchain-text-splitters>=0.2.1"
313
382
  ]
314
383
  ```
315
384
 
@@ -1,33 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: ragmint
3
- Version: 0.3.0
4
- Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
- Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
- License: Apache License 2.0
7
- Project-URL: Homepage, https://github.com/andyolivers/ragmint
8
- Project-URL: Documentation, https://andyolivers.com
9
- Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
10
- Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
11
- Requires-Python: >=3.9
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: numpy<2.0.0
15
- Requires-Dist: pandas>=2.0
16
- Requires-Dist: scikit-learn>=1.3
17
- Requires-Dist: openai>=1.0
18
- Requires-Dist: tqdm
19
- Requires-Dist: pyyaml
20
- Requires-Dist: chromadb>=0.4
21
- Requires-Dist: faiss-cpu; sys_platform != "darwin"
22
- Requires-Dist: optuna>=3.0
23
- Requires-Dist: pytest
24
- Requires-Dist: colorama
25
- Requires-Dist: google-generativeai>=0.8.0
26
- Requires-Dist: supabase>=2.4.0
27
- Requires-Dist: python-dotenv
28
- Requires-Dist: sentence-transformers
29
- Dynamic: license-file
30
-
31
1
  # Ragmint
32
2
 
33
3
  ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
@@ -40,7 +10,7 @@ Dynamic: license-file
40
10
 
41
11
  **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
42
12
 
43
- It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
13
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
44
14
 
45
15
  ---
46
16
 
@@ -55,6 +25,9 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
55
25
  - 🧩 **Embeddings** — Hugging Face
56
26
  - 💾 **Caching, experiment tracking, and reproducibility** out of the box
57
27
  - 🧰 **Clean modular structure** for easy integration in research and production setups
28
+ - 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
29
+ - 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
30
+ - ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
58
31
 
59
32
  ---
60
33
 
@@ -83,6 +56,8 @@ Example `configs/default.yaml`:
83
56
  ```yaml
84
57
  retriever: faiss
85
58
  embedding_model: text-embedding-3-small
59
+ chunk_size: 500
60
+ overlap: 100
86
61
  reranker:
87
62
  mode: mmr
88
63
  lambda_param: 0.5
@@ -96,24 +71,23 @@ optimization:
96
71
  ### 3️⃣ Manual Pipeline Usage
97
72
 
98
73
  ```python
74
+ from ragmint.prebuilder import PreBuilder
99
75
  from ragmint.tuner import RAGMint
100
76
 
101
- # Initialize RAGMint with available components
102
- rag = RAGMint(
77
+ # Prebuild pipeline (chunking, embeddings, vector store)
78
+ prebuilder = PreBuilder(
103
79
  docs_path="data/docs/",
104
- retrievers=["faiss", "chroma", "sklearn"],
105
- embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
106
- rerankers=["mmr"]
80
+ config_path="configs/default.yaml"
107
81
  )
82
+ pipeline = prebuilder.build_pipeline()
108
83
 
109
- # Run optimization over 3 trials using the default validation set
110
- best, results = rag.optimize(
111
- validation_set=None,
112
- metric="faithfulness",
113
- trials=3
114
- )
84
+ # Initialize RAGMint with prebuilt components
85
+ rag = RAGMint(pipeline=pipeline)
115
86
 
87
+ # Run optimization
88
+ best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
116
89
  print("Best configuration:", best)
90
+
117
91
  ```
118
92
  ---
119
93
  # 🧩 Embeddings and Retrievers
@@ -121,8 +95,34 @@ print("Best configuration:", best)
121
95
  **Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
122
96
 
123
97
  ---
98
+ ## 🧩 Chunking System
99
+
100
+ * **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
101
+ * **Supports default values** if not provided in configuration.
102
+ * **Optimized** for downstream **retrieval and embeddings**.
103
+ * **Enables adaptive chunking strategies** in future releases.
104
+
105
+ ---
106
+ ## 🧩 Langchain Config Adapter
107
+
108
+ * **Ensures consistent configuration** across pipeline components.
109
+ * **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
110
+ * **Adds default chunk parameters** when missing.
111
+ * **Validates retriever backends** and **raises clear errors** for unsupported options.
112
+
113
+ ---
114
+ ## 🧩 Langchain Prebuilder
124
115
 
125
- ## 🔤 Available Embeddings (Hugging Face / OpenAI)
116
+ **Automates pipeline preparation:**
117
+ 1. Reads documents
118
+ 2. Applies chunking
119
+ 3. Creates embeddings
120
+ 4. Initializes retriever / vector store
121
+ 5. Returns a **ready-to-use pipeline** for RAGMint or custom usage.
122
+
123
+ ---
124
+
125
+ ## 🔤 Available Embeddings (Hugging Face)
126
126
 
127
127
  You can select from the following models:
128
128
 
@@ -258,8 +258,12 @@ ragmint/
258
258
  │ ├── pipeline.py
259
259
  │ ├── retriever.py
260
260
  │ ├── reranker.py
261
- │ ├── embedding.py
261
+ │ ├── embeddings.py
262
+ │ ├── chunking.py
262
263
  │ └── evaluation.py
264
+ ├── integrations/
265
+ │ ├── config_adapter.py
266
+ │ └── langchain_prebuilder.py
263
267
  ├── autotuner.py
264
268
  ├── explainer.py
265
269
  ├── leaderboard.py
@@ -295,21 +299,42 @@ Your `pyproject.toml` includes all required dependencies:
295
299
  name = "ragmint"
296
300
  version = "0.1.0"
297
301
  dependencies = [
302
+ # Core ML + Embeddings
298
303
  "numpy<2.0.0",
299
304
  "pandas>=2.0",
300
305
  "scikit-learn>=1.3",
301
- "openai>=1.0",
302
- "tqdm",
303
- "pyyaml",
306
+ "sentence-transformers>=2.2.2",
307
+
308
+ # Retrieval backends
304
309
  "chromadb>=0.4",
305
- "faiss-cpu; sys_platform != 'darwin'",
310
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
311
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
312
+ "rank-bm25>=0.2.2", # For BM25 retriever
313
+
314
+ # Optimization & evaluation
306
315
  "optuna>=3.0",
307
- "pytest",
316
+ "tqdm",
308
317
  "colorama",
318
+
319
+ # RAG evaluation and data utils
320
+ "pyyaml",
321
+ "python-dotenv",
322
+
323
+ # Explainability and LLM APIs
324
+ "openai>=1.0.0",
309
325
  "google-generativeai>=0.8.0",
326
+ "anthropic>=0.25.0",
327
+
328
+ # Integration / storage
310
329
  "supabase>=2.4.0",
311
- "python-dotenv",
312
- "sentence-transformers"
330
+
331
+ # Testing
332
+ "pytest",
333
+
334
+ # LangChain integration layer
335
+ "langchain>=0.2.5",
336
+ "langchain-community>=0.2.5",
337
+ "langchain-text-splitters>=0.2.1"
313
338
  ]
314
339
  ```
315
340
 
@@ -369,4 +394,4 @@ Licensed under the **Apache License 2.0** — free for personal, research, and c
369
394
 
370
395
  **André Oliveira**
371
396
  [andyolivers.com](https://andyolivers.com)
372
- Data Scientist | AI Engineer
397
+ Data Scientist | AI Engineer
@@ -4,33 +4,58 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ragmint"
7
- version = "0.3.0"
7
+ version = "0.3.1"
8
8
  description = "A modular framework for evaluating and optimizing RAG pipelines."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache License 2.0" }
11
11
  authors = [
12
12
  { name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
13
13
  ]
14
- keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
14
+ keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation", "chunking", "autotuning"]
15
15
  requires-python = ">=3.9"
16
16
  dependencies = [
17
+ # Core ML + Embeddings
17
18
  "numpy<2.0.0",
18
19
  "pandas>=2.0",
19
20
  "scikit-learn>=1.3",
20
- "openai>=1.0",
21
- "tqdm",
22
- "pyyaml",
23
- "chromadb>=0.4",
24
- "faiss-cpu; sys_platform != 'darwin'",
21
+ "sentence-transformers>=2.2.2",
22
+
23
+ # Retrieval backends
24
+ "chromadb>=0.3.1",
25
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
26
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
27
+ "rank-bm25>=0.2.2", # For BM25 retriever
28
+
29
+ # Optimization & evaluation
25
30
  "optuna>=3.0",
26
- "pytest",
31
+ "tqdm",
27
32
  "colorama",
33
+
34
+ # RAG evaluation and data utils
35
+ "pyyaml",
36
+ "python-dotenv",
37
+
38
+ # Explainability and LLM APIs
39
+ "openai>=1.0.0",
28
40
  "google-generativeai>=0.8.0",
41
+ "anthropic>=0.25.0",
42
+
43
+ # Integration / storage
29
44
  "supabase>=2.4.0",
30
- "python-dotenv",
31
- "sentence-transformers"
45
+
46
+ # Testing
47
+ "pytest",
48
+
49
+ # LangChain integration layer
50
+ "langchain>=0.2.5",
51
+ "langchain-community>=0.2.5",
52
+ "langchain-text-splitters>=0.2.1"
32
53
  ]
33
54
 
55
+ [project.optional-dependencies]
56
+ dev = ["black", "flake8", "isort", "pytest-cov"]
57
+ docs = ["mkdocs", "mkdocs-material"]
58
+
34
59
  [project.urls]
35
60
  Homepage = "https://github.com/andyolivers/ragmint"
36
61
  Documentation = "https://andyolivers.com"
@@ -51,5 +76,4 @@ ragmint = ["experiments/*.json"]
51
76
 
52
77
  [tool.pytest.ini_options]
53
78
  testpaths = ["tests"]
54
- addopts = "-v"
55
-
79
+ addopts = "-v --tb=short"
@@ -0,0 +1,138 @@
1
+ """
2
+ Auto-RAG Tuner
3
+ --------------
4
+ Automatically recommends and optimizes RAG configurations based on corpus statistics.
5
+ Integrates with RAGMint to perform full end-to-end tuning.
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ from statistics import mean
11
+ from typing import Dict, Any, Tuple, List
12
+
13
+ from .tuner import RAGMint
14
+ from .core.evaluation import evaluate_config
15
+
16
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
17
+
18
+
19
class AutoRAGTuner:
    """Analyze a document corpus and drive a RAGMint tuning run from its statistics.

    On construction the tuner scans ``docs_path`` once and caches the result in
    ``corpus_stats``. ``recommend`` turns those statistics into a single
    configuration via fixed heuristics, and ``auto_tune`` hands that
    configuration to ``RAGMint`` for optimization.
    """

    # File name suffixes that are counted as corpus documents.
    _DOC_SUFFIXES = (".txt", ".md", ".rst")

    def __init__(self, docs_path: str):
        """
        AutoRAGTuner automatically analyzes a corpus and runs an optimized RAG tuning pipeline.

        Args:
            docs_path (str): Path to the directory containing documents (.txt, .md, .rst)
        """
        self.docs_path = docs_path
        self.corpus_stats = self._analyze_corpus()

    # -----------------------------
    # Corpus Analysis
    # -----------------------------
    def _analyze_corpus(self) -> Dict[str, Any]:
        """Compute corpus size, average document length, and number of documents.

        Only regular files directly inside ``docs_path`` whose names end in one
        of ``_DOC_SUFFIXES`` are read (as UTF-8). Lengths are measured in
        characters.

        Returns:
            Dict with keys ``"size"`` (total characters), ``"avg_len"``
            (average characters per document, truncated to an int) and
            ``"num_docs"``. All three are 0 when ``docs_path`` is not an
            existing directory or contains no matching files.
        """
        # isdir (not exists): a path that points at a regular file must take
        # the "no corpus" branch instead of crashing in os.listdir below.
        if not os.path.isdir(self.docs_path):
            logging.warning(f"⚠️ Corpus path not found: {self.docs_path}")
            return {"size": 0, "avg_len": 0, "num_docs": 0}

        total_chars = 0
        num_docs = 0
        for entry in os.listdir(self.docs_path):
            if not entry.endswith(self._DOC_SUFFIXES):
                continue
            entry_path = os.path.join(self.docs_path, entry)
            # A sub-directory may carry a document-like suffix; skip it
            # rather than let open() raise.
            if not os.path.isfile(entry_path):
                continue
            with open(entry_path, "r", encoding="utf-8") as f:
                # Only the length is needed, so the text is not retained.
                total_chars += len(f.read())
            num_docs += 1

        # Floor division of non-negative ints matches the truncated mean.
        avg_len = total_chars // num_docs if num_docs else 0
        stats = {"size": total_chars, "avg_len": avg_len, "num_docs": num_docs}
        logging.info(f"📊 Corpus stats: {stats}")
        return stats

    # -----------------------------
    # Recommendation Logic
    # -----------------------------
    def recommend(self) -> Dict[str, Any]:
        """Recommend retriever, embedding, and chunking based on corpus stats.

        Returns:
            Dict with keys ``"retriever"``, ``"embedding_model"``,
            ``"chunk_size"``, ``"overlap"`` and ``"strategy"``.
        """
        size = self.corpus_stats.get("size", 0)
        avg_len = self.corpus_stats.get("avg_len", 0)

        # Chunking heuristics: chunk size and overlap grow with the average
        # document length.
        if avg_len < 200:
            chunk_size, overlap = 300, 50
        elif avg_len < 500:
            chunk_size, overlap = 500, 100
        else:
            chunk_size, overlap = 800, 150

        # Retriever/embedding pair is picked from the total corpus size
        # (in characters).
        if size <= 2000:
            retriever = "BM25"
            embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
        elif size <= 10000:
            retriever = "Chroma"
            embedding_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
        else:
            retriever = "FAISS"
            embedding_model = "sentence-transformers/all-mpnet-base-v2"

        strategy = "fixed" if avg_len < 400 else "sentence"

        recommendation = {
            "retriever": retriever,
            "embedding_model": embedding_model,
            "chunk_size": chunk_size,
            "overlap": overlap,
            "strategy": strategy,
        }

        logging.info(f"🔮 AutoRAG Recommendation: {recommendation}")
        return recommendation

    # -----------------------------
    # Full Auto-Tuning
    # -----------------------------
    def auto_tune(
        self,
        validation_set: str = None,
        metric: str = "faithfulness",
        trials: int = 5,
        search_type: str = "random",
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Run a full automatic optimization using RAGMint.

        Automatically:
          - Recommends initial config (retriever, embedding, chunking)
          - Launches RAGMint optimization trials
          - Returns best configuration and results

        Args:
            validation_set: Forwarded unchanged to ``RAGMint.optimize``.
            metric: Forwarded unchanged to ``RAGMint.optimize``.
            trials: Forwarded unchanged to ``RAGMint.optimize``.
            search_type: Forwarded unchanged to ``RAGMint.optimize``.

        Returns:
            The ``(best, results)`` pair returned by ``RAGMint.optimize``.
        """
        rec = self.recommend()

        logging.info("🚀 Launching full AutoRAG optimization with RAGMint")

        # Each search dimension gets exactly one candidate: the recommended
        # value. The reranker is always "mmr".
        tuner = RAGMint(
            docs_path=self.docs_path,
            retrievers=[rec["retriever"]],
            embeddings=[rec["embedding_model"]],
            rerankers=["mmr"],
            chunk_sizes=[rec["chunk_size"]],
            overlaps=[rec["overlap"]],
            strategies=[rec["strategy"]],
        )

        best, results = tuner.optimize(
            validation_set=validation_set,
            metric=metric,
            trials=trials,
            search_type=search_type,
        )

        logging.info(f"🏁 AutoRAG tuning complete. Best: {best}")
        return best, results