ragmint 0.2.3__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {ragmint-0.2.3 → ragmint-0.3.1}/PKG-INFO +161 -32
  2. ragmint-0.2.3/src/ragmint.egg-info/PKG-INFO → ragmint-0.3.1/README.md +138 -53
  3. {ragmint-0.2.3 → ragmint-0.3.1}/pyproject.toml +38 -12
  4. ragmint-0.3.1/src/ragmint/autotuner.py +138 -0
  5. ragmint-0.3.1/src/ragmint/core/chunking.py +86 -0
  6. ragmint-0.3.1/src/ragmint/core/embeddings.py +55 -0
  7. ragmint-0.3.1/src/ragmint/core/pipeline.py +62 -0
  8. ragmint-0.3.1/src/ragmint/core/retriever.py +165 -0
  9. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/explainer.py +5 -3
  10. ragmint-0.3.1/src/ragmint/integrations/config_adapter.py +96 -0
  11. ragmint-0.3.1/src/ragmint/integrations/langchain_prebuilder.py +99 -0
  12. ragmint-0.3.1/src/ragmint/tests/test_autotuner.py +51 -0
  13. ragmint-0.3.1/src/ragmint/tests/test_config_adapter.py +39 -0
  14. ragmint-0.3.1/src/ragmint/tests/test_embeddings.py +46 -0
  15. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_explainer_integration.py +1 -1
  16. ragmint-0.3.1/src/ragmint/tests/test_integration_autotuner_ragmint.py +47 -0
  17. ragmint-0.3.1/src/ragmint/tests/test_langchain_prebuilder.py +82 -0
  18. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_pipeline.py +3 -2
  19. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_retriever.py +3 -2
  20. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_tuner.py +1 -1
  21. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tuner.py +87 -21
  22. ragmint-0.3.1/src/ragmint/utils/__init__.py +0 -0
  23. ragmint-0.2.3/README.md → ragmint-0.3.1/src/ragmint.egg-info/PKG-INFO +182 -25
  24. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/SOURCES.txt +6 -0
  25. ragmint-0.3.1/src/ragmint.egg-info/requires.txt +35 -0
  26. ragmint-0.2.3/src/ragmint/autotuner.py +0 -33
  27. ragmint-0.2.3/src/ragmint/core/chunking.py +0 -22
  28. ragmint-0.2.3/src/ragmint/core/embeddings.py +0 -19
  29. ragmint-0.2.3/src/ragmint/core/pipeline.py +0 -38
  30. ragmint-0.2.3/src/ragmint/core/retriever.py +0 -33
  31. ragmint-0.2.3/src/ragmint/tests/test_autotuner.py +0 -42
  32. ragmint-0.2.3/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -60
  33. ragmint-0.2.3/src/ragmint.egg-info/requires.txt +0 -15
  34. {ragmint-0.2.3 → ragmint-0.3.1}/LICENSE +0 -0
  35. {ragmint-0.2.3 → ragmint-0.3.1}/MANIFEST.in +0 -0
  36. {ragmint-0.2.3 → ragmint-0.3.1}/setup.cfg +0 -0
  37. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/__init__.py +0 -0
  38. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/__main__.py +0 -0
  39. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/__init__.py +0 -0
  40. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/evaluation.py +0 -0
  41. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/core/reranker.py +0 -0
  42. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/experiments/__init__.py +0 -0
  43. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/experiments/validation_qa.json +0 -0
  44. {ragmint-0.2.3/src/ragmint/optimization → ragmint-0.3.1/src/ragmint/integrations}/__init__.py +0 -0
  45. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/leaderboard.py +0 -0
  46. {ragmint-0.2.3/src/ragmint/tests → ragmint-0.3.1/src/ragmint/optimization}/__init__.py +0 -0
  47. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/optimization/search.py +0 -0
  48. {ragmint-0.2.3/src/ragmint/utils → ragmint-0.3.1/src/ragmint/tests}/__init__.py +0 -0
  49. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/conftest.py +0 -0
  50. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_explainer.py +0 -0
  51. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_leaderboard.py +0 -0
  52. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/tests/test_search.py +0 -0
  53. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/caching.py +0 -0
  54. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/data_loader.py +0 -0
  55. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/logger.py +0 -0
  56. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint/utils/metrics.py +0 -0
  57. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/dependency_links.txt +0 -0
  58. {ragmint-0.2.3 → ragmint-0.3.1}/src/ragmint.egg-info/top_level.txt +0 -0
@@ -1,29 +1,45 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragmint
3
- Version: 0.2.3
3
+ Version: 0.3.1
4
4
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
5
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
6
  License: Apache License 2.0
7
7
  Project-URL: Homepage, https://github.com/andyolivers/ragmint
8
8
  Project-URL: Documentation, https://andyolivers.com
9
9
  Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
10
- Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
10
+ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation,chunking,autotuning
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: numpy>=1.23
14
+ Requires-Dist: numpy<2.0.0
15
15
  Requires-Dist: pandas>=2.0
16
16
  Requires-Dist: scikit-learn>=1.3
17
- Requires-Dist: openai>=1.0
18
- Requires-Dist: tqdm
19
- Requires-Dist: pyyaml
20
- Requires-Dist: chromadb>=0.4
17
+ Requires-Dist: sentence-transformers>=2.2.2
18
+ Requires-Dist: chromadb>=0.3.1
21
19
  Requires-Dist: faiss-cpu; sys_platform != "darwin"
20
+ Requires-Dist: faiss-cpu==1.7.4; sys_platform == "darwin"
21
+ Requires-Dist: rank-bm25>=0.2.2
22
22
  Requires-Dist: optuna>=3.0
23
- Requires-Dist: pytest
23
+ Requires-Dist: tqdm
24
24
  Requires-Dist: colorama
25
+ Requires-Dist: pyyaml
26
+ Requires-Dist: python-dotenv
27
+ Requires-Dist: openai>=1.0.0
25
28
  Requires-Dist: google-generativeai>=0.8.0
29
+ Requires-Dist: anthropic>=0.25.0
26
30
  Requires-Dist: supabase>=2.4.0
31
+ Requires-Dist: pytest
32
+ Requires-Dist: langchain>=0.2.5
33
+ Requires-Dist: langchain-community>=0.2.5
34
+ Requires-Dist: langchain-text-splitters>=0.2.1
35
+ Provides-Extra: dev
36
+ Requires-Dist: black; extra == "dev"
37
+ Requires-Dist: flake8; extra == "dev"
38
+ Requires-Dist: isort; extra == "dev"
39
+ Requires-Dist: pytest-cov; extra == "dev"
40
+ Provides-Extra: docs
41
+ Requires-Dist: mkdocs; extra == "docs"
42
+ Requires-Dist: mkdocs-material; extra == "docs"
27
43
  Dynamic: license-file
28
44
 
29
45
  # Ragmint
@@ -38,7 +54,7 @@ Dynamic: license-file
38
54
 
39
55
  **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
40
56
 
41
- It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
57
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
42
58
 
43
59
  ---
44
60
 
@@ -49,10 +65,13 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
49
65
  - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
50
66
  - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
51
67
  - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
52
- - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
53
- - 🧩 **Embeddings** — OpenAI, HuggingFace
68
+ - ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
69
+ - 🧩 **Embeddings** — Hugging Face
54
70
  - 💾 **Caching, experiment tracking, and reproducibility** out of the box
55
71
  - 🧰 **Clean modular structure** for easy integration in research and production setups
72
+ - 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
73
+ - 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
74
+ - ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
56
75
 
57
76
  ---
58
77
 
@@ -81,6 +100,8 @@ Example `configs/default.yaml`:
81
100
  ```yaml
82
101
  retriever: faiss
83
102
  embedding_model: text-embedding-3-small
103
+ chunk_size: 500
104
+ overlap: 100
84
105
  reranker:
85
106
  mode: mmr
86
107
  lambda_param: 0.5
@@ -94,15 +115,94 @@ optimization:
94
115
  ### 3️⃣ Manual Pipeline Usage
95
116
 
96
117
  ```python
97
- from ragmint.core.pipeline import RAGPipeline
118
+ from ragmint.prebuilder import PreBuilder
119
+ from ragmint.tuner import RAGMint
120
+
121
+ # Prebuild pipeline (chunking, embeddings, vector store)
122
+ prebuilder = PreBuilder(
123
+ docs_path="data/docs/",
124
+ config_path="configs/default.yaml"
125
+ )
126
+ pipeline = prebuilder.build_pipeline()
127
+
128
+ # Initialize RAGMint with prebuilt components
129
+ rag = RAGMint(pipeline=pipeline)
130
+
131
+ # Run optimization
132
+ best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
133
+ print("Best configuration:", best)
98
134
 
99
- pipeline = RAGPipeline({
100
- "embedding_model": "text-embedding-3-small",
101
- "retriever": "faiss",
102
- })
135
+ ```
136
+ ---
137
+ # 🧩 Embeddings and Retrievers
138
+
139
+ **Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
140
+
141
+ ---
142
+ ## 🧩 Chunking System
143
+
144
+ * **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
145
+ * **Supports default values** if not provided in configuration.
146
+ * **Optimized** for downstream **retrieval and embeddings**.
147
+ * **Enables adaptive chunking strategies** in future releases.
148
+
149
+ ---
150
+ ## 🧩 Langchain Config Adapter
151
+
152
+ * **Ensures consistent configuration** across pipeline components.
153
+ * **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
154
+ * **Adds default chunk parameters** when missing.
155
+ * **Validates retriever backends** and **raises clear errors** for unsupported options.
156
+
157
+ ---
158
+ ## 🧩 Langchain Prebuilder
103
159
 
104
- result = pipeline.run("What is retrieval-augmented generation?")
105
- print(result)
160
+ **Automates pipeline preparation:**
161
+ 1. Reads documents
162
+ 2. Applies chunking
163
+ 3. Creates embeddings
164
+ 4. Initializes retriever / vector store
165
+ 5. Returns **ready-to-use pipeline** for RAGMint or custom usage.
166
+
167
+ ---
168
+
169
+ ## 🔤 Available Embeddings (Hugging Face)
170
+
171
+ You can select from the following models:
172
+
173
+ * `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
174
+ * `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
175
+ * `BAAI/bge-base-en-v1.5` — **English**, dense embeddings
176
+ * `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
177
+
178
+
179
+
180
+ ### Configuration Example
181
+
182
+ Use the following format in your config file to specify the embedding model:
183
+
184
+ ```yaml
185
+ embedding_model: sentence-transformers/all-MiniLM-L6-v2
186
+ ```
187
+ ---
188
+
189
+ ## 🔍 Available Retrievers
190
+
191
+ **Ragmint** integrates multiple **retrieval backends** to suit different needs:
192
+
193
+ | Retriever | Description |
194
+ | :--- | :--- |
195
+ | **FAISS** | Fast vector similarity search; efficient for dense embeddings |
196
+ | **Chroma** | Persistent vector DB; works well for incremental updates |
197
+ | **scikit-learn (NearestNeighbors)** | Lightweight, zero-dependency local retriever |
198
+
199
+
200
+ ### Configuration Example
201
+
202
+ To specify the retriever in your configuration file, use the following format:
203
+
204
+ ```yaml
205
+ retriever: faiss
106
206
  ```
107
207
 
108
208
  ---
@@ -174,8 +274,7 @@ lb.show_top(3)
174
274
 
175
275
  ## 🧠 Explainability with Gemini / Claude
176
276
 
177
- Compare two RAG configurations and receive natural language insights
178
- on **why** one performs better.
277
+ Compare two RAG configurations and receive **natural language insights** on why one performs better.
179
278
 
180
279
  ```python
181
280
  from ragmint.explainer import explain_results
@@ -189,7 +288,7 @@ print(explanation)
189
288
 
190
289
  > Set your API keys in a `.env` file or via environment variables:
191
290
  > ```
192
- > export GOOGLE_API_KEY="your_gemini_key"
291
+ > export GEMINI_API_KEY="your_gemini_key"
193
292
  > export ANTHROPIC_API_KEY="your_claude_key"
194
293
  > ```
195
294
 
@@ -203,8 +302,12 @@ ragmint/
203
302
  │ ├── pipeline.py
204
303
  │ ├── retriever.py
205
304
  │ ├── reranker.py
206
- │ ├── embedding.py
305
+ │ ├── embeddings.py
306
+ │ ├── chunking.py
207
307
  │ └── evaluation.py
308
+ ├── integrations/
309
+ │ ├── config_adapter.py
310
+ │ └── langchain_prebuilder.py
208
311
  ├── autotuner.py
209
312
  ├── explainer.py
210
313
  ├── leaderboard.py
@@ -240,16 +343,42 @@ Your `pyproject.toml` includes all required dependencies:
240
343
  name = "ragmint"
241
344
  version = "0.1.0"
242
345
  dependencies = [
243
- "numpy",
244
- "optuna",
245
- "scikit-learn",
246
- "faiss-cpu",
247
- "chromadb",
248
- "pytest",
249
- "openai",
250
- "tqdm",
251
- "google-generativeai",
252
- "google-genai",
346
+ # Core ML + Embeddings
347
+ "numpy<2.0.0",
348
+ "pandas>=2.0",
349
+ "scikit-learn>=1.3",
350
+ "sentence-transformers>=2.2.2",
351
+
352
+ # Retrieval backends
353
+ "chromadb>=0.3.1",
354
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
355
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
356
+ "rank-bm25>=0.2.2", # For BM25 retriever
357
+
358
+ # Optimization & evaluation
359
+ "optuna>=3.0",
360
+ "tqdm",
361
+ "colorama",
362
+
363
+ # RAG evaluation and data utils
364
+ "pyyaml",
365
+ "python-dotenv",
366
+
367
+ # Explainability and LLM APIs
368
+ "openai>=1.0.0",
369
+ "google-generativeai>=0.8.0",
370
+ "anthropic>=0.25.0",
371
+
372
+ # Integration / storage
373
+ "supabase>=2.4.0",
374
+
375
+ # Testing
376
+ "pytest",
377
+
378
+ # LangChain integration layer
379
+ "langchain>=0.2.5",
380
+ "langchain-community>=0.2.5",
381
+ "langchain-text-splitters>=0.2.1"
253
382
  ]
254
383
  ```
255
384
 
@@ -1,31 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: ragmint
3
- Version: 0.2.3
4
- Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
- Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
- License: Apache License 2.0
7
- Project-URL: Homepage, https://github.com/andyolivers/ragmint
8
- Project-URL: Documentation, https://andyolivers.com
9
- Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
10
- Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
11
- Requires-Python: >=3.9
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: numpy>=1.23
15
- Requires-Dist: pandas>=2.0
16
- Requires-Dist: scikit-learn>=1.3
17
- Requires-Dist: openai>=1.0
18
- Requires-Dist: tqdm
19
- Requires-Dist: pyyaml
20
- Requires-Dist: chromadb>=0.4
21
- Requires-Dist: faiss-cpu; sys_platform != "darwin"
22
- Requires-Dist: optuna>=3.0
23
- Requires-Dist: pytest
24
- Requires-Dist: colorama
25
- Requires-Dist: google-generativeai>=0.8.0
26
- Requires-Dist: supabase>=2.4.0
27
- Dynamic: license-file
28
-
29
1
  # Ragmint
30
2
 
31
3
  ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
@@ -38,7 +10,7 @@ Dynamic: license-file
38
10
 
39
11
  **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
40
12
 
41
- It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
13
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, **automated RAG evaluation**, and **config-driven prebuilding** of pipelines with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, **chunking**, and **explainability** through Gemini or Claude.
42
14
 
43
15
  ---
44
16
 
@@ -49,10 +21,13 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
49
21
  - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
50
22
  - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
51
23
  - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
52
- - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
53
- - 🧩 **Embeddings** — OpenAI, HuggingFace
24
+ - ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
25
+ - 🧩 **Embeddings** — Hugging Face
54
26
  - 💾 **Caching, experiment tracking, and reproducibility** out of the box
55
27
  - 🧰 **Clean modular structure** for easy integration in research and production setups
28
+ - 📦 **Chunking system** — automatic or configurable chunk_size and overlap for documents
29
+ - 🏗️ **Langchain Prebuilder** — prepares pipelines, applies chunking, embeddings, and vector store creation automatically
30
+ - ⚙️ **Config Adapter (LangchainConfigAdapter)** — normalizes configuration, fills defaults, validates retrievers
56
31
 
57
32
  ---
58
33
 
@@ -81,6 +56,8 @@ Example `configs/default.yaml`:
81
56
  ```yaml
82
57
  retriever: faiss
83
58
  embedding_model: text-embedding-3-small
59
+ chunk_size: 500
60
+ overlap: 100
84
61
  reranker:
85
62
  mode: mmr
86
63
  lambda_param: 0.5
@@ -94,15 +71,94 @@ optimization:
94
71
  ### 3️⃣ Manual Pipeline Usage
95
72
 
96
73
  ```python
97
- from ragmint.core.pipeline import RAGPipeline
74
+ from ragmint.prebuilder import PreBuilder
75
+ from ragmint.tuner import RAGMint
76
+
77
+ # Prebuild pipeline (chunking, embeddings, vector store)
78
+ prebuilder = PreBuilder(
79
+ docs_path="data/docs/",
80
+ config_path="configs/default.yaml"
81
+ )
82
+ pipeline = prebuilder.build_pipeline()
83
+
84
+ # Initialize RAGMint with prebuilt components
85
+ rag = RAGMint(pipeline=pipeline)
86
+
87
+ # Run optimization
88
+ best, results = rag.optimize(validation_set=None, metric="faithfulness", trials=3)
89
+ print("Best configuration:", best)
90
+
91
+ ```
92
+ ---
93
+ # 🧩 Embeddings and Retrievers
94
+
95
+ **Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
96
+
97
+ ---
98
+ ## 🧩 Chunking System
99
+
100
+ * **Automatically splits documents** into chunks with `chunk_size` and `overlap` parameters.
101
+ * **Supports default values** if not provided in configuration.
102
+ * **Optimized** for downstream **retrieval and embeddings**.
103
+ * **Enables adaptive chunking strategies** in future releases.
104
+
105
+ ---
106
+ ## 🧩 Langchain Config Adapter
107
+
108
+ * **Ensures consistent configuration** across pipeline components.
109
+ * **Normalizes retriever and embedding names** (e.g., `faiss`, `sentence-transformers/...`).
110
+ * **Adds default chunk parameters** when missing.
111
+ * **Validates retriever backends** and **raises clear errors** for unsupported options.
112
+
113
+ ---
114
+ ## 🧩 Langchain Prebuilder
115
+
116
+ **Automates pipeline preparation:**
117
+ 1. Reads documents
118
+ 2. Applies chunking
119
+ 3. Creates embeddings
120
+ 4. Initializes retriever / vector store
121
+ 5. Returns **ready-to-use pipeline** for RAGMint or custom usage.
98
122
 
99
- pipeline = RAGPipeline({
100
- "embedding_model": "text-embedding-3-small",
101
- "retriever": "faiss",
102
- })
123
+ ---
124
+
125
+ ## 🔤 Available Embeddings (Hugging Face)
126
+
127
+ You can select from the following models:
128
+
129
+ * `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
130
+ * `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
131
+ * `BAAI/bge-base-en-v1.5` — **English**, dense embeddings
132
+ * `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
133
+
134
+
135
+
136
+ ### Configuration Example
137
+
138
+ Use the following format in your config file to specify the embedding model:
139
+
140
+ ```yaml
141
+ embedding_model: sentence-transformers/all-MiniLM-L6-v2
142
+ ```
143
+ ---
144
+
145
+ ## 🔍 Available Retrievers
146
+
147
+ **Ragmint** integrates multiple **retrieval backends** to suit different needs:
148
+
149
+ | Retriever | Description |
150
+ | :--- | :--- |
151
+ | **FAISS** | Fast vector similarity search; efficient for dense embeddings |
152
+ | **Chroma** | Persistent vector DB; works well for incremental updates |
153
+ | **scikit-learn (NearestNeighbors)** | Lightweight, zero-dependency local retriever |
103
154
 
104
- result = pipeline.run("What is retrieval-augmented generation?")
105
- print(result)
155
+
156
+ ### Configuration Example
157
+
158
+ To specify the retriever in your configuration file, use the following format:
159
+
160
+ ```yaml
161
+ retriever: faiss
106
162
  ```
107
163
 
108
164
  ---
@@ -174,8 +230,7 @@ lb.show_top(3)
174
230
 
175
231
  ## 🧠 Explainability with Gemini / Claude
176
232
 
177
- Compare two RAG configurations and receive natural language insights
178
- on **why** one performs better.
233
+ Compare two RAG configurations and receive **natural language insights** on why one performs better.
179
234
 
180
235
  ```python
181
236
  from ragmint.explainer import explain_results
@@ -189,7 +244,7 @@ print(explanation)
189
244
 
190
245
  > Set your API keys in a `.env` file or via environment variables:
191
246
  > ```
192
- > export GOOGLE_API_KEY="your_gemini_key"
247
+ > export GEMINI_API_KEY="your_gemini_key"
193
248
  > export ANTHROPIC_API_KEY="your_claude_key"
194
249
  > ```
195
250
 
@@ -203,8 +258,12 @@ ragmint/
203
258
  │ ├── pipeline.py
204
259
  │ ├── retriever.py
205
260
  │ ├── reranker.py
206
- │ ├── embedding.py
261
+ │ ├── embeddings.py
262
+ │ ├── chunking.py
207
263
  │ └── evaluation.py
264
+ ├── integrations/
265
+ │ ├── config_adapter.py
266
+ │ └── langchain_prebuilder.py
208
267
  ├── autotuner.py
209
268
  ├── explainer.py
210
269
  ├── leaderboard.py
@@ -240,16 +299,42 @@ Your `pyproject.toml` includes all required dependencies:
240
299
  name = "ragmint"
241
300
  version = "0.1.0"
242
301
  dependencies = [
243
- "numpy",
244
- "optuna",
245
- "scikit-learn",
246
- "faiss-cpu",
247
- "chromadb",
248
- "pytest",
249
- "openai",
250
- "tqdm",
251
- "google-generativeai",
252
- "google-genai",
302
+ # Core ML + Embeddings
303
+ "numpy<2.0.0",
304
+ "pandas>=2.0",
305
+ "scikit-learn>=1.3",
306
+ "sentence-transformers>=2.2.2",
307
+
308
+ # Retrieval backends
309
+ "chromadb>=0.3.1",
310
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
311
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
312
+ "rank-bm25>=0.2.2", # For BM25 retriever
313
+
314
+ # Optimization & evaluation
315
+ "optuna>=3.0",
316
+ "tqdm",
317
+ "colorama",
318
+
319
+ # RAG evaluation and data utils
320
+ "pyyaml",
321
+ "python-dotenv",
322
+
323
+ # Explainability and LLM APIs
324
+ "openai>=1.0.0",
325
+ "google-generativeai>=0.8.0",
326
+ "anthropic>=0.25.0",
327
+
328
+ # Integration / storage
329
+ "supabase>=2.4.0",
330
+
331
+ # Testing
332
+ "pytest",
333
+
334
+ # LangChain integration layer
335
+ "langchain>=0.2.5",
336
+ "langchain-community>=0.2.5",
337
+ "langchain-text-splitters>=0.2.1"
253
338
  ]
254
339
  ```
255
340
 
@@ -309,4 +394,4 @@ Licensed under the **Apache License 2.0** — free for personal, research, and c
309
394
 
310
395
  **André Oliveira**
311
396
  [andyolivers.com](https://andyolivers.com)
312
- Data Scientist | AI Engineer
397
+ Data Scientist | AI Engineer
@@ -4,31 +4,58 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ragmint"
7
- version = "0.2.3"
7
+ version = "0.3.1"
8
8
  description = "A modular framework for evaluating and optimizing RAG pipelines."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache License 2.0" }
11
11
  authors = [
12
12
  { name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
13
13
  ]
14
- keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
14
+ keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation", "chunking", "autotuning"]
15
15
  requires-python = ">=3.9"
16
16
  dependencies = [
17
- "numpy>=1.23",
17
+ # Core ML + Embeddings
18
+ "numpy<2.0.0",
18
19
  "pandas>=2.0",
19
20
  "scikit-learn>=1.3",
20
- "openai>=1.0",
21
- "tqdm",
22
- "pyyaml",
23
- "chromadb>=0.4",
24
- "faiss-cpu; sys_platform != 'darwin'",
21
+ "sentence-transformers>=2.2.2",
22
+
23
+ # Retrieval backends
24
+ "chromadb>=0.3.1",
25
+ "faiss-cpu; sys_platform != 'darwin'", # For Linux/Windows
26
+ "faiss-cpu==1.7.4; sys_platform == 'darwin'", # Optional fix for macOS MPS
27
+ "rank-bm25>=0.2.2", # For BM25 retriever
28
+
29
+ # Optimization & evaluation
25
30
  "optuna>=3.0",
26
- "pytest",
31
+ "tqdm",
27
32
  "colorama",
33
+
34
+ # RAG evaluation and data utils
35
+ "pyyaml",
36
+ "python-dotenv",
37
+
38
+ # Explainability and LLM APIs
39
+ "openai>=1.0.0",
28
40
  "google-generativeai>=0.8.0",
29
- "supabase>=2.4.0"
41
+ "anthropic>=0.25.0",
42
+
43
+ # Integration / storage
44
+ "supabase>=2.4.0",
45
+
46
+ # Testing
47
+ "pytest",
48
+
49
+ # LangChain integration layer
50
+ "langchain>=0.2.5",
51
+ "langchain-community>=0.2.5",
52
+ "langchain-text-splitters>=0.2.1"
30
53
  ]
31
54
 
55
+ [project.optional-dependencies]
56
+ dev = ["black", "flake8", "isort", "pytest-cov"]
57
+ docs = ["mkdocs", "mkdocs-material"]
58
+
32
59
  [project.urls]
33
60
  Homepage = "https://github.com/andyolivers/ragmint"
34
61
  Documentation = "https://andyolivers.com"
@@ -49,5 +76,4 @@ ragmint = ["experiments/*.json"]
49
76
 
50
77
  [tool.pytest.ini_options]
51
78
  testpaths = ["tests"]
52
- addopts = "-v"
53
-
79
+ addopts = "-v --tb=short"