ragmint 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragmint might be problematic. Click here for more details.

Files changed (33) hide show
  1. ragmint-0.1.0/LICENSE +19 -0
  2. ragmint-0.1.0/PKG-INFO +218 -0
  3. ragmint-0.1.0/README.md +192 -0
  4. ragmint-0.1.0/pyproject.toml +43 -0
  5. ragmint-0.1.0/setup.cfg +4 -0
  6. ragmint-0.1.0/src/ragmint/__init__.py +0 -0
  7. ragmint-0.1.0/src/ragmint/__main__.py +28 -0
  8. ragmint-0.1.0/src/ragmint/core/__init__.py +0 -0
  9. ragmint-0.1.0/src/ragmint/core/chunking.py +22 -0
  10. ragmint-0.1.0/src/ragmint/core/embeddings.py +19 -0
  11. ragmint-0.1.0/src/ragmint/core/evaluation.py +27 -0
  12. ragmint-0.1.0/src/ragmint/core/pipeline.py +38 -0
  13. ragmint-0.1.0/src/ragmint/core/reranker.py +62 -0
  14. ragmint-0.1.0/src/ragmint/core/retriever.py +33 -0
  15. ragmint-0.1.0/src/ragmint/experiments/__init__.py +0 -0
  16. ragmint-0.1.0/src/ragmint/optimization/__init__.py +0 -0
  17. ragmint-0.1.0/src/ragmint/optimization/search.py +48 -0
  18. ragmint-0.1.0/src/ragmint/tests/__init__.py +0 -0
  19. ragmint-0.1.0/src/ragmint/tests/test_pipeline.py +19 -0
  20. ragmint-0.1.0/src/ragmint/tests/test_retriever.py +14 -0
  21. ragmint-0.1.0/src/ragmint/tests/test_search.py +17 -0
  22. ragmint-0.1.0/src/ragmint/tests/test_tuner.py +38 -0
  23. ragmint-0.1.0/src/ragmint/tuner.py +123 -0
  24. ragmint-0.1.0/src/ragmint/utils/__init__.py +0 -0
  25. ragmint-0.1.0/src/ragmint/utils/caching.py +37 -0
  26. ragmint-0.1.0/src/ragmint/utils/data_loader.py +35 -0
  27. ragmint-0.1.0/src/ragmint/utils/logger.py +36 -0
  28. ragmint-0.1.0/src/ragmint/utils/metrics.py +27 -0
  29. ragmint-0.1.0/src/ragmint.egg-info/PKG-INFO +218 -0
  30. ragmint-0.1.0/src/ragmint.egg-info/SOURCES.txt +31 -0
  31. ragmint-0.1.0/src/ragmint.egg-info/dependency_links.txt +1 -0
  32. ragmint-0.1.0/src/ragmint.egg-info/requires.txt +13 -0
  33. ragmint-0.1.0/src/ragmint.egg-info/top_level.txt +1 -0
ragmint-0.1.0/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ Copyright 2025 André Oliveira
8
+
9
+ Licensed under the Apache License, Version 2.0 (the "License");
10
+ you may not use this file except in compliance with the License.
11
+ You may obtain a copy of the License at
12
+
13
+ http://www.apache.org/licenses/LICENSE-2.0
14
+
15
+ Unless required by applicable law or agreed to in writing, software
16
+ distributed under the License is distributed on an "AS IS" BASIS,
17
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ See the License for the specific language governing permissions and
19
+ limitations under the License.
ragmint-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,218 @@
1
+ Metadata-Version: 2.4
2
+ Name: ragmint
3
+ Version: 0.1.0
4
+ Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
+ Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
+ License: Apache License 2.0
7
+ Project-URL: Homepage, https://github.com/andyolivers/ragmint
8
+ Project-URL: Documentation, https://andyolivers.com
9
+ Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
10
+ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: numpy>=1.23
15
+ Requires-Dist: pandas>=2.0
16
+ Requires-Dist: scikit-learn>=1.3
17
+ Requires-Dist: openai>=1.0
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: pyyaml
20
+ Requires-Dist: chromadb>=0.4
21
+ Requires-Dist: faiss-cpu; sys_platform != "darwin"
22
+ Requires-Dist: optuna>=3.0
23
+ Requires-Dist: pytest
24
+ Requires-Dist: colorama
25
+ Dynamic: license-file
26
+
27
+ # Ragmint
28
+
29
+ ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
30
+ ![License](https://img.shields.io/badge/license-Apache%202.0-green)
31
+ ![Tests](https://github.com/andyolivers/ragmint/actions/workflows/tests.yml/badge.svg)
32
+ ![Optuna](https://img.shields.io/badge/Optuna-Integrated-orange)
33
+ ![Status](https://img.shields.io/badge/Status-Active-success)
34
+
35
+ ![](/assets/images/ragmint-banner.png)
36
+
37
+ **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
38
+
39
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**.
40
+
41
+ ---
42
+
43
+ ## ✨ Features
44
+
45
+ - ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
46
+ - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
47
+ - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
48
+ - 🧩 **Embeddings** — OpenAI, HuggingFace
49
+ - 🧠 **Rerankers** — MMR, CrossEncoder (extensible via plugin interface)
50
+ - 💾 **Caching, experiment tracking, and reproducibility** out of the box
51
+ - 🧰 **Clean modular structure** for easy integration in research and production setups
52
+
53
+ ---
54
+
55
+ ## 🚀 Quick Start
56
+
57
+ ### 1️⃣ Installation
58
+
59
+ ```bash
60
+ git clone https://github.com/andyolivers/ragmint.git
61
+ cd ragmint
62
+ pip install -e .
63
+ ```
64
+
65
+ > The `-e` flag installs Ragmint in editable (development) mode.
66
+ > Requires **Python ≥ 3.9**.
67
+
68
+ ---
69
+
70
+ ### 2️⃣ Run a RAG Optimization Experiment
71
+
72
+ ```bash
73
+ python ragmint/main.py --config configs/default.yaml --search bayesian
74
+ ```
75
+
76
+ Example `configs/default.yaml`:
77
+ ```yaml
78
+ retriever: faiss
79
+ embedding_model: text-embedding-3-small
80
+ reranker:
81
+ mode: mmr
82
+ lambda_param: 0.5
83
+ optimization:
84
+ search_method: bayesian
85
+ n_trials: 20
86
+ ```
87
+
88
+ ---
89
+
90
+ ### 3️⃣ Manual Pipeline Usage
91
+
92
+ ```python
93
+ from ragmint.core.pipeline import RAGPipeline
94
+
95
+ pipeline = RAGPipeline({
96
+ "embedding_model": "text-embedding-3-small",
97
+ "retriever": "faiss",
98
+ })
99
+
100
+ result = pipeline.run("What is retrieval-augmented generation?")
101
+ print(result)
102
+ ```
103
+
104
+ ---
105
+
106
+ ## 🧩 Folder Structure
107
+
108
+ ```
109
+ ragmint/
110
+ ├── core/
111
+ │   ├── pipeline.py # RAGPipeline implementation
112
+ │   ├── retriever.py # Retriever logic (FAISS, Chroma)
113
+ │   ├── reranker.py # MMR + CrossEncoder rerankers
114
+ │   └── embedding.py # Embedding backends
115
+ ├── tuner.py # Grid, Random, Bayesian optimization (Optuna)
116
+ ├── utils/ # Metrics, logging, caching helpers
117
+ ├── configs/ # Default experiment configs
118
+ ├── experiments/ # Saved experiment results
119
+ ├── tests/ # Unit tests for all components
120
+ ├── main.py # CLI entrypoint for tuning
121
+ └── pyproject.toml # Project dependencies & build metadata
122
+ ```
123
+
124
+ ---
125
+
126
+ ## 🧪 Running Tests
127
+
128
+ To verify your setup:
129
+
130
+ ```bash
131
+ pytest -v
132
+ ```
133
+
134
+ Or to test a specific component (e.g., reranker):
135
+
136
+ ```bash
137
+ pytest tests/test_reranker.py -v
138
+ ```
139
+
140
+ All tests are designed for **Pytest** and run with lightweight mock data.
141
+
142
+ ---
143
+
144
+ ## ⚙️ Configuration via `pyproject.toml`
145
+
146
+ Your `pyproject.toml` automatically includes:
147
+
148
+ ```toml
149
+ [project]
150
+ name = "ragmint"
151
+ version = "0.1.0"
152
+ dependencies = [
153
+ "numpy",
154
+ "optuna",
155
+ "scikit-learn",
156
+ "faiss-cpu",
157
+ "chromadb",
158
+ "pytest",
159
+ "openai",
160
+ "tqdm",
161
+ ]
162
+ ```
163
+
164
+ ---
165
+
166
+ ## 📊 Example Experiment Workflow
167
+
168
+ 1. Define your retriever and reranker configuration in YAML
169
+ 2. Launch an optimization search (Grid, Random, or Bayesian)
170
+ 3. Ragmint evaluates combinations automatically and reports top results
171
+ 4. Export best parameters for production pipelines
172
+
173
+ ---
174
+
175
+ ## 🧬 Architecture Overview
176
+
177
+ ```mermaid
178
+ flowchart TD
179
+ A[Query] --> B[Embedder]
180
+ B --> C[Retriever]
181
+ C --> D[Reranker]
182
+ D --> E[Generator]
183
+ E --> F[Evaluation]
184
+ F --> G[Optuna Tuner]
185
+ G -->|Best Params| B
186
+ ```
187
+
188
+ ---
189
+
190
+ ## 📘 Example Output
191
+
192
+ ```
193
+ [INFO] Starting Bayesian optimization with Optuna
194
+ [INFO] Trial 7 finished: recall=0.83, latency=0.42s
195
+ [INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
196
+ ```
197
+
198
+ ---
199
+
200
+ ## 🧠 Why Ragmint?
201
+
202
+ - Built for **RAG researchers**, **AI engineers**, and **LLM ops**
203
+ - Works with **LangChain**, **LlamaIndex**, or standalone RAG setups
204
+ - Designed for **extensibility** — plug in your own models, retrievers, or metrics
205
+
206
+ ---
207
+
208
+ ## ⚖️ License
209
+
210
+ Licensed under the **Apache License 2.0** — free for personal, research, and commercial use.
211
+
212
+ ---
213
+
214
+ ## 👤 Author
215
+
216
+ **André Oliveira**
217
+ [andyolivers.com](https://andyolivers.com)
218
+ Data Scientist | AI Engineer
@@ -0,0 +1,192 @@
1
+ # Ragmint
2
+
3
+ ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
4
+ ![License](https://img.shields.io/badge/license-Apache%202.0-green)
5
+ ![Tests](https://github.com/andyolivers/ragmint/actions/workflows/tests.yml/badge.svg)
6
+ ![Optuna](https://img.shields.io/badge/Optuna-Integrated-orange)
7
+ ![Status](https://img.shields.io/badge/Status-Active-success)
8
+
9
+ ![](/assets/images/ragmint-banner.png)
10
+
11
+ **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
12
+
13
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**.
14
+
15
+ ---
16
+
17
+ ## ✨ Features
18
+
19
+ - ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
20
+ - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
21
+ - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
22
+ - 🧩 **Embeddings** — OpenAI, HuggingFace
23
+ - 🧠 **Rerankers** — MMR, CrossEncoder (extensible via plugin interface)
24
+ - 💾 **Caching, experiment tracking, and reproducibility** out of the box
25
+ - 🧰 **Clean modular structure** for easy integration in research and production setups
26
+
27
+ ---
28
+
29
+ ## 🚀 Quick Start
30
+
31
+ ### 1️⃣ Installation
32
+
33
+ ```bash
34
+ git clone https://github.com/andyolivers/ragmint.git
35
+ cd ragmint
36
+ pip install -e .
37
+ ```
38
+
39
+ > The `-e` flag installs Ragmint in editable (development) mode.
40
+ > Requires **Python ≥ 3.9**.
41
+
42
+ ---
43
+
44
+ ### 2️⃣ Run a RAG Optimization Experiment
45
+
46
+ ```bash
47
+ python ragmint/main.py --config configs/default.yaml --search bayesian
48
+ ```
49
+
50
+ Example `configs/default.yaml`:
51
+ ```yaml
52
+ retriever: faiss
53
+ embedding_model: text-embedding-3-small
54
+ reranker:
55
+ mode: mmr
56
+ lambda_param: 0.5
57
+ optimization:
58
+ search_method: bayesian
59
+ n_trials: 20
60
+ ```
61
+
62
+ ---
63
+
64
+ ### 3️⃣ Manual Pipeline Usage
65
+
66
+ ```python
67
+ from ragmint.core.pipeline import RAGPipeline
68
+
69
+ pipeline = RAGPipeline({
70
+ "embedding_model": "text-embedding-3-small",
71
+ "retriever": "faiss",
72
+ })
73
+
74
+ result = pipeline.run("What is retrieval-augmented generation?")
75
+ print(result)
76
+ ```
77
+
78
+ ---
79
+
80
+ ## 🧩 Folder Structure
81
+
82
+ ```
83
+ ragmint/
84
+ ├── core/
84
+ │   ├── pipeline.py # RAGPipeline implementation
85
+ │   ├── retriever.py # Retriever logic (FAISS, Chroma)
86
+ │   ├── reranker.py # MMR + CrossEncoder rerankers
87
+ │   └── embedding.py # Embedding backends
88
+ ├── tuner.py # Grid, Random, Bayesian optimization (Optuna)
89
+ ├── utils/ # Metrics, logging, caching helpers
90
+ ├── configs/ # Default experiment configs
91
+ ├── experiments/ # Saved experiment results
92
+ ├── tests/ # Unit tests for all components
93
+ ├── main.py # CLI entrypoint for tuning
95
+ └── pyproject.toml # Project dependencies & build metadata
96
+ ```
97
+
98
+ ---
99
+
100
+ ## 🧪 Running Tests
101
+
102
+ To verify your setup:
103
+
104
+ ```bash
105
+ pytest -v
106
+ ```
107
+
108
+ Or to test a specific component (e.g., reranker):
109
+
110
+ ```bash
111
+ pytest tests/test_reranker.py -v
112
+ ```
113
+
114
+ All tests are designed for **Pytest** and run with lightweight mock data.
115
+
116
+ ---
117
+
118
+ ## ⚙️ Configuration via `pyproject.toml`
119
+
120
+ Your `pyproject.toml` automatically includes:
121
+
122
+ ```toml
123
+ [project]
124
+ name = "ragmint"
125
+ version = "0.1.0"
126
+ dependencies = [
127
+ "numpy",
128
+ "optuna",
129
+ "scikit-learn",
130
+ "faiss-cpu",
131
+ "chromadb",
132
+ "pytest",
133
+ "openai",
134
+ "tqdm",
135
+ ]
136
+ ```
137
+
138
+ ---
139
+
140
+ ## 📊 Example Experiment Workflow
141
+
142
+ 1. Define your retriever and reranker configuration in YAML
143
+ 2. Launch an optimization search (Grid, Random, or Bayesian)
144
+ 3. Ragmint evaluates combinations automatically and reports top results
145
+ 4. Export best parameters for production pipelines
146
+
147
+ ---
148
+
149
+ ## 🧬 Architecture Overview
150
+
151
+ ```mermaid
152
+ flowchart TD
153
+ A[Query] --> B[Embedder]
154
+ B --> C[Retriever]
155
+ C --> D[Reranker]
156
+ D --> E[Generator]
157
+ E --> F[Evaluation]
158
+ F --> G[Optuna Tuner]
159
+ G -->|Best Params| B
160
+ ```
161
+
162
+ ---
163
+
164
+ ## 📘 Example Output
165
+
166
+ ```
167
+ [INFO] Starting Bayesian optimization with Optuna
168
+ [INFO] Trial 7 finished: recall=0.83, latency=0.42s
169
+ [INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
170
+ ```
171
+
172
+ ---
173
+
174
+ ## 🧠 Why Ragmint?
175
+
176
+ - Built for **RAG researchers**, **AI engineers**, and **LLM ops**
177
+ - Works with **LangChain**, **LlamaIndex**, or standalone RAG setups
178
+ - Designed for **extensibility** — plug in your own models, retrievers, or metrics
179
+
180
+ ---
181
+
182
+ ## ⚖️ License
183
+
184
+ Licensed under the **Apache License 2.0** — free for personal, research, and commercial use.
185
+
186
+ ---
187
+
188
+ ## 👤 Author
189
+
190
+ **André Oliveira**
191
+ [andyolivers.com](https://andyolivers.com)
192
+ Data Scientist | AI Engineer
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ragmint"
7
+ version = "0.1.0"
8
+ description = "A modular framework for evaluating and optimizing RAG pipelines."
9
+ readme = "README.md"
10
+ license = { text = "Apache License 2.0" }
11
+ authors = [
12
+ { name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
13
+ ]
14
+ keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
15
+ requires-python = ">=3.9"
16
+ dependencies = [
17
+ "numpy>=1.23",
18
+ "pandas>=2.0",
19
+ "scikit-learn>=1.3",
20
+ "openai>=1.0",
21
+ "tqdm",
22
+ "pyyaml",
23
+ "chromadb>=0.4",
24
+ "faiss-cpu; sys_platform != 'darwin'",
25
+ "optuna>=3.0",
26
+ "pytest",
27
+ "colorama"
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/andyolivers/ragmint"
32
+ Documentation = "https://andyolivers.com"
33
+ Issues = "https://github.com/andyolivers/ragmint/issues"
34
+
35
+ [tool.setuptools]
36
+ include-package-data = true
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.pytest.ini_options]
42
+ testpaths = ["tests"]
43
+ addopts = "-v"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,28 @@
1
+ from pathlib import Path
2
+ from ragmint.tuner import RAGMint
3
+
4
def main():
    """
    Run a Bayesian tuning search and print the best configuration.

    The corpus directory and the validation file are looked up under an
    ``experiments`` folder that sits next to this module, so the paths
    resolve regardless of the current working directory.
    """
    experiments_dir = Path(__file__).resolve().parent / "experiments"

    tuner_options = {
        "docs_path": str(experiments_dir / "corpus"),
        "retrievers": ["faiss"],
        "embeddings": ["openai/text-embedding-3-small"],
        "rerankers": ["mmr"],
    }
    search_options = {
        "validation_set": str(experiments_dir / "validation_qa.json"),
        "metric": "faithfulness",
        "search_type": "bayesian",
        "trials": 10,
    }

    tuner = RAGMint(**tuner_options)
    # optimize() returns (best, results); only the best entry is reported here.
    best, _results = tuner.optimize(**search_options)

    print("Best config found:\n", best)
26
+
27
+ if __name__ == "__main__":
28
+ main()
File without changes
@@ -0,0 +1,22 @@
1
+ from typing import List
2
+
3
+
4
class Chunker:
    """
    Handles text chunking and splitting strategies:
    - Fixed size chunks
    - Overlapping windows

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 100):
        """
        Args:
            chunk_size: Maximum number of characters per chunk.
            overlap: Number of characters shared by neighbouring chunks.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        """
        Split ``text`` into overlapping fixed-size windows.

        Args:
            text: The string to split.

        Returns:
            The chunks in order of appearance; an empty list for empty text.
            The final chunk may be shorter than ``chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``.
        """
        step = self.chunk_size - self.overlap
        # A non-positive step would never advance the window, i.e. the loop
        # would run forever while appending chunks; fail loudly instead.
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and overlap must be smaller than "
                f"chunk_size (got chunk_size={self.chunk_size}, overlap={self.overlap})"
            )
        return [
            text[start:start + self.chunk_size]
            for start in range(0, len(text), step)
        ]
@@ -0,0 +1,19 @@
1
+ import numpy as np
2
+
3
+
4
class EmbeddingModel:
    """
    Wrapper for embedding backends (OpenAI, HuggingFace, etc.)

    No real backend is integrated yet: whatever ``backend`` is selected,
    ``encode`` returns random placeholder vectors.
    """

    def __init__(self, backend: str = "dummy", dim: int = 768):
        """
        Args:
            backend: Name of the embedding backend (e.g. "openai",
                "huggingface"). Currently stored but not acted upon.
            dim: Length of each returned vector.
        """
        self.backend = backend
        self.dim = dim

    def encode(self, texts):
        """
        Return one placeholder embedding per item in ``texts``.

        Args:
            texts: An iterable of inputs; only the number of items matters.

        Returns:
            A list of 1-D NumPy arrays of length ``dim`` with values drawn
            uniformly from [0, 1) using NumPy's global random state.
        """
        # Placeholder — integrate the actual OpenAI / HuggingFace APIs here
        # and dispatch on self.backend once they exist.
        return [np.random.rand(self.dim) for _ in texts]
@@ -0,0 +1,27 @@
1
+ import time
2
+ from typing import Dict, Any
3
+ from difflib import SequenceMatcher
4
+
5
+
6
class Evaluator:
    """
    Simple evaluation of generated answers:
    - Faithfulness (similarity between answer and context)
    - Latency (time spent computing the faithfulness score)
    """

    def __init__(self):
        pass

    def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
        """
        Score an answer against the context it was derived from.

        Args:
            query: The user query. Accepted for interface symmetry; it is not
                used in the computation.
            answer: The answer text to evaluate.
            context: The context text the answer is compared with.

        Returns:
            A dict with "faithfulness" (similarity ratio in [0, 1]) and
            "latency" (seconds taken by the similarity computation).
        """
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the adjustable wall clock and can yield zero or negative intervals.
        start = time.perf_counter()
        faithfulness = self._similarity(answer, context)
        latency = time.perf_counter() - start

        return {
            "faithfulness": faithfulness,
            "latency": latency,
        }

    def _similarity(self, a: str, b: str) -> float:
        """Return difflib's SequenceMatcher ratio between ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()
@@ -0,0 +1,38 @@
1
+ from typing import Any, Dict, List
2
+ from .retriever import Retriever
3
+ from .reranker import Reranker
4
+ from .evaluation import Evaluator
5
+
6
+
7
class RAGPipeline:
    """
    Retrieve -> rerank -> evaluate pipeline.

    There is no generation step: the text of the top reranked document is
    used as a stand-in for the answer.
    """

    def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
        self.retriever = retriever
        self.reranker = reranker
        self.evaluator = evaluator

    def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """
        Execute the pipeline for a single query.

        Args:
            query: The question to look up.
            top_k: Number of documents requested from the retriever.

        Returns:
            A dict with the "query", the pseudo "answer", the reranked
            "docs", and the evaluator's "metrics".
        """
        candidates = self.retriever.retrieve(query, top_k=top_k)
        ranked = self.reranker.rerank(query, candidates)

        # The best-ranked document doubles as the answer; empty when nothing
        # was returned.
        answer = ranked[0]["text"] if ranked else ""

        # All reranked texts, one per line, form the evaluation context.
        context = "\n".join(doc["text"] for doc in ranked)
        metrics = self.evaluator.evaluate(query, answer, context)

        return dict(query=query, answer=answer, docs=ranked, metrics=metrics)
@@ -0,0 +1,62 @@
1
+ from typing import List, Dict, Any
2
+ import numpy as np
3
+
4
+
5
class Reranker:
    """
    Reorders retrieved documents.

    Supports:
    - MMR (Maximal Marginal Relevance)
    - Dummy CrossEncoder (for demonstration)

    Documents are dicts that carry at least a "text" string and a numeric
    "score".
    """

    def __init__(self, mode: str = "mmr", lambda_param: float = 0.5, seed: int = 42):
        """
        Args:
            mode: "crossencoder" selects the dummy cross-encoder; any other
                value selects MMR.
            lambda_param: MMR trade-off; 1.0 ranks purely by score, lower
                values penalise similarity to already selected documents.
            seed: Seed for the random perturbation used in crossencoder mode.
        """
        self.mode = mode
        self.lambda_param = lambda_param
        # Retained for backward compatibility: callers may rely on the global
        # NumPy RNG being seeded when a Reranker is constructed.
        np.random.seed(seed)
        # Own generator so reranking results do not depend on other code
        # drawing from the global RNG in between.
        self._rng = np.random.RandomState(seed)

    def rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Reorder ``docs`` according to the configured mode.

        Args:
            query: The user query (unused by the dummy strategies).
            docs: Retrieved documents to reorder.

        Returns:
            A new list with every input document exactly once; an empty list
            when ``docs`` is empty.
        """
        if not docs:
            return []

        if self.mode == "crossencoder":
            return self._crossencoder_rerank(query, docs)
        return self._mmr_rerank(query, docs)

    def _mmr_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Perform MMR reranking using dummy similarity scores."""
        selected: List[Dict[str, Any]] = []
        remaining = list(docs)

        while remaining:
            if not selected:
                # Seed the selection with the highest base score.
                best_idx = max(
                    range(len(remaining)), key=lambda i: remaining[i]["score"]
                )
            else:
                # MMR: reward relevance, penalise the closest match among the
                # documents that were already picked.
                mmr_scores = [
                    self.lambda_param * d["score"]
                    - (1 - self.lambda_param)
                    * max(self._similarity(d["text"], s["text"]) for s in selected)
                    for d in remaining
                ]
                best_idx = int(np.argmax(mmr_scores))
            # Pop by index so the exact chosen element is removed, rather than
            # the first element that merely compares equal.
            selected.append(remaining.pop(best_idx))

        return selected

    def _crossencoder_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Adds a small random perturbation to simulate crossencoder reranking.

        The perturbed scores are written to shallow copies, so the caller's
        documents keep their original scores and repeated calls do not
        accumulate noise.
        """
        perturbed = [
            {**d, "score": d["score"] + self._rng.uniform(0, 0.1)} for d in docs
        ]
        return sorted(perturbed, key=lambda d: d["score"], reverse=True)

    def _similarity(self, a: str, b: str) -> float:
        """
        Dummy similarity function between two strings.

        Returns a value in [0.0, 0.99] derived from a CRC32 of ``a + b``.
        The built-in ``hash`` is salted per process for strings, so it would
        not give reproducible rankings across runs. Like the concatenation
        it is based on, the result depends on argument order.
        """
        import zlib

        return zlib.crc32((a + b).encode("utf-8")) % 100 / 100.0