ragmint 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragmint might be problematic.
- ragmint-0.2.1/LICENSE +19 -0
- ragmint-0.2.1/PKG-INFO +27 -0
- ragmint-0.2.1/pyproject.toml +49 -0
- ragmint-0.2.1/setup.cfg +4 -0
- ragmint-0.2.1/src/ragmint/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/__main__.py +28 -0
- ragmint-0.2.1/src/ragmint/autotuner.py +33 -0
- ragmint-0.2.1/src/ragmint/core/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/core/chunking.py +22 -0
- ragmint-0.2.1/src/ragmint/core/embeddings.py +19 -0
- ragmint-0.2.1/src/ragmint/core/evaluation.py +38 -0
- ragmint-0.2.1/src/ragmint/core/pipeline.py +38 -0
- ragmint-0.2.1/src/ragmint/core/reranker.py +62 -0
- ragmint-0.2.1/src/ragmint/core/retriever.py +33 -0
- ragmint-0.2.1/src/ragmint/experiments/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/experiments/validation_qa.json +14 -0
- ragmint-0.2.1/src/ragmint/explainer.py +61 -0
- ragmint-0.2.1/src/ragmint/leaderboard.py +45 -0
- ragmint-0.2.1/src/ragmint/optimization/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/optimization/search.py +48 -0
- ragmint-0.2.1/src/ragmint/tests/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/tests/conftest.py +16 -0
- ragmint-0.2.1/src/ragmint/tests/test_autotuner.py +42 -0
- ragmint-0.2.1/src/ragmint/tests/test_explainer.py +20 -0
- ragmint-0.2.1/src/ragmint/tests/test_explainer_integration.py +18 -0
- ragmint-0.2.1/src/ragmint/tests/test_integration_autotuner_ragmint.py +60 -0
- ragmint-0.2.1/src/ragmint/tests/test_leaderboard.py +39 -0
- ragmint-0.2.1/src/ragmint/tests/test_pipeline.py +19 -0
- ragmint-0.2.1/src/ragmint/tests/test_retriever.py +14 -0
- ragmint-0.2.1/src/ragmint/tests/test_search.py +17 -0
- ragmint-0.2.1/src/ragmint/tests/test_tuner.py +71 -0
- ragmint-0.2.1/src/ragmint/tuner.py +123 -0
- ragmint-0.2.1/src/ragmint/utils/__init__.py +0 -0
- ragmint-0.2.1/src/ragmint/utils/caching.py +37 -0
- ragmint-0.2.1/src/ragmint/utils/data_loader.py +65 -0
- ragmint-0.2.1/src/ragmint/utils/logger.py +36 -0
- ragmint-0.2.1/src/ragmint/utils/metrics.py +27 -0
- ragmint-0.2.1/src/ragmint.egg-info/PKG-INFO +27 -0
- ragmint-0.2.1/src/ragmint.egg-info/SOURCES.txt +40 -0
- ragmint-0.2.1/src/ragmint.egg-info/dependency_links.txt +1 -0
- ragmint-0.2.1/src/ragmint.egg-info/requires.txt +15 -0
- ragmint-0.2.1/src/ragmint.egg-info/top_level.txt +1 -0
ragmint-0.2.1/LICENSE
ADDED
@@ -0,0 +1,19 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   Copyright 2025 André Oliveira

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
ragmint-0.2.1/PKG-INFO
ADDED
@@ -0,0 +1,27 @@
Metadata-Version: 2.4
Name: ragmint
Version: 0.2.1
Summary: A modular framework for evaluating and optimizing RAG pipelines.
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
License: Apache License 2.0
Project-URL: Homepage, https://github.com/andyolivers/ragmint
Project-URL: Documentation, https://andyolivers.com
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.23
Requires-Dist: pandas>=2.0
Requires-Dist: scikit-learn>=1.3
Requires-Dist: openai>=1.0
Requires-Dist: tqdm
Requires-Dist: pyyaml
Requires-Dist: chromadb>=0.4
Requires-Dist: faiss-cpu; sys_platform != "darwin"
Requires-Dist: optuna>=3.0
Requires-Dist: pytest
Requires-Dist: colorama
Requires-Dist: google-generativeai>=0.8.0
Requires-Dist: supabase>=2.4.0
Dynamic: license-file
ragmint-0.2.1/pyproject.toml
ADDED
@@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "ragmint"
version = "0.2.1"
description = "A modular framework for evaluating and optimizing RAG pipelines."
readme = "README.md"
license = { text = "Apache License 2.0" }
authors = [
    { name = "Andre Oliveira", email = "oandreoliveira@outlook.com" }
]
keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
requires-python = ">=3.9"
dependencies = [
    "numpy>=1.23",
    "pandas>=2.0",
    "scikit-learn>=1.3",
    "openai>=1.0",
    "tqdm",
    "pyyaml",
    "chromadb>=0.4",
    "faiss-cpu; sys_platform != 'darwin'",
    "optuna>=3.0",
    "pytest",
    "colorama",
    "google-generativeai>=0.8.0",
    "supabase>=2.4.0"
]

[project.urls]
Homepage = "https://github.com/andyolivers/ragmint"
Documentation = "https://andyolivers.com"
Issues = "https://github.com/andyolivers/ragmint/issues"

[tool.setuptools]
include-package-data = true

[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-v"

[tool.setuptools.package-data]
ragmint = ["experiments/*.json"]
ragmint-0.2.1/setup.cfg
ADDED
File without changes
ragmint-0.2.1/src/ragmint/__main__.py
ADDED
@@ -0,0 +1,28 @@
from pathlib import Path
from ragmint.tuner import RAGMint

def main():
    # Dynamically resolve the path to the installed ragmint package
    base_dir = Path(__file__).resolve().parent

    docs_path = base_dir / "experiments" / "corpus"
    validation_file = base_dir / "experiments" / "validation_qa.json"

    rag = RAGMint(
        docs_path=str(docs_path),
        retrievers=["faiss"],
        embeddings=["openai/text-embedding-3-small"],
        rerankers=["mmr"],
    )

    best, results = rag.optimize(
        validation_set=str(validation_file),
        metric="faithfulness",
        search_type="bayesian",
        trials=10,
    )

    print("Best config found:\n", best)

if __name__ == "__main__":
    main()
ragmint-0.2.1/src/ragmint/autotuner.py
ADDED
@@ -0,0 +1,33 @@
"""
Auto-RAG Tuner
--------------
Recommends retriever–embedding pairs dynamically based on corpus size
and dataset characteristics. Integrates seamlessly with RAGMint evaluator.
"""

from .core.evaluation import evaluate_config


class AutoRAGTuner:
    def __init__(self, corpus_stats: dict):
        """
        corpus_stats: dict
        Example: {'size': 12000, 'avg_len': 240}
        """
        self.corpus_stats = corpus_stats

    def recommend(self):
        size = self.corpus_stats.get("size", 0)
        avg_len = self.corpus_stats.get("avg_len", 0)

        if size < 1000:
            return {"retriever": "BM25", "embedding_model": "OpenAI"}
        elif size < 10000:
            return {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
        else:
            return {"retriever": "FAISS", "embedding_model": "InstructorXL"}

    def auto_tune(self, validation_data):
        config = self.recommend()
        results = evaluate_config(config, validation_data)
        return {"recommended": config, "results": results}
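For context, a minimal usage sketch of the AutoRAGTuner module above; illustrative only, not a file in this release, and the corpus statistics are made up:

from ragmint.autotuner import AutoRAGTuner

# Hypothetical corpus statistics: ~12k documents, ~240 tokens each
tuner = AutoRAGTuner({"size": 12000, "avg_len": 240})

# Rule-based recommendation: size >= 10000 selects the FAISS/InstructorXL branch
print(tuner.recommend())
# -> {'retriever': 'FAISS', 'embedding_model': 'InstructorXL'}

# auto_tune() additionally runs evaluate_config on a validation set
report = tuner.auto_tune(
    [{"query": "What is AI?", "answer": "Artificial Intelligence", "context": "AI is ..."}]
)
print(report["recommended"], report["results"])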
ragmint-0.2.1/src/ragmint/core/__init__.py
ADDED
File without changes
ragmint-0.2.1/src/ragmint/core/chunking.py
ADDED
@@ -0,0 +1,22 @@
from typing import List


class Chunker:
    """
    Handles text chunking and splitting strategies:
    - Fixed size chunks
    - Overlapping windows
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 100):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            start += self.chunk_size - self.overlap
        return chunks
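A short worked example of the overlapping-window arithmetic above (illustrative only, not part of the package): with chunk_size=500 and overlap=100 the window start advances by 400 characters per chunk.

from ragmint.core.chunking import Chunker

chunker = Chunker(chunk_size=500, overlap=100)
text = "x" * 1200
chunks = chunker.chunk_text(text)

# Windows cover [0:500], [400:900], [800:1200]; the last chunk is truncated at len(text)
print([len(c) for c in chunks])  # -> [500, 500, 400]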
ragmint-0.2.1/src/ragmint/core/embeddings.py
ADDED
@@ -0,0 +1,19 @@
import numpy as np


class EmbeddingModel:
    """
    Wrapper for embedding backends (OpenAI, HuggingFace, etc.)
    """

    def __init__(self, backend: str = "dummy"):
        self.backend = backend

    def encode(self, texts):
        if self.backend == "openai":
            # Example placeholder — integrate with actual OpenAI API
            return [np.random.rand(768) for _ in texts]
        elif self.backend == "huggingface":
            return [np.random.rand(768) for _ in texts]
        else:
            return [np.random.rand(768) for _ in texts]
ragmint-0.2.1/src/ragmint/core/evaluation.py
ADDED
@@ -0,0 +1,38 @@
import time
from typing import Dict, Any
from difflib import SequenceMatcher


class Evaluator:
    """
    Simple evaluation of generated answers:
    - Faithfulness (similarity between answer and context)
    - Latency
    """

    def __init__(self):
        pass

    def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
        start = time.time()
        faithfulness = self._similarity(answer, context)
        latency = time.time() - start

        return {
            "faithfulness": faithfulness,
            "latency": latency,
        }

    def _similarity(self, a: str, b: str) -> float:
        return SequenceMatcher(None, a, b).ratio()

def evaluate_config(config, validation_data):
    evaluator = Evaluator()
    results = []
    for sample in validation_data:
        query = sample.get("query", "")
        answer = sample.get("answer", "")
        context = sample.get("context", "")
        results.append(evaluator.evaluate(query, answer, context))
    return results
ragmint-0.2.1/src/ragmint/core/pipeline.py
ADDED
@@ -0,0 +1,38 @@
from typing import Any, Dict, List
from .retriever import Retriever
from .reranker import Reranker
from .evaluation import Evaluator


class RAGPipeline:
    """
    Core Retrieval-Augmented Generation pipeline.
    Simplified (no generator). It retrieves, reranks, and evaluates.
    """

    def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
        self.retriever = retriever
        self.reranker = reranker
        self.evaluator = evaluator

    def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        # Retrieve documents
        retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
        # Rerank
        reranked_docs = self.reranker.rerank(query, retrieved_docs)

        # Use top document as pseudo-answer
        if reranked_docs:
            answer = reranked_docs[0]["text"]
        else:
            answer = ""

        context = "\n".join([d["text"] for d in reranked_docs])
        metrics = self.evaluator.evaluate(query, answer, context)

        return {
            "query": query,
            "answer": answer,
            "docs": reranked_docs,
            "metrics": metrics,
        }
ragmint-0.2.1/src/ragmint/core/reranker.py
ADDED
@@ -0,0 +1,62 @@
from typing import List, Dict, Any
import numpy as np


class Reranker:
    """
    Supports:
    - MMR (Maximal Marginal Relevance)
    - Dummy CrossEncoder (for demonstration)
    """

    def __init__(self, mode: str = "mmr", lambda_param: float = 0.5, seed: int = 42):
        self.mode = mode
        self.lambda_param = lambda_param
        np.random.seed(seed)

    def rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        if not docs:
            return []

        if self.mode == "crossencoder":
            return self._crossencoder_rerank(query, docs)
        return self._mmr_rerank(query, docs)

    def _mmr_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Perform MMR reranking using dummy similarity scores."""
        selected = []
        remaining = docs.copy()

        while remaining and len(selected) < len(docs):
            if not selected:
                # pick doc with highest base score
                best = max(remaining, key=lambda d: d["score"])
            else:
                # MMR balancing between relevance and diversity
                mmr_scores = []
                for d in remaining:
                    max_div = max(
                        [self._similarity(d["text"], s["text"]) for s in selected],
                        default=0,
                    )
                    mmr_score = (
                        self.lambda_param * d["score"]
                        - (1 - self.lambda_param) * max_div
                    )
                    mmr_scores.append(mmr_score)
                best = remaining[int(np.argmax(mmr_scores))]
            selected.append(best)
            remaining.remove(best)

        return selected

    def _crossencoder_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Adds a small random perturbation to simulate crossencoder reranking."""
        for d in docs:
            d["score"] += np.random.uniform(0, 0.1)
        return sorted(docs, key=lambda d: d["score"], reverse=True)

    def _similarity(self, a: str, b: str) -> float:
        """Dummy similarity function between two strings."""
        # Deterministic pseudo-similarity based on hash
        return abs(hash(a + b)) % 100 / 100.0
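A minimal sketch of calling the MMR path above (illustrative only; the texts and base scores are made up): after the first pick, each step selects the document maximizing lambda_param * relevance minus (1 - lambda_param) * similarity to the already-selected set.

from ragmint.core.reranker import Reranker

docs = [
    {"text": "intro to RAG", "score": 0.9},
    {"text": "intro to RAG systems", "score": 0.85},
    {"text": "vector databases", "score": 0.6},
]

reranker = Reranker(mode="mmr", lambda_param=0.5)
ordered = reranker.rerank("what is RAG?", docs)

# The first pick is always the highest base score; later picks trade relevance for diversity
print([d["text"] for d in ordered])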
ragmint-0.2.1/src/ragmint/core/retriever.py
ADDED
@@ -0,0 +1,33 @@
from typing import List, Dict, Any
import numpy as np


class Retriever:
    """
    Simple vector retriever using cosine similarity.
    """

    def __init__(self, embeddings: List[np.ndarray], documents: List[str]):
        if len(embeddings) == 0:
            self.embeddings = np.zeros((1, 768))
        else:
            self.embeddings = np.array(embeddings)
        self.documents = documents or [""]

    def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        if self.embeddings.size == 0 or len(self.documents) == 0:
            return [{"text": "", "score": 0.0}]

        query_vec = self._embed(query)
        scores = self._cosine_similarity(query_vec, self.embeddings)
        top_indices = np.argsort(scores)[::-1][:min(top_k, len(scores))]
        return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]

    def _embed(self, query: str) -> np.ndarray:
        dim = self.embeddings.shape[1] if len(self.embeddings.shape) > 1 else 768
        return np.random.rand(dim)

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        a_norm = a / np.linalg.norm(a)
        b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
        return np.dot(b_norm, a_norm)
ragmint-0.2.1/src/ragmint/experiments/__init__.py
ADDED
File without changes
ragmint-0.2.1/src/ragmint/experiments/validation_qa.json
ADDED
@@ -0,0 +1,14 @@
[
    {
        "query": "What is Retrieval-Augmented Generation?",
        "expected_answer": "A technique that combines information retrieval with language generation to improve factual accuracy."
    },
    {
        "query": "What is the role of embeddings in a RAG system?",
        "expected_answer": "They represent text as numerical vectors for similarity-based retrieval."
    },
    {
        "query": "What is Maximal Marginal Relevance used for?",
        "expected_answer": "To select diverse and relevant documents during reranking."
    }
]
ragmint-0.2.1/src/ragmint/explainer.py
ADDED
@@ -0,0 +1,61 @@
"""
Interpretability Layer
----------------------
Uses Gemini or Anthropic Claude to explain why one RAG configuration
outperforms another. Falls back gracefully if no API key is provided.
"""

import os
import json


def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-pro") -> str:
    """
    Generate a natural-language explanation comparing two RAG experiment results.
    Priority:
    1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
    2. Google Gemini (if GOOGLE_API_KEY is set)
    3. Fallback text message
    """
    prompt = f"""
    You are an AI evaluation expert.
    Compare these two RAG experiment results and explain why one performs better.
    Metrics A: {json.dumps(results_a, indent=2)}
    Metrics B: {json.dumps(results_b, indent=2)}
    Provide a concise, human-friendly explanation and practical improvement tips.
    """

    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
    google_key = os.getenv("GEMINI_API_KEY")


    # 1️⃣ Try Anthropic Claude first
    if anthropic_key:
        try:
            from anthropic import Anthropic
            client = Anthropic(api_key=anthropic_key)
            response = client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=300,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.content[0].text
        except Exception as e:
            return f"[Claude unavailable] {e}"

    # 2️⃣ Fallback to Google Gemini
    elif google_key:
        try:
            import google.generativeai as genai
            genai.configure(api_key=google_key)
            response = genai.GenerativeModel(model).generate_content(prompt)
            return response.text
        except Exception as e:
            return f"[Gemini unavailable] {e}"

    # 3️⃣ Fallback if neither key is available
    else:
        return (
            "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
            "to enable interpretability via Claude or Gemini."
        )
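A minimal sketch of the fallback behavior above (illustrative only, not part of the release): with neither key set, the function returns the plain-text notice instead of calling an API.

import os
from ragmint.explainer import explain_results

# Ensure neither provider key is visible to the function
os.environ.pop("ANTHROPIC_API_KEY", None)
os.environ.pop("GEMINI_API_KEY", None)

msg = explain_results({"faithfulness": 0.82}, {"faithfulness": 0.74})
print(msg)  # "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY ..."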
ragmint-0.2.1/src/ragmint/leaderboard.py
ADDED
@@ -0,0 +1,45 @@
import os
import json
from datetime import datetime
from typing import Dict, Any, Optional
from supabase import create_client

class Leaderboard:
    def __init__(self, storage_path: Optional[str] = None):
        self.storage_path = storage_path
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")
        self.client = None
        if url and key:
            self.client = create_client(url, key)
        elif not storage_path:
            raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")

    def upload(self, run_id: str, config: Dict[str, Any], score: float):
        data = {
            "run_id": run_id,
            "config": config,
            "score": score,
            "timestamp": datetime.utcnow().isoformat(),
        }
        if self.client:
            return self.client.table("experiments").insert(data).execute()
        else:
            os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
            with open(self.storage_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(data) + "\n")
            return data

    def top_results(self, limit: int = 10):
        if self.client:
            return (
                self.client.table("experiments")
                .select("*")
                .order("score", desc=True)
                .limit(limit)
                .execute()
            )
        else:
            with open(self.storage_path, "r", encoding="utf-8") as f:
                lines = [json.loads(line) for line in f]
            return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
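A minimal local-file usage sketch for the class above (illustrative only; assumes SUPABASE_URL/SUPABASE_KEY are not set, and the path is hypothetical):

from ragmint.leaderboard import Leaderboard

# Local JSONL fallback; upload() calls os.makedirs on the parent directory,
# so the path should include one (e.g. "runs/leaderboard.jsonl").
lb = Leaderboard(storage_path="runs/leaderboard.jsonl")
lb.upload("run-1", {"retriever": "FAISS", "reranker": "mmr"}, 0.91)
lb.upload("run-2", {"retriever": "Chroma", "reranker": "mmr"}, 0.85)

print(lb.top_results(limit=1))  # highest-scoring entry first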
ragmint-0.2.1/src/ragmint/optimization/__init__.py
ADDED
File without changes
ragmint-0.2.1/src/ragmint/optimization/search.py
ADDED
@@ -0,0 +1,48 @@
import itertools
import random
import logging
from typing import Dict, List, Iterator, Any

logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


class GridSearch:
    def __init__(self, search_space: Dict[str, List[Any]]):
        keys = list(search_space.keys())
        values = list(search_space.values())
        self.combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for combo in self.combinations:
            yield combo


class RandomSearch:
    def __init__(self, search_space: Dict[str, List[Any]], n_trials: int = 10):
        self.search_space = search_space
        self.n_trials = n_trials

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        keys = list(self.search_space.keys())
        for _ in range(self.n_trials):
            yield {k: random.choice(self.search_space[k]) for k in keys}


class BayesianSearch:
    def __init__(self, search_space: Dict[str, List[Any]]):
        try:
            import optuna
            self.optuna = optuna
        except ImportError:
            raise RuntimeError("Optuna not installed; use GridSearch or RandomSearch instead.")
        self.search_space = search_space

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        keys = list(self.search_space.keys())

        def objective(trial):
            return {k: trial.suggest_categorical(k, self.search_space[k]) for k in keys}

        # Example static 5-trial yield for compatibility
        for _ in range(5):
            yield {k: random.choice(self.search_space[k]) for k in keys}
ragmint-0.2.1/src/ragmint/tests/__init__.py
ADDED
File without changes
ragmint-0.2.1/src/ragmint/tests/conftest.py
ADDED
@@ -0,0 +1,16 @@
# src/ragmint/tests/conftest.py
import os
from dotenv import load_dotenv
import pytest

# Load .env from project root
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "../../../.env"))

def pytest_configure(config):
    """Print which keys are loaded (debug)."""
    google = os.getenv("GEMINI_API_KEY")
    anthropic = os.getenv("ANTHROPIC_API_KEY")
    if google:
        print("✅ GOOGLE_API_KEY loaded")
    if anthropic:
        print("✅ ANTHROPIC_API_KEY loaded")
ragmint-0.2.1/src/ragmint/tests/test_autotuner.py
ADDED
@@ -0,0 +1,42 @@
import pytest
from ragmint.autotuner import AutoRAGTuner


def test_autorag_recommend_small():
    """Small corpus should trigger BM25 + OpenAI."""
    tuner = AutoRAGTuner({"size": 500, "avg_len": 150})
    rec = tuner.recommend()
    assert rec["retriever"] == "BM25"
    assert rec["embedding_model"] == "OpenAI"


def test_autorag_recommend_medium():
    """Medium corpus should trigger Chroma + SentenceTransformers."""
    tuner = AutoRAGTuner({"size": 5000, "avg_len": 200})
    rec = tuner.recommend()
    assert rec["retriever"] == "Chroma"
    assert rec["embedding_model"] == "SentenceTransformers"


def test_autorag_recommend_large():
    """Large corpus should trigger FAISS + InstructorXL."""
    tuner = AutoRAGTuner({"size": 50000, "avg_len": 300})
    rec = tuner.recommend()
    assert rec["retriever"] == "FAISS"
    assert rec["embedding_model"] == "InstructorXL"


def test_autorag_auto_tune(monkeypatch):
    """Test auto_tune with a mock validation dataset."""
    tuner = AutoRAGTuner({"size": 12000, "avg_len": 250})

    # Monkeypatch evaluate_config inside autotuner
    import ragmint.autotuner as autotuner
    def mock_eval(config, data):
        return {"faithfulness": 0.9, "latency": 0.01}
    monkeypatch.setattr(autotuner, "evaluate_config", mock_eval)

    result = tuner.auto_tune([{"question": "What is AI?", "answer": "Artificial Intelligence"}])
    assert "recommended" in result
    assert "results" in result
    assert isinstance(result["results"], dict)
ragmint-0.2.1/src/ragmint/tests/test_explainer.py
ADDED
@@ -0,0 +1,20 @@
import pytest
from ragmint.explainer import explain_results


def test_explain_results_gemini():
    """Gemini explanation should contain model-specific phrasing."""
    config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
    config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
    result = explain_results(config_a, config_b, model="gemini")
    assert isinstance(result, str)
    assert "Gemini" in result or "gemini" in result


def test_explain_results_claude():
    """Claude explanation should contain model-specific phrasing."""
    config_a = {"retriever": "FAISS"}
    config_b = {"retriever": "Chroma"}
    result = explain_results(config_a, config_b, model="claude")
    assert isinstance(result, str)
    assert "Claude" in result or "claude" in result
ragmint-0.2.1/src/ragmint/tests/test_explainer_integration.py
ADDED
@@ -0,0 +1,18 @@
import os
import pytest
from ragmint.explainer import explain_results


@pytest.mark.integration
def test_real_gemini_explanation():
    """Run real Gemini call if GOOGLE_API_KEY is set."""
    if not os.getenv("GEMINI_API_KEY"):
        pytest.skip("GOOGLE_API_KEY not set")

    config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
    config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}

    result = explain_results(config_a, config_b, model="gemini-1.5-pro")
    assert isinstance(result, str)
    assert len(result) > 0
    print("\n[Gemini explanation]:", result[:200], "...")
ragmint-0.2.1/src/ragmint/tests/test_integration_autotuner_ragmint.py
ADDED
@@ -0,0 +1,60 @@
import pytest
from ragmint.tuner import RAGMint
from ragmint.autotuner import AutoRAGTuner


def test_integration_ragmint_autotune(monkeypatch, tmp_path):
    """
    Smoke test for integration between AutoRAGTuner and RAGMint.
    Ensures end-to-end flow runs without real retrievers or embeddings.
    """

    # --- Mock corpus and validation data ---
    corpus = tmp_path / "docs"
    corpus.mkdir()
    (corpus / "doc1.txt").write_text("This is an AI document.")
    validation_data = [{"question": "What is AI?", "answer": "Artificial Intelligence"}]

    # --- Mock RAGMint.optimize() to avoid real model work ---
    def mock_optimize(self, validation_set=None, metric="faithfulness", trials=2):
        return (
            {"retriever": "FAISS", "embedding_model": "OpenAI", "score": 0.88},
            [{"trial": 1, "score": 0.88}],
        )

    monkeypatch.setattr(RAGMint, "optimize", mock_optimize)

    # --- Mock evaluation used by AutoRAGTuner ---
    def mock_evaluate_config(config, data):
        return {"faithfulness": 0.9, "latency": 0.01}

    import ragmint.autotuner as autotuner
    monkeypatch.setattr(autotuner, "evaluate_config", mock_evaluate_config)

    # --- Create AutoRAGTuner and RAGMint instances ---
    ragmint = RAGMint(
        docs_path=str(corpus),
        retrievers=["faiss", "chroma"],
        embeddings=["text-embedding-3-small"],
        rerankers=["mmr"],
    )

    tuner = AutoRAGTuner({"size": 2000, "avg_len": 150})

    # --- Run Auto-Tune and RAG Optimization ---
    recommendation = tuner.recommend()
    assert "retriever" in recommendation
    assert "embedding_model" in recommendation

    tuning_results = tuner.auto_tune(validation_data)
    assert "results" in tuning_results
    assert isinstance(tuning_results["results"], dict)

    # --- Run RAGMint optimization flow (mocked) ---
    best_config, results = ragmint.optimize(validation_set=validation_data, trials=2)
    assert isinstance(best_config, dict)
    assert "score" in best_config
    assert isinstance(results, list)

    # --- Integration Success ---
    print(f"Integration OK: AutoRAG recommended {recommendation}, RAGMint best {best_config}")
ragmint-0.2.1/src/ragmint/tests/test_leaderboard.py
ADDED
@@ -0,0 +1,39 @@
import json
import tempfile
from pathlib import Path
from ragmint.leaderboard import Leaderboard


def test_leaderboard_add_and_top(tmp_path):
    """Ensure local leaderboard persistence works without Supabase."""
    file_path = tmp_path / "leaderboard.jsonl"
    lb = Leaderboard(storage_path=str(file_path))

    # Add two runs
    lb.upload("run1", {"retriever": "FAISS"}, 0.91)
    lb.upload("run2", {"retriever": "Chroma"}, 0.85)

    # Verify file content
    assert file_path.exists()
    with open(file_path, "r", encoding="utf-8") as f:
        lines = [json.loads(line) for line in f]
    assert len(lines) == 2

    # Get top results
    top = lb.top_results(limit=1)
    assert isinstance(top, list)
    assert len(top) == 1
    assert "score" in top[0]


def test_leaderboard_append_existing(tmp_path):
    """Ensure multiple uploads append properly."""
    file_path = tmp_path / "leaderboard.jsonl"
    lb = Leaderboard(storage_path=str(file_path))

    for i in range(3):
        lb.upload(f"run{i}", {"retriever": "BM25"}, 0.8 + i * 0.05)

    top = lb.top_results(limit=2)
    assert len(top) == 2
    assert top[0]["score"] >= top[1]["score"]
ragmint-0.2.1/src/ragmint/tests/test_pipeline.py
ADDED
@@ -0,0 +1,19 @@
import numpy as np
from ragmint.core.pipeline import RAGPipeline
from ragmint.core.retriever import Retriever
from ragmint.core.reranker import Reranker
from ragmint.core.evaluation import Evaluator


def test_pipeline_run():
    docs = ["doc1 text", "doc2 text"]
    embeddings = [np.random.rand(4) for _ in range(2)]
    retriever = Retriever(embeddings, docs)
    reranker = Reranker("mmr")
    evaluator = Evaluator()
    pipeline = RAGPipeline(retriever, reranker, evaluator)

    result = pipeline.run("what is doc1?")
    assert "query" in result
    assert "answer" in result
    assert "metrics" in result
ragmint-0.2.1/src/ragmint/tests/test_retriever.py
ADDED
@@ -0,0 +1,14 @@
import numpy as np
from ragmint.core.retriever import Retriever


def test_retrieve_basic():
    embeddings = [np.random.rand(5) for _ in range(3)]
    docs = ["doc A", "doc B", "doc C"]
    retriever = Retriever(embeddings, docs)

    results = retriever.retrieve("sample query", top_k=2)
    assert isinstance(results, list)
    assert len(results) == 2
    assert "text" in results[0]
    assert "score" in results[0]
ragmint-0.2.1/src/ragmint/tests/test_search.py
ADDED
@@ -0,0 +1,17 @@
from ragmint.optimization.search import GridSearch, RandomSearch


def test_grid_search_iterates():
    space = {"retriever": ["faiss"], "embedding_model": ["openai"], "reranker": ["mmr"]}
    search = GridSearch(space)
    combos = list(search)
    assert len(combos) == 1
    assert "retriever" in combos[0]


def test_random_search_n_trials():
    space = {"retriever": ["faiss", "bm25"], "embedding_model": ["openai", "st"], "reranker": ["mmr"]}
    search = RandomSearch(space, n_trials=5)
    combos = list(search)
    assert len(combos) == 5
    assert all("retriever" in c for c in combos)
ragmint-0.2.1/src/ragmint/tests/test_tuner.py
ADDED
@@ -0,0 +1,71 @@
import os
import json
import pytest
from ragmint.tuner import RAGMint


def setup_validation_file(tmp_path):
    """Create a temporary validation QA dataset."""
    data = [
        {"question": "What is AI?", "answer": "Artificial Intelligence"},
        {"question": "Define ML", "answer": "Machine Learning"}
    ]
    file = tmp_path / "validation_qa.json"
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f)
    return str(file)


def setup_docs(tmp_path):
    """Create a small document corpus for testing."""
    corpus = tmp_path / "corpus"
    corpus.mkdir()
    (corpus / "doc1.txt").write_text("This is about Artificial Intelligence.")
    (corpus / "doc2.txt").write_text("This text explains Machine Learning.")
    return str(corpus)


@pytest.mark.parametrize("validation_mode", [
    None,                      # Built-in dataset
    "data/custom_eval.json",   # Custom dataset path (mocked below)
])
def test_optimize_ragmint(tmp_path, validation_mode, monkeypatch):
    """Test RAGMint.optimize() with different dataset modes."""
    docs_path = setup_docs(tmp_path)
    val_file = setup_validation_file(tmp_path)

    # If using custom dataset, mock the path
    if validation_mode and "custom_eval" in validation_mode:
        custom_path = tmp_path / "custom_eval.json"
        os.rename(val_file, custom_path)
        validation_mode = str(custom_path)

    metric = "faithfulness"

    # Initialize RAGMint
    rag = RAGMint(
        docs_path=docs_path,
        retrievers=["faiss"],
        embeddings=["text-embedding-3-small"],
        rerankers=["mmr"]
    )

    # Run optimization
    best, results = rag.optimize(
        validation_set=validation_mode,
        metric=metric,
        trials=2
    )

    # Validate results
    assert isinstance(best, dict), "Best config should be a dict"
    assert isinstance(results, list), "Results should be a list of trials"
    assert len(results) > 0, "Optimization should produce results"

    # The best result can expose either 'score' or the metric name (e.g. 'faithfulness')
    assert any(k in best for k in ("score", metric)), \
        f"Best config should include either 'score' or '{metric}'"

    # Ensure the metric value is valid
    assert best.get(metric, best.get("score")) >= 0, \
        f"{metric} score should be non-negative"
ragmint-0.2.1/src/ragmint/tuner.py
ADDED
@@ -0,0 +1,123 @@
import os
import json
import logging
from typing import Any, Dict, List, Tuple, Optional
from time import perf_counter

from .core.pipeline import RAGPipeline
from .core.embeddings import EmbeddingModel
from .core.retriever import Retriever
from .core.reranker import Reranker
from .core.evaluation import Evaluator
from .optimization.search import GridSearch, RandomSearch, BayesianSearch

from .utils.data_loader import load_validation_set

logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


class RAGMint:
    """
    Main RAG pipeline optimizer and evaluator.
    """

    def __init__(
        self,
        docs_path: str,
        retrievers: List[str],
        embeddings: List[str],
        rerankers: List[str],
    ):
        self.docs_path = docs_path
        self.retrievers = retrievers
        self.embeddings = embeddings
        self.rerankers = rerankers

        self.documents: List[str] = self._load_docs()
        self.embeddings_cache: Dict[str, Any] = {}

    def _load_docs(self) -> List[str]:
        if not os.path.exists(self.docs_path):
            logging.warning(f"Corpus path not found: {self.docs_path}")
            return []
        docs = []
        for file in os.listdir(self.docs_path):
            if file.endswith(".txt") or file.endswith(".md") or file.endswith(".rst"):
                with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
                    docs.append(f.read())
        logging.info(f"Loaded {len(docs)} documents from {self.docs_path}")
        return docs

    def _embed_docs(self, model_name: str):
        if model_name in self.embeddings_cache:
            return self.embeddings_cache[model_name]

        model = EmbeddingModel(model_name)
        embeddings = model.encode(self.documents)
        self.embeddings_cache[model_name] = embeddings
        return embeddings

    def _build_pipeline(self, config: Dict[str, str]) -> RAGPipeline:
        emb_model = EmbeddingModel(config["embedding_model"])
        embeddings = self._embed_docs(config["embedding_model"])
        retriever = Retriever(embeddings, self.documents)
        reranker = Reranker(config["reranker"])
        evaluator = Evaluator()
        return RAGPipeline(retriever, reranker, evaluator)

    def _evaluate_config(
        self, config: Dict[str, Any], validation: List[Dict[str, str]], metric: str
    ) -> Dict[str, float]:
        pipeline = self._build_pipeline(config)

        scores = []
        start = perf_counter()
        for sample in validation:
            query = sample.get("question") or sample.get("query")
            reference = sample.get("answer")
            result = pipeline.run(query)
            score = result["metrics"].get(metric, 0.0)
            scores.append(score)
        elapsed = perf_counter() - start

        avg_score = sum(scores) / len(scores) if scores else 0.0
        return {metric: avg_score, "latency": elapsed / max(1, len(validation))}

    def optimize(
        self,
        validation_set: str,
        metric: str = "faithfulness",
        search_type: str = "random",
        trials: int = 10,
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
        validation = load_validation_set(validation_set or "default")

        search_space = {
            "retriever": self.retrievers,
            "embedding_model": self.embeddings,
            "reranker": self.rerankers,
        }

        logging.info(f"Starting {search_type} optimization with {trials} trials")

        try:
            if search_type == "grid":
                searcher = GridSearch(search_space)
            elif search_type == "bayesian":
                searcher = BayesianSearch(search_space)
            else:
                searcher = RandomSearch(search_space, n_trials=trials)
        except Exception as e:
            logging.warning(f"Falling back to RandomSearch due to missing deps: {e}")
            searcher = RandomSearch(search_space, n_trials=trials)

        results = []
        for config in searcher:
            metrics = self._evaluate_config(config, validation, metric)
            result = {**config, **metrics}
            results.append(result)
            logging.info(f"Tested config: {config} -> {metrics}")

        best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
        logging.info(f"✅ Best configuration found: {best}")
        return best, results
ragmint-0.2.1/src/ragmint/utils/__init__.py
ADDED
File without changes
ragmint-0.2.1/src/ragmint/utils/caching.py
ADDED
@@ -0,0 +1,37 @@
import os
import json
import hashlib
import pickle
from typing import Any


class Cache:
    """
    Simple file-based cache for embeddings or retrievals.
    """

    def __init__(self, cache_dir: str = ".ragmint_cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _hash_key(self, key: str) -> str:
        return hashlib.md5(key.encode()).hexdigest()

    def exists(self, key: str) -> bool:
        return os.path.exists(os.path.join(self.cache_dir, self._hash_key(key)))

    def get(self, key: str) -> Any:
        path = os.path.join(self.cache_dir, self._hash_key(key))
        if not os.path.exists(path):
            return None
        with open(path, "rb") as f:
            return pickle.load(f)

    def set(self, key: str, value: Any):
        path = os.path.join(self.cache_dir, self._hash_key(key))
        with open(path, "wb") as f:
            pickle.dump(value, f)

    def clear(self):
        for file in os.listdir(self.cache_dir):
            os.remove(os.path.join(self.cache_dir, file))
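A minimal sketch of the cache above (illustrative only; keys are MD5-hashed into filenames under the cache directory and values are pickled):

from ragmint.utils.caching import Cache

cache = Cache(cache_dir=".ragmint_cache")
cache.set("embeddings:openai:doc1", [0.1, 0.2, 0.3])

if cache.exists("embeddings:openai:doc1"):
    print(cache.get("embeddings:openai:doc1"))  # -> [0.1, 0.2, 0.3]

cache.clear()  # removes every cached file in the directory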
ragmint-0.2.1/src/ragmint/utils/data_loader.py
ADDED
@@ -0,0 +1,65 @@
import json
import csv
from typing import List, Dict
from pathlib import Path
import os

try:
    from datasets import load_dataset
except ImportError:
    load_dataset = None  # optional dependency

DEFAULT_VALIDATION_PATH = Path(__file__).parent.parent / "experiments" / "validation_qa.json"


def load_json(path: str) -> List[Dict]:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def load_csv(path: str) -> List[Dict]:
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        return list(reader)


def save_json(path: str, data: Dict):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_validation_set(path: str | None = None) -> List[Dict]:
    """
    Loads a validation dataset (QA pairs) from:
    - Built-in default JSON file
    - User-provided JSON or CSV
    - Hugging Face dataset by name
    """
    # Default behavior
    if path is None or path == "default":
        if not DEFAULT_VALIDATION_PATH.exists():
            raise FileNotFoundError(f"Default validation set not found at {DEFAULT_VALIDATION_PATH}")
        return load_json(DEFAULT_VALIDATION_PATH)

    # Hugging Face dataset
    if not os.path.exists(path) and load_dataset:
        try:
            dataset = load_dataset(path, split="validation")
            data = [
                {"question": q, "answer": a}
                for q, a in zip(dataset["question"], dataset["answers"])
            ]
            return data
        except Exception:
            pass  # fall through to file loading

    # Local file
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Validation file not found: {path}")

    if p.suffix.lower() == ".json":
        return load_json(path)
    elif p.suffix.lower() in [".csv", ".tsv"]:
        return load_csv(path)
    else:
        raise ValueError("Unsupported validation set format. Use JSON, CSV, or a Hugging Face dataset name.")
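A minimal sketch of the three loading paths above (illustrative only; the commented-out paths and dataset name are hypothetical):

from ragmint.utils.data_loader import load_validation_set

# 1) Built-in dataset bundled under experiments/validation_qa.json
default_set = load_validation_set("default")

# 2) Local JSON or CSV file (hypothetical path)
# custom_set = load_validation_set("data/custom_eval.json")

# 3) Hugging Face dataset name, if the optional `datasets` package is installed
# hf_set = load_validation_set("some/qa-dataset")

print(len(default_set), default_set[0])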
ragmint-0.2.1/src/ragmint/utils/logger.py
ADDED
@@ -0,0 +1,36 @@
import logging
from tqdm import tqdm


class Logger:
    """
    Centralized logger with optional tqdm integration and color formatting.
    """

    def __init__(self, name: str = "ragmint", level: int = logging.INFO):
        self.logger = logging.getLogger(name)
        self.logger.setLevel(level)

        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "\033[96m[%(asctime)s]\033[0m \033[93m%(levelname)s\033[0m: %(message)s",
                "%H:%M:%S",
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def info(self, msg: str):
        self.logger.info(msg)

    def warning(self, msg: str):
        self.logger.warning(msg)

    def error(self, msg: str):
        self.logger.error(msg)

    def progress(self, iterable, desc="Processing", total=None):
        return tqdm(iterable, desc=desc, total=total)

def get_logger(name: str = "ragmint") -> Logger:
    return Logger(name)
ragmint-0.2.1/src/ragmint/utils/metrics.py
ADDED
@@ -0,0 +1,27 @@
from typing import List
import numpy as np
from difflib import SequenceMatcher


def bleu_score(reference: str, candidate: str) -> float:
    """
    Simple BLEU-like precision approximation.
    """
    ref_tokens = reference.split()
    cand_tokens = candidate.split()
    if not cand_tokens:
        return 0.0

    matches = sum(1 for token in cand_tokens if token in ref_tokens)
    return matches / len(cand_tokens)


def rouge_l(reference: str, candidate: str) -> float:
    """
    Approximation of ROUGE-L using sequence matcher ratio.
    """
    return SequenceMatcher(None, reference, candidate).ratio()


def mean_score(scores: List[float]) -> float:
    return float(np.mean(scores)) if scores else 0.0
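A short worked example of the metric approximations above (illustrative only): bleu_score counts the fraction of candidate tokens that appear in the reference, and rouge_l reuses SequenceMatcher.

from ragmint.utils.metrics import bleu_score, rouge_l, mean_score

ref = "retrieval augmented generation improves factual accuracy"
cand = "retrieval generation improves accuracy"

print(bleu_score(ref, cand))          # all 4 candidate tokens appear in the reference -> 1.0
print(round(rouge_l(ref, cand), 2))   # character-level similarity ratio
print(mean_score([0.5, 0.7]))         # -> 0.6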
ragmint-0.2.1/src/ragmint.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,27 @@
Metadata-Version: 2.4
Name: ragmint
Version: 0.2.1
Summary: A modular framework for evaluating and optimizing RAG pipelines.
Author-email: Andre Oliveira <oandreoliveira@outlook.com>
License: Apache License 2.0
Project-URL: Homepage, https://github.com/andyolivers/ragmint
Project-URL: Documentation, https://andyolivers.com
Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.23
Requires-Dist: pandas>=2.0
Requires-Dist: scikit-learn>=1.3
Requires-Dist: openai>=1.0
Requires-Dist: tqdm
Requires-Dist: pyyaml
Requires-Dist: chromadb>=0.4
Requires-Dist: faiss-cpu; sys_platform != "darwin"
Requires-Dist: optuna>=3.0
Requires-Dist: pytest
Requires-Dist: colorama
Requires-Dist: google-generativeai>=0.8.0
Requires-Dist: supabase>=2.4.0
Dynamic: license-file
ragmint-0.2.1/src/ragmint.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,40 @@
LICENSE
pyproject.toml
src/ragmint/__init__.py
src/ragmint/__main__.py
src/ragmint/autotuner.py
src/ragmint/explainer.py
src/ragmint/leaderboard.py
src/ragmint/tuner.py
src/ragmint.egg-info/PKG-INFO
src/ragmint.egg-info/SOURCES.txt
src/ragmint.egg-info/dependency_links.txt
src/ragmint.egg-info/requires.txt
src/ragmint.egg-info/top_level.txt
src/ragmint/core/__init__.py
src/ragmint/core/chunking.py
src/ragmint/core/embeddings.py
src/ragmint/core/evaluation.py
src/ragmint/core/pipeline.py
src/ragmint/core/reranker.py
src/ragmint/core/retriever.py
src/ragmint/experiments/__init__.py
src/ragmint/experiments/validation_qa.json
src/ragmint/optimization/__init__.py
src/ragmint/optimization/search.py
src/ragmint/tests/__init__.py
src/ragmint/tests/conftest.py
src/ragmint/tests/test_autotuner.py
src/ragmint/tests/test_explainer.py
src/ragmint/tests/test_explainer_integration.py
src/ragmint/tests/test_integration_autotuner_ragmint.py
src/ragmint/tests/test_leaderboard.py
src/ragmint/tests/test_pipeline.py
src/ragmint/tests/test_retriever.py
src/ragmint/tests/test_search.py
src/ragmint/tests/test_tuner.py
src/ragmint/utils/__init__.py
src/ragmint/utils/caching.py
src/ragmint/utils/data_loader.py
src/ragmint/utils/logger.py
src/ragmint/utils/metrics.py
ragmint-0.2.1/src/ragmint.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@

ragmint-0.2.1/src/ragmint.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
ragmint