ragmint 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragmint might be problematic. Click here for more details.

ragmint/autotuner.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Auto-RAG Tuner
3
+ --------------
4
+ Recommends retriever–embedding pairs dynamically based on corpus size
5
+ and dataset characteristics. Integrates seamlessly with RAGMint evaluator.
6
+ """
7
+
8
+ from .core.evaluation import evaluate_config
9
+
10
+
11
+ class AutoRAGTuner:
12
+ def __init__(self, corpus_stats: dict):
13
+ """
14
+ corpus_stats: dict
15
+ Example: {'size': 12000, 'avg_len': 240}
16
+ """
17
+ self.corpus_stats = corpus_stats
18
+
19
+ def recommend(self):
20
+ size = self.corpus_stats.get("size", 0)
21
+ avg_len = self.corpus_stats.get("avg_len", 0)
22
+
23
+ if size < 1000:
24
+ return {"retriever": "BM25", "embedding_model": "OpenAI"}
25
+ elif size < 10000:
26
+ return {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
27
+ else:
28
+ return {"retriever": "FAISS", "embedding_model": "InstructorXL"}
29
+
30
+ def auto_tune(self, validation_data):
31
+ config = self.recommend()
32
+ results = evaluate_config(config, validation_data)
33
+ return {"recommended": config, "results": results}
@@ -25,3 +25,14 @@ class Evaluator:
25
25
 
26
26
  def _similarity(self, a: str, b: str) -> float:
27
27
  return SequenceMatcher(None, a, b).ratio()
28
+
29
+ def evaluate_config(config, validation_data):
30
+ evaluator = Evaluator()
31
+ results = []
32
+ for sample in validation_data:
33
+ query = sample.get("query", "")
34
+ answer = sample.get("answer", "")
35
+ context = sample.get("context", "")
36
+ results.append(evaluator.evaluate(query, answer, context))
37
+ return results
38
+
ragmint/explainer.py ADDED
@@ -0,0 +1,61 @@
1
+ """
2
+ Interpretability Layer
3
+ ----------------------
4
+ Uses Gemini or Anthropic Claude to explain why one RAG configuration
5
+ outperforms another. Falls back gracefully if no API key is provided.
6
+ """
7
+
8
+ import os
9
+ import json
10
+
11
+
12
+ def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-pro") -> str:
13
+ """
14
+ Generate a natural-language explanation comparing two RAG experiment results.
15
+ Priority:
16
+ 1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
17
+ 2. Google Gemini (if GEMINI_API_KEY is set)
18
+ 3. Fallback text message
19
+ """
20
+ prompt = f"""
21
+ You are an AI evaluation expert.
22
+ Compare these two RAG experiment results and explain why one performs better.
23
+ Metrics A: {json.dumps(results_a, indent=2)}
24
+ Metrics B: {json.dumps(results_b, indent=2)}
25
+ Provide a concise, human-friendly explanation and practical improvement tips.
26
+ """
27
+
28
+ anthropic_key = os.getenv("ANTHROPIC_API_KEY")
29
+ google_key = os.getenv("GEMINI_API_KEY")
30
+
31
+
32
+ # 1️⃣ Try Anthropic Claude first
33
+ if anthropic_key:
34
+ try:
35
+ from anthropic import Anthropic
36
+ client = Anthropic(api_key=anthropic_key)
37
+ response = client.messages.create(
38
+ model="claude-3-opus-20240229",
39
+ max_tokens=300,
40
+ messages=[{"role": "user", "content": prompt}],
41
+ )
42
+ return response.content[0].text
43
+ except Exception as e:
44
+ return f"[Claude unavailable] {e}"
45
+
46
+ # 2️⃣ Fallback to Google Gemini
47
+ elif google_key:
48
+ try:
49
+ import google.generativeai as genai
50
+ genai.configure(api_key=google_key)
51
+ response = genai.GenerativeModel(model).generate_content(prompt)
52
+ return response.text
53
+ except Exception as e:
54
+ return f"[Gemini unavailable] {e}"
55
+
56
+ # 3️⃣ Fallback if neither key is available
57
+ else:
58
+ return (
59
+ "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
60
+ "to enable interpretability via Claude or Gemini."
61
+ )
ragmint/leaderboard.py ADDED
@@ -0,0 +1,45 @@
1
+ import os
2
+ import json
3
+ from datetime import datetime
4
+ from typing import Dict, Any, Optional
5
+ from supabase import create_client
6
+
7
+ class Leaderboard:
8
+ def __init__(self, storage_path: Optional[str] = None):
9
+ self.storage_path = storage_path
10
+ url = os.getenv("SUPABASE_URL")
11
+ key = os.getenv("SUPABASE_KEY")
12
+ self.client = None
13
+ if url and key:
14
+ self.client = create_client(url, key)
15
+ elif not storage_path:
16
+ raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")
17
+
18
+ def upload(self, run_id: str, config: Dict[str, Any], score: float):
19
+ data = {
20
+ "run_id": run_id,
21
+ "config": config,
22
+ "score": score,
23
+ "timestamp": datetime.utcnow().isoformat(),
24
+ }
25
+ if self.client:
26
+ return self.client.table("experiments").insert(data).execute()
27
+ else:
28
+ os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
29
+ with open(self.storage_path, "a", encoding="utf-8") as f:
30
+ f.write(json.dumps(data) + "\n")
31
+ return data
32
+
33
+ def top_results(self, limit: int = 10):
34
+ if self.client:
35
+ return (
36
+ self.client.table("experiments")
37
+ .select("*")
38
+ .order("score", desc=True)
39
+ .limit(limit)
40
+ .execute()
41
+ )
42
+ else:
43
+ with open(self.storage_path, "r", encoding="utf-8") as f:
44
+ lines = [json.loads(line) for line in f]
45
+ return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
@@ -0,0 +1,16 @@
1
+ # src/ragmint/tests/conftest.py
2
+ import os
3
+ from dotenv import load_dotenv
4
+ import pytest
5
+
6
+ # Load .env from project root
7
+ load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "../../../.env"))
8
+
9
+ def pytest_configure(config):
10
+ """Print which keys are loaded (debug)."""
11
+ google = os.getenv("GEMINI_API_KEY")
12
+ anthropic = os.getenv("ANTHROPIC_API_KEY")
13
+ if google:
14
+ print("✅ GOOGLE_API_KEY loaded")
15
+ if anthropic:
16
+ print("✅ ANTHROPIC_API_KEY loaded")
@@ -0,0 +1,42 @@
1
+ import pytest
2
+ from ragmint.autotuner import AutoRAGTuner
3
+
4
+
5
+ def test_autorag_recommend_small():
6
+ """Small corpus should trigger BM25 + OpenAI."""
7
+ tuner = AutoRAGTuner({"size": 500, "avg_len": 150})
8
+ rec = tuner.recommend()
9
+ assert rec["retriever"] == "BM25"
10
+ assert rec["embedding_model"] == "OpenAI"
11
+
12
+
13
+ def test_autorag_recommend_medium():
14
+ """Medium corpus should trigger Chroma + SentenceTransformers."""
15
+ tuner = AutoRAGTuner({"size": 5000, "avg_len": 200})
16
+ rec = tuner.recommend()
17
+ assert rec["retriever"] == "Chroma"
18
+ assert rec["embedding_model"] == "SentenceTransformers"
19
+
20
+
21
+ def test_autorag_recommend_large():
22
+ """Large corpus should trigger FAISS + InstructorXL."""
23
+ tuner = AutoRAGTuner({"size": 50000, "avg_len": 300})
24
+ rec = tuner.recommend()
25
+ assert rec["retriever"] == "FAISS"
26
+ assert rec["embedding_model"] == "InstructorXL"
27
+
28
+
29
+ def test_autorag_auto_tune(monkeypatch):
30
+ """Test auto_tune with a mock validation dataset."""
31
+ tuner = AutoRAGTuner({"size": 12000, "avg_len": 250})
32
+
33
+ # Monkeypatch evaluate_config inside autotuner
34
+ import ragmint.autotuner as autotuner
35
+ def mock_eval(config, data):
36
+ return {"faithfulness": 0.9, "latency": 0.01}
37
+ monkeypatch.setattr(autotuner, "evaluate_config", mock_eval)
38
+
39
+ result = tuner.auto_tune([{"question": "What is AI?", "answer": "Artificial Intelligence"}])
40
+ assert "recommended" in result
41
+ assert "results" in result
42
+ assert isinstance(result["results"], dict)
@@ -0,0 +1,20 @@
1
+ import pytest
2
+ from ragmint.explainer import explain_results
3
+
4
+
5
+ def test_explain_results_gemini():
6
+ """Gemini explanation should contain model-specific phrasing."""
7
+ config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
8
+ config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
9
+ result = explain_results(config_a, config_b, model="gemini")
10
+ assert isinstance(result, str)
11
+ assert "Gemini" in result or "gemini" in result
12
+
13
+
14
+ def test_explain_results_claude():
15
+ """Claude explanation should contain model-specific phrasing."""
16
+ config_a = {"retriever": "FAISS"}
17
+ config_b = {"retriever": "Chroma"}
18
+ result = explain_results(config_a, config_b, model="claude")
19
+ assert isinstance(result, str)
20
+ assert "Claude" in result or "claude" in result
@@ -0,0 +1,18 @@
1
+ import os
2
+ import pytest
3
+ from ragmint.explainer import explain_results
4
+
5
+
6
+ @pytest.mark.integration
7
+ def test_real_gemini_explanation():
8
+ """Run real Gemini call if GOOGLE_API_KEY is set."""
9
+ if not os.getenv("GEMINI_API_KEY"):
10
+ pytest.skip("GOOGLE_API_KEY not set")
11
+
12
+ config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
13
+ config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
14
+
15
+ result = explain_results(config_a, config_b, model="gemini-1.5-pro")
16
+ assert isinstance(result, str)
17
+ assert len(result) > 0
18
+ print("\n[Gemini explanation]:", result[:200], "...")
@@ -0,0 +1,60 @@
1
+ import pytest
2
+ from ragmint.tuner import RAGMint
3
+ from ragmint.autotuner import AutoRAGTuner
4
+
5
+
6
+ def test_integration_ragmint_autotune(monkeypatch, tmp_path):
7
+ """
8
+ Smoke test for integration between AutoRAGTuner and RAGMint.
9
+ Ensures end-to-end flow runs without real retrievers or embeddings.
10
+ """
11
+
12
+ # --- Mock corpus and validation data ---
13
+ corpus = tmp_path / "docs"
14
+ corpus.mkdir()
15
+ (corpus / "doc1.txt").write_text("This is an AI document.")
16
+ validation_data = [{"question": "What is AI?", "answer": "Artificial Intelligence"}]
17
+
18
+ # --- Mock RAGMint.optimize() to avoid real model work ---
19
+ def mock_optimize(self, validation_set=None, metric="faithfulness", trials=2):
20
+ return (
21
+ {"retriever": "FAISS", "embedding_model": "OpenAI", "score": 0.88},
22
+ [{"trial": 1, "score": 0.88}],
23
+ )
24
+
25
+ monkeypatch.setattr(RAGMint, "optimize", mock_optimize)
26
+
27
+ # --- Mock evaluation used by AutoRAGTuner ---
28
+ def mock_evaluate_config(config, data):
29
+ return {"faithfulness": 0.9, "latency": 0.01}
30
+
31
+ import ragmint.autotuner as autotuner
32
+ monkeypatch.setattr(autotuner, "evaluate_config", mock_evaluate_config)
33
+
34
+ # --- Create AutoRAGTuner and RAGMint instances ---
35
+ ragmint = RAGMint(
36
+ docs_path=str(corpus),
37
+ retrievers=["faiss", "chroma"],
38
+ embeddings=["text-embedding-3-small"],
39
+ rerankers=["mmr"],
40
+ )
41
+
42
+ tuner = AutoRAGTuner({"size": 2000, "avg_len": 150})
43
+
44
+ # --- Run Auto-Tune and RAG Optimization ---
45
+ recommendation = tuner.recommend()
46
+ assert "retriever" in recommendation
47
+ assert "embedding_model" in recommendation
48
+
49
+ tuning_results = tuner.auto_tune(validation_data)
50
+ assert "results" in tuning_results
51
+ assert isinstance(tuning_results["results"], dict)
52
+
53
+ # --- Run RAGMint optimization flow (mocked) ---
54
+ best_config, results = ragmint.optimize(validation_set=validation_data, trials=2)
55
+ assert isinstance(best_config, dict)
56
+ assert "score" in best_config
57
+ assert isinstance(results, list)
58
+
59
+ # --- Integration Success ---
60
+ print(f"Integration OK: AutoRAG recommended {recommendation}, RAGMint best {best_config}")
@@ -0,0 +1,39 @@
1
+ import json
2
+ import tempfile
3
+ from pathlib import Path
4
+ from ragmint.leaderboard import Leaderboard
5
+
6
+
7
+ def test_leaderboard_add_and_top(tmp_path):
8
+ """Ensure local leaderboard persistence works without Supabase."""
9
+ file_path = tmp_path / "leaderboard.jsonl"
10
+ lb = Leaderboard(storage_path=str(file_path))
11
+
12
+ # Add two runs
13
+ lb.upload("run1", {"retriever": "FAISS"}, 0.91)
14
+ lb.upload("run2", {"retriever": "Chroma"}, 0.85)
15
+
16
+ # Verify file content
17
+ assert file_path.exists()
18
+ with open(file_path, "r", encoding="utf-8") as f:
19
+ lines = [json.loads(line) for line in f]
20
+ assert len(lines) == 2
21
+
22
+ # Get top results
23
+ top = lb.top_results(limit=1)
24
+ assert isinstance(top, list)
25
+ assert len(top) == 1
26
+ assert "score" in top[0]
27
+
28
+
29
+ def test_leaderboard_append_existing(tmp_path):
30
+ """Ensure multiple uploads append properly."""
31
+ file_path = tmp_path / "leaderboard.jsonl"
32
+ lb = Leaderboard(storage_path=str(file_path))
33
+
34
+ for i in range(3):
35
+ lb.upload(f"run{i}", {"retriever": "BM25"}, 0.8 + i * 0.05)
36
+
37
+ top = lb.top_results(limit=2)
38
+ assert len(top) == 2
39
+ assert top[0]["score"] >= top[1]["score"]
@@ -1,9 +1,11 @@
1
1
  import os
2
2
  import json
3
+ import pytest
3
4
  from ragmint.tuner import RAGMint
4
5
 
5
6
 
6
7
  def setup_validation_file(tmp_path):
8
+ """Create a temporary validation QA dataset."""
7
9
  data = [
8
10
  {"question": "What is AI?", "answer": "Artificial Intelligence"},
9
11
  {"question": "Define ML", "answer": "Machine Learning"}
@@ -15,6 +17,7 @@ def setup_validation_file(tmp_path):
15
17
 
16
18
 
17
19
  def setup_docs(tmp_path):
20
+ """Create a small document corpus for testing."""
18
21
  corpus = tmp_path / "corpus"
19
22
  corpus.mkdir()
20
23
  (corpus / "doc1.txt").write_text("This is about Artificial Intelligence.")
@@ -22,17 +25,47 @@ def setup_docs(tmp_path):
22
25
  return str(corpus)
23
26
 
24
27
 
25
- def test_optimize_random(tmp_path):
28
+ @pytest.mark.parametrize("validation_mode", [
29
+ None, # Built-in dataset
30
+ "data/custom_eval.json", # Custom dataset path (mocked below)
31
+ ])
32
+ def test_optimize_ragmint(tmp_path, validation_mode, monkeypatch):
33
+ """Test RAGMint.optimize() with different dataset modes."""
26
34
  docs_path = setup_docs(tmp_path)
27
35
  val_file = setup_validation_file(tmp_path)
28
36
 
37
+ # If using custom dataset, mock the path
38
+ if validation_mode and "custom_eval" in validation_mode:
39
+ custom_path = tmp_path / "custom_eval.json"
40
+ os.rename(val_file, custom_path)
41
+ validation_mode = str(custom_path)
42
+
43
+ metric = "faithfulness"
44
+
45
+ # Initialize RAGMint
29
46
  rag = RAGMint(
30
47
  docs_path=docs_path,
31
48
  retrievers=["faiss"],
32
- embeddings=["openai/text-embedding-3-small"],
49
+ embeddings=["text-embedding-3-small"],
33
50
  rerankers=["mmr"]
34
51
  )
35
52
 
36
- best, results = rag.optimize(validation_set=val_file, metric="faithfulness", trials=2)
37
- assert isinstance(best, dict)
38
- assert isinstance(results, list)
53
+ # Run optimization
54
+ best, results = rag.optimize(
55
+ validation_set=validation_mode,
56
+ metric=metric,
57
+ trials=2
58
+ )
59
+
60
+ # Validate results
61
+ assert isinstance(best, dict), "Best config should be a dict"
62
+ assert isinstance(results, list), "Results should be a list of trials"
63
+ assert len(results) > 0, "Optimization should produce results"
64
+
65
+ # The best result can expose either 'score' or the metric name (e.g. 'faithfulness')
66
+ assert any(k in best for k in ("score", metric)), \
67
+ f"Best config should include either 'score' or '{metric}'"
68
+
69
+ # Ensure the metric value is valid
70
+ assert best.get(metric, best.get("score")) >= 0, \
71
+ f"{metric} score should be non-negative"
ragmint/tuner.py CHANGED
@@ -90,7 +90,7 @@ class RAGMint:
90
90
  search_type: str = "random",
91
91
  trials: int = 10,
92
92
  ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
93
- validation = load_validation_set(validation_set)
93
+ validation = load_validation_set(validation_set or "default")
94
94
 
95
95
  search_space = {
96
96
  "retriever": self.retrievers,
@@ -2,6 +2,14 @@ import json
2
2
  import csv
3
3
  from typing import List, Dict
4
4
  from pathlib import Path
5
+ import os
6
+
7
+ try:
8
+ from datasets import load_dataset
9
+ except ImportError:
10
+ load_dataset = None # optional dependency
11
+
12
+ DEFAULT_VALIDATION_PATH = Path(__file__).parent.parent / "experiments" / "validation_qa.json"
5
13
 
6
14
 
7
15
  def load_json(path: str) -> List[Dict]:
@@ -19,10 +27,32 @@ def save_json(path: str, data: Dict):
19
27
  with open(path, "w", encoding="utf-8") as f:
20
28
  json.dump(data, f, ensure_ascii=False, indent=2)
21
29
 
22
- def load_validation_set(path: str) -> List[Dict]:
30
+ def load_validation_set(path: str | None = None) -> List[Dict]:
23
31
  """
24
- Loads a validation dataset (QA pairs) from JSON or CSV.
32
+ Loads a validation dataset (QA pairs) from:
33
+ - Built-in default JSON file
34
+ - User-provided JSON or CSV
35
+ - Hugging Face dataset by name
25
36
  """
37
+ # Default behavior
38
+ if path is None or path == "default":
39
+ if not DEFAULT_VALIDATION_PATH.exists():
40
+ raise FileNotFoundError(f"Default validation set not found at {DEFAULT_VALIDATION_PATH}")
41
+ return load_json(DEFAULT_VALIDATION_PATH)
42
+
43
+ # Hugging Face dataset
44
+ if not os.path.exists(path) and load_dataset:
45
+ try:
46
+ dataset = load_dataset(path, split="validation")
47
+ data = [
48
+ {"question": q, "answer": a}
49
+ for q, a in zip(dataset["question"], dataset["answers"])
50
+ ]
51
+ return data
52
+ except Exception:
53
+ pass # fall through to file loading
54
+
55
+ # Local file
26
56
  p = Path(path)
27
57
  if not p.exists():
28
58
  raise FileNotFoundError(f"Validation file not found: {path}")
@@ -32,4 +62,4 @@ def load_validation_set(path: str) -> List[Dict]:
32
62
  elif p.suffix.lower() in [".csv", ".tsv"]:
33
63
  return load_csv(path)
34
64
  else:
35
- raise ValueError("Unsupported validation set format. Use JSON or CSV.")
65
+ raise ValueError("Unsupported validation set format. Use JSON, CSV, or a Hugging Face dataset name.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragmint
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
5
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
6
  License: Apache License 2.0
@@ -22,6 +22,8 @@ Requires-Dist: faiss-cpu; sys_platform != "darwin"
22
22
  Requires-Dist: optuna>=3.0
23
23
  Requires-Dist: pytest
24
24
  Requires-Dist: colorama
25
+ Requires-Dist: google-generativeai>=0.8.0
26
+ Requires-Dist: supabase>=2.4.0
25
27
  Dynamic: license-file
26
28
 
27
29
  # Ragmint
@@ -36,17 +38,19 @@ Dynamic: license-file
36
38
 
37
39
  **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
38
40
 
39
- It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**.
41
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**, **Auto-RAG tuning**, and **explainability** through Gemini or Claude.
40
42
 
41
43
  ---
42
44
 
43
45
  ## ✨ Features
44
46
 
45
47
  - ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
48
+ - 🤖 **Auto-RAG Tuner** — dynamically recommends retriever–embedding pairs based on corpus size
49
+ - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
50
+ - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs in a local JSON Lines file or in Supabase
46
51
  - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
47
52
  - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
48
53
  - 🧩 **Embeddings** — OpenAI, HuggingFace
49
- - 🧠 **Rerankers** — MMR, CrossEncoder (extensible via plugin interface)
50
54
  - 💾 **Caching, experiment tracking, and reproducibility** out of the box
51
55
  - 🧰 **Clean modular structure** for easy integration in research and production setups
52
56
 
@@ -103,47 +107,133 @@ print(result)
103
107
 
104
108
  ---
105
109
 
110
+ ## 🧪 Dataset Options
111
+
112
+ Ragmint can automatically load evaluation datasets for your RAG pipeline:
113
+
114
+ | Mode | Example | Description |
115
+ |------|----------|-------------|
116
+ | 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
117
+ | 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
118
+ | 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
119
+
120
+ ### Example
121
+
122
+ ```python
123
+ from ragmint.tuner import RAGMint
124
+
125
+ ragmint = RAGMint(
126
+ docs_path="data/docs/",
127
+ retrievers=["faiss", "chroma"],
128
+ embeddings=["text-embedding-3-small"],
129
+ rerankers=["mmr"],
130
+ )
131
+
132
+ # Use built-in default
133
+ ragmint.optimize(validation_set=None)
134
+
135
+ # Use Hugging Face benchmark
136
+ ragmint.optimize(validation_set="squad")
137
+
138
+ # Use your own dataset
139
+ ragmint.optimize(validation_set="data/custom_qa.json")
140
+ ```
141
+
142
+ ---
143
+
144
+ ## 🧠 Auto-RAG Tuner
145
+
146
+ The **AutoRAGTuner** automatically recommends retriever–embedding combinations
147
+ based on corpus size; `avg_len` is accepted in `corpus_stats` but does not yet influence the recommendation.
148
+
149
+ ```python
150
+ from ragmint.autotuner import AutoRAGTuner
151
+
152
+ corpus_stats = {"size": 5000, "avg_len": 250}
153
+ tuner = AutoRAGTuner(corpus_stats)
154
+ recommendation = tuner.recommend()
155
+ print(recommendation)
156
+ # Example output: {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
157
+ ```
158
+
159
+ ---
160
+
161
+ ## 🏆 Leaderboard Tracking
162
+
163
+ Track and rank your best experiments across runs.
164
+
165
+ ```python
166
+ from ragmint.leaderboard import Leaderboard
167
+
168
+ lb = Leaderboard(storage_path="experiments/leaderboard.jsonl")
169
+ lb.upload("run1", {"retriever": "FAISS"}, 0.87)
170
+ print(lb.top_results(limit=3))
171
+ ```
172
+
173
+ ---
174
+
175
+ ## 🧠 Explainability with Gemini / Claude
176
+
177
+ Compare two RAG configurations and receive natural language insights
178
+ on **why** one performs better.
179
+
180
+ ```python
181
+ from ragmint.explainer import explain_results
182
+
183
+ config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
184
+ config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
185
+
186
+ explanation = explain_results(config_a, config_b, model="gemini-1.5-pro")
187
+ print(explanation)
188
+ ```
189
+
190
+ > Set your API keys in a `.env` file or via environment variables:
191
+ > ```
192
+ > export GEMINI_API_KEY="your_gemini_key"
193
+ > export ANTHROPIC_API_KEY="your_claude_key"
194
+ > ```
195
+
196
+ ---
197
+
106
198
  ## 🧩 Folder Structure
107
199
 
108
200
  ```
109
201
  ragmint/
110
202
  ├── core/
111
- │ ├── pipeline.py # RAGPipeline implementation
112
- │ ├── retriever.py # Retriever logic (FAISS, Chroma)
113
- │ ├── reranker.py # MMR + CrossEncoder rerankers
114
- └── embedding.py # Embedding backends
115
- ├── tuner.py # Grid, Random, Bayesian optimization (Optuna)
116
- ├── utils/ # Metrics, logging, caching helpers
117
- ├── configs/ # Default experiment configs
118
- ├── experiments/ # Saved experiment results
119
- ├── tests/ # Unit tests for all components
120
- ├── main.py # CLI entrypoint for tuning
121
- └── pyproject.toml # Project dependencies & build metadata
203
+ │ ├── pipeline.py
204
+ │ ├── retriever.py
205
+ │ ├── reranker.py
206
+ │ ├── embeddings.py
207
+ │ └── evaluation.py
208
+ ├── autotuner.py
209
+ ├── explainer.py
210
+ ├── leaderboard.py
211
+ ├── tuner.py
212
+ ├── utils/
213
+ ├── configs/
214
+ ├── experiments/
215
+ ├── tests/
216
+ └── main.py
122
217
  ```
123
218
 
124
219
  ---
125
220
 
126
221
  ## 🧪 Running Tests
127
222
 
128
- To verify your setup:
129
-
130
223
  ```bash
131
224
  pytest -v
132
225
  ```
133
226
 
134
- Or to test a specific component (e.g., reranker):
135
-
227
+ To run only the integration tests that call the real Gemini or Claude APIs:
136
228
  ```bash
137
- pytest tests/test_reranker.py -v
229
+ pytest -m integration
138
230
  ```
139
231
 
140
- All tests are designed for **Pytest** and run with lightweight mock data.
141
-
142
232
  ---
143
233
 
144
234
  ## ⚙️ Configuration via `pyproject.toml`
145
235
 
146
- Your `pyproject.toml` automatically includes:
236
+ Your `pyproject.toml` includes all required dependencies:
147
237
 
148
238
  ```toml
149
239
  [project]
@@ -158,6 +248,8 @@ dependencies = [
158
248
  "pytest",
159
249
  "openai",
160
250
  "tqdm",
251
+ "google-generativeai",
252
+ "google-genai",
161
253
  ]
162
254
  ```
163
255
 
@@ -165,10 +257,10 @@ dependencies = [
165
257
 
166
258
  ## 📊 Example Experiment Workflow
167
259
 
168
- 1. Define your retriever and reranker configuration in YAML
169
- 2. Launch an optimization search (Grid, Random, or Bayesian)
170
- 3. Ragmint evaluates combinations automatically and reports top results
171
- 4. Export best parameters for production pipelines
260
+ 1. Define your retriever, embedding, and reranker setup
261
+ 2. Launch optimization (Grid, Random, Bayesian) or AutoTune
262
+ 3. Compare performance with explainability
263
+ 4. Persist results to leaderboard for later inspection
172
264
 
173
265
  ---
174
266
 
@@ -181,7 +273,7 @@ flowchart TD
181
273
  C --> D[Reranker]
182
274
  D --> E[Generator]
183
275
  E --> F[Evaluation]
184
- F --> G[Optuna Tuner]
276
+ F --> G[Optuna / AutoRAGTuner]
185
277
  G -->|Best Params| B
186
278
  ```
187
279
 
@@ -191,8 +283,9 @@ flowchart TD
191
283
 
192
284
  ```
193
285
  [INFO] Starting Bayesian optimization with Optuna
194
- [INFO] Trial 7 finished: recall=0.83, latency=0.42s
286
+ [INFO] Trial 7 finished: faithfulness=0.83, latency=0.42s
195
287
  [INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
288
+ [INFO] AutoRAGTuner: Suggested retriever=Chroma for medium corpus
196
289
  ```
197
290
 
198
291
  ---
@@ -200,8 +293,9 @@ flowchart TD
200
293
  ## 🧠 Why Ragmint?
201
294
 
202
295
  - Built for **RAG researchers**, **AI engineers**, and **LLM ops**
203
- - Works with **LangChain**, **LlamaIndex**, or standalone RAG setups
204
- - Designed for **extensibility** — plug in your own models, retrievers, or metrics
296
+ - Works with **LangChain**, **LlamaIndex**, or standalone setups
297
+ - Designed for **extensibility** — plug in your own retrievers, models, or metrics
298
+ - Integrated **explainability and leaderboard** modules for research and production
205
299
 
206
300
  ---
207
301
 
@@ -1,10 +1,13 @@
1
1
  ragmint/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  ragmint/__main__.py,sha256=q7hBn56Z1xAckbs03i8ynsuOzJVUXmod2qHddX7gkpc,729
3
- ragmint/tuner.py,sha256=sCUb-qGqk-lz4nUJboomwXFt3us7mYf3oJhwWV9Kzo4,4429
3
+ ragmint/autotuner.py,sha256=eXEH4e_3Os9FPX9y_0N7GnIQsmoHxmFbtjj7xanu17g,1064
4
+ ragmint/explainer.py,sha256=1glGNdC4GlwR6Qs8Bj1oOol7f5_db7Ksnh07HAp-A2c,2077
5
+ ragmint/leaderboard.py,sha256=nILQ5QR63RpZtCrZ__RFfwHXy4bkUIMUcSfH92OQ93Y,1628
6
+ ragmint/tuner.py,sha256=BLPZ66sVk3dh3Wj-GVUYRVmVtgXYTzv3oTQtKJeDlgE,4442
4
7
  ragmint/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
8
  ragmint/core/chunking.py,sha256=Dy9RYyapGSS6ik6Vg9lqbUPCFqSraU1JKpHbYUTkaFo,576
6
9
  ragmint/core/embeddings.py,sha256=6wJjfZ5ukr8G5bJJ1evjIqj0_FMbs_gq4xC-sBBqNlA,566
7
- ragmint/core/evaluation.py,sha256=LcR9AIsL9OyoENrUVSu0hhKzAItcBvEOy33V4i-0DtI,682
10
+ ragmint/core/evaluation.py,sha256=3OFcZU2zZyaP53d9S2zdpknV0CYfTq0KoRB3a_dtjM4,1022
8
11
  ragmint/core/pipeline.py,sha256=2qwGKuG0Du7gtIpieLFn71h_RcwBpjcV-h9PQz2ZOsc,1169
9
12
  ragmint/core/reranker.py,sha256=B2-NDExqpd9jdXHkEHOXC0B_6-FMJm5vdi-_ZbxC3Os,2303
10
13
  ragmint/core/retriever.py,sha256=jbpKy_fGdDq736y0es_utQuLqY9eiWNd71Q8JbU0Sko,1259
@@ -12,17 +15,23 @@ ragmint/experiments/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
12
15
  ragmint/optimization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
16
  ragmint/optimization/search.py,sha256=uiLJeoO_jaLCQEw99L6uI1rnqHHx_rTY81WxfMmlALs,1623
14
17
  ragmint/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ ragmint/tests/conftest.py,sha256=QhYPp5nrQ_DbZlsVH3nLjDgjPATAnLwzJkwl-Y-xrmM,488
19
+ ragmint/tests/test_autotuner.py,sha256=k5nsIH6MYB5zaocR_Wn1wTX-QDYfhH6ugx2chZu9Q8U,1500
20
+ ragmint/tests/test_explainer.py,sha256=K_DRnGGl34WcTA2yaQGmfzWkVi1uEkzjpsTPeZxXeIg,802
21
+ ragmint/tests/test_explainer_integration.py,sha256=tYT62fYqk616bjQ1VxHADVRfJ9vdF_CiF3cz4A9BdbE,620
22
+ ragmint/tests/test_integration_autotuner_ragmint.py,sha256=YCGge0_KOijAdB7VNDGHl2VRJjiOyl_-sJNRLjAXGLw,2182
23
+ ragmint/tests/test_leaderboard.py,sha256=ay81YK6KxAUU6mcG6n1_xV8GPYkBgjzJj9iAIyAzIzA,1163
15
24
  ragmint/tests/test_pipeline.py,sha256=MIMkEKelh-POlbXzbCc4ClMk8XCGzfuj569xXltziic,615
16
25
  ragmint/tests/test_retriever.py,sha256=Ag0uGW8-iMzKA4nJNnsjuzlQHa79sN-T-K1g1cdin-A,421
17
26
  ragmint/tests/test_search.py,sha256=FcC-DEnw9veAEyMnFoRw9DAwzqJC9F6-r63Nqo2nO58,598
18
- ragmint/tests/test_tuner.py,sha256=VFZ23og0dOypBpr3TxkRmSngilkNgyboZc6u9qB0pME,1101
27
+ ragmint/tests/test_tuner.py,sha256=LOvtIxAbUsoRHQudZ23UVr60FYAU0a1SBNvAN0mLpfU,2322
19
28
  ragmint/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
29
  ragmint/utils/caching.py,sha256=LPE2JorOQ90BgVf6NUiS0-bdt-FGpNxDy7FnuwEHzy0,1060
21
- ragmint/utils/data_loader.py,sha256=Q3pBO77XZ1rl4fuMn3TK7x3mSM2eLdV_OJTyy_eL3Ys,988
30
+ ragmint/utils/data_loader.py,sha256=GXU9Nc3o0UWxtBeRwiskD1aCjSiNNuRoAokIUODn7q8,2024
22
31
  ragmint/utils/logger.py,sha256=X7hTNb3st3fUeQIzSghuoV5B8FWXzm_O3DRkSfJvhmI,1033
23
32
  ragmint/utils/metrics.py,sha256=DR8mrdumHtQerK0VrugwYKIG1oNptEcsFqodXq3i2kY,717
24
- ragmint-0.1.0.dist-info/licenses/LICENSE,sha256=ahkhYfFLI8tGrdxdO2_GaT6OJW2eNwyFT3kYi85QQhc,692
25
- ragmint-0.1.0.dist-info/METADATA,sha256=BgMj5BxH2C2_5GweYpClkopepUBCVen5tWAFcOby8o8,5643
26
- ragmint-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
27
- ragmint-0.1.0.dist-info/top_level.txt,sha256=K2ulzMHuvFm6xayvvJdGABeRJAvKDBn6M3EI-3SbYLw,8
28
- ragmint-0.1.0.dist-info/RECORD,,
33
+ ragmint-0.2.0.dist-info/licenses/LICENSE,sha256=ahkhYfFLI8tGrdxdO2_GaT6OJW2eNwyFT3kYi85QQhc,692
34
+ ragmint-0.2.0.dist-info/METADATA,sha256=uwavcr5XnbneN7d7kfKiBD-Uc5TIIZFThmVhNjGWb0o,7948
35
+ ragmint-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
+ ragmint-0.2.0.dist-info/top_level.txt,sha256=K2ulzMHuvFm6xayvvJdGABeRJAvKDBn6M3EI-3SbYLw,8
37
+ ragmint-0.2.0.dist-info/RECORD,,