ragmint 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragmint might be problematic.

Files changed (54)
  1. {ragmint-0.4.0/src/ragmint.egg-info → ragmint-0.4.2}/PKG-INFO +19 -10
  2. {ragmint-0.4.0 → ragmint-0.4.2}/README.md +18 -8
  3. {ragmint-0.4.0 → ragmint-0.4.2}/pyproject.toml +1 -4
  4. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/autotuner.py +33 -15
  5. ragmint-0.4.2/src/ragmint/explainer.py +88 -0
  6. ragmint-0.4.2/src/ragmint/leaderboard.py +51 -0
  7. ragmint-0.4.2/src/ragmint/tests/test_explainer.py +36 -0
  8. ragmint-0.4.2/src/ragmint/tests/test_leaderboard.py +92 -0
  9. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tuner.py +23 -2
  10. {ragmint-0.4.0 → ragmint-0.4.2/src/ragmint.egg-info}/PKG-INFO +19 -10
  11. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/SOURCES.txt +0 -1
  12. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/requires.txt +0 -1
  13. ragmint-0.4.0/src/ragmint/explainer.py +0 -63
  14. ragmint-0.4.0/src/ragmint/leaderboard.py +0 -45
  15. ragmint-0.4.0/src/ragmint/tests/test_explainer.py +0 -20
  16. ragmint-0.4.0/src/ragmint/tests/test_explainer_integration.py +0 -18
  17. ragmint-0.4.0/src/ragmint/tests/test_leaderboard.py +0 -39
  18. {ragmint-0.4.0 → ragmint-0.4.2}/LICENSE +0 -0
  19. {ragmint-0.4.0 → ragmint-0.4.2}/MANIFEST.in +0 -0
  20. {ragmint-0.4.0 → ragmint-0.4.2}/setup.cfg +0 -0
  21. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__init__.py +0 -0
  22. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__main__.py +0 -0
  23. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/__init__.py +0 -0
  24. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/chunking.py +0 -0
  25. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/embeddings.py +0 -0
  26. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/evaluation.py +0 -0
  27. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/pipeline.py +0 -0
  28. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/reranker.py +0 -0
  29. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/retriever.py +0 -0
  30. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/__init__.py +0 -0
  31. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/validation_qa.json +0 -0
  32. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/__init__.py +0 -0
  33. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/config_adapter.py +0 -0
  34. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/langchain_prebuilder.py +0 -0
  35. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/__init__.py +0 -0
  36. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/search.py +0 -0
  37. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/__init__.py +0 -0
  38. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/conftest.py +0 -0
  39. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_autotuner.py +0 -0
  40. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_config_adapter.py +0 -0
  41. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_embeddings.py +0 -0
  42. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -0
  43. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_langchain_prebuilder.py +0 -0
  44. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_pipeline.py +0 -0
  45. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_retriever.py +0 -0
  46. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_search.py +0 -0
  47. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_tuner.py +0 -0
  48. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/__init__.py +0 -0
  49. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/caching.py +0 -0
  50. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/data_loader.py +0 -0
  51. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/logger.py +0 -0
  52. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/metrics.py +0 -0
  53. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/dependency_links.txt +0 -0
  54. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragmint
- Version: 0.4.0
+ Version: 0.4.2
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
  License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
  Requires-Dist: openai>=1.0.0
  Requires-Dist: google-generativeai>=0.8.0
  Requires-Dist: anthropic>=0.25.0
- Requires-Dist: supabase>=2.4.0
  Requires-Dist: pytest
  Requires-Dist: langchain>=0.2.5
  Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -229,25 +229,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ragmint"
- version = "0.4.0"
+ version = "0.4.2"
  description = "A modular framework for evaluating and optimizing RAG pipelines."
  readme = "README.md"
  license = { text = "Apache License 2.0" }
@@ -40,9 +40,6 @@ dependencies = [
      "google-generativeai>=0.8.0",
      "anthropic>=0.25.0",

-     # Integration / storage
-     "supabase>=2.4.0",
-
      # Testing
      "pytest",

@@ -63,7 +63,8 @@ class AutoRAGTuner:
      def suggest_chunk_sizes(
          self,
          model_name: Optional[str] = None,
-         num_pairs: Optional[int] = None
+         num_pairs: Optional[int] = None,
+         step: int = 10
      ) -> List[Tuple[int, int]]:
          if num_pairs is None:
              raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
@@ -74,21 +75,27 @@ class AutoRAGTuner:

          model = SentenceTransformer(model_name)
          max_tokens = getattr(model, "max_seq_length", 256)
-
          approx_words = max(1, int(max_tokens * 0.75))
          avg_len = self.corpus_stats.get("avg_len", 400)

-         chunk_sizes = []
-         for _ in range(num_pairs):
-             max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
-             low = max(10, int(max_chunk * 0.5))
-             high = max(low, max_chunk)
-             chunk_size = random.randint(low, high)
-             overlap = random.randint(10, min(300, chunk_size // 2))
-             chunk_sizes.append((chunk_size, overlap))
+         max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+         # Safe chunk and overlap ranges
+         chunk_sizes = list(range(50, max_chunk + 1, step))
+         overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+         if not overlaps:
+             overlaps = [max(1, max_chunk // 4)]
+
+         candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+         # Randomly sample requested number of pairs
+         if num_pairs >= len(candidates):
+             sampled = candidates
+         else:
+             sampled = random.sample(candidates, num_pairs)

-         logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {chunk_sizes}")
-         return chunk_sizes
+         logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+         return sampled

      # -----------------------------
      # Recommendation Logic
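A minimal usage sketch of the revised suggest_chunk_sizes signature follows. It assumes an AutoRAGTuner constructed with docs_path as in the README example shown above; the embedding model name is only an example and is not prescribed by the package.

```python
from ragmint.autotuner import AutoRAGTuner

tuner = AutoRAGTuner(docs_path="data/docs/")

# Ask for 5 (chunk_size, overlap) candidates from a grid stepped in 10-word increments
pairs = tuner.suggest_chunk_sizes(
    model_name="all-MiniLM-L6-v2",  # example SentenceTransformer model, an assumption
    num_pairs=5,
    step=10,
)
print(pairs)  # e.g. [(250, 60), (120, 30), ...], sampled from the candidate grid
```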
@@ -130,7 +137,16 @@ class AutoRAGTuner:
          logging.warning(f"⚠️ Using default embedding model: {embedding_model}")

          # Suggest chunk sizes
-         chunk_candidates = self.suggest_chunk_sizes(embedding_model, num_pairs=num_chunk_pairs)
+         # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+         chunk_candidates = self.suggest_chunk_sizes(
+             model_name=embedding_model,
+             num_pairs=num_chunk_pairs
+         )
+
+         # Safety check
+         if not chunk_candidates:
+             raise RuntimeError("No chunk candidates generated.")
+
          # Pick the first pair as default recommendation
          chunk_size, overlap = chunk_candidates[0]

@@ -176,6 +192,8 @@ class AutoRAGTuner:
          """
          rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)

+         chunk_candidates = rec["chunk_candidates"]
+
          logging.info("🚀 Launching full AutoRAG optimization with RAGMint")

          tuner = RAGMint(
@@ -183,8 +201,8 @@ class AutoRAGTuner:
              retrievers=[rec["retriever"]],
              embeddings=[rec["embedding_model"]],
              rerankers=["mmr"],
-             chunk_sizes=[rec["chunk_size"]],
-             overlaps=[rec["overlap"]],
+             chunk_sizes=[c[0] for c in chunk_candidates],
+             overlaps=[c[1] for c in chunk_candidates],
              strategies=[rec["strategy"]],
          )

@@ -0,0 +1,88 @@
+ """
+ Interpretability Layer
+ ----------------------
+ Uses Gemini or Anthropic Claude to explain why a particular RAG configuration
+ performed best, considering both optimizer results and corpus characteristics.
+ """
+
+ import os
+ import json
+ from dotenv import load_dotenv
+
+ # Load .env if available
+ load_dotenv()
+
+ def explain_results(best_result: dict, all_results: list, corpus_stats: dict = None,
+                     model: str = "gemini-2.5-flash-lite") -> str:
+     """
+     Generate a detailed natural-language explanation for RAG optimization results.
+
+     Parameters:
+     - best_result: dict containing the best configuration and metrics.
+     - all_results: list of all trial results with metrics and configs.
+     - corpus_stats: optional dict with corpus info (size, avg_len, num_docs).
+     - model: LLM model name (Gemini or Claude).
+
+     Returns:
+     A natural-language explanation string.
+     """
+
+     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+     google_key = os.getenv("GOOGLE_API_KEY")
+
+     # Build dynamic context
+     corpus_info = json.dumps(corpus_stats or {}, indent=2)
+     best_json = json.dumps(best_result, indent=2)
+     all_json = json.dumps(list(all_results)[:10], indent=2)  # cap for safety
+
+     prompt = f"""
+     You are an expert AI researcher specializing in Retrieval-Augmented Generation (RAG) optimization.
+
+     A RAG auto-tuner was run on a corpus with these characteristics:
+     {corpus_info}
+
+     The tuner evaluated multiple configurations and metrics. Below are:
+     - The BEST configuration:
+     {best_json}
+
+     - A sample of ALL evaluated configurations:
+     {all_json}
+
+     Please:
+     1. Explain WHY this best configuration likely performs better than others.
+     2. Highlight trade-offs between accuracy, latency, and resource usage.
+     3. Suggest potential improvements (different chunking, embedding, retriever, etc.).
+     4. Provide a concise summary of which setup you recommend for this corpus.
+     Keep it structured, under 300 words, and easy to read.
+     """
+
+     # --- 1️⃣ Anthropic Claude first ---
+     if anthropic_key:
+         try:
+             from anthropic import Anthropic
+             client = Anthropic(api_key=anthropic_key)
+             response = client.messages.create(
+                 model="claude-3-opus-20240229",
+                 max_tokens=500,
+                 messages=[{"role": "user", "content": prompt}],
+             )
+             return response.content[0].text
+         except Exception as e:
+             return f"[Claude unavailable] {e}"
+
+     # --- 2️⃣ Gemini fallback ---
+     elif google_key:
+         try:
+             import google.generativeai as genai
+             genai.configure(api_key=google_key)
+             response = genai.GenerativeModel(model).generate_content(prompt)
+             return response.text
+         except Exception as e:
+             return f"[Gemini unavailable] {e}"
+
+     # --- 3️⃣ Fallback message ---
+     else:
+         return (
+             "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
+             "to enable interpretability via Claude or Gemini."
+         )
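A minimal sketch of how the reworked explain_results signature might be called. The result dictionaries and their keys below are invented for illustration; the function only serializes whatever it receives into the prompt.

```python
from ragmint.explainer import explain_results

# Invented tuner output, shaped like the dicts the optimizer produces
best = {"retriever": "FAISS", "embedding_model": "all-MiniLM-L6-v2",
        "chunk_size": 250, "overlap": 50, "faithfulness": 0.91}
all_results = [
    best,
    {"retriever": "BM25", "embedding_model": "all-MiniLM-L6-v2",
     "chunk_size": 400, "overlap": 80, "faithfulness": 0.84},
]
corpus_stats = {"size": 20000, "avg_len": 400, "num_docs": 10}

# Needs ANTHROPIC_API_KEY or GOOGLE_API_KEY; otherwise a fallback string is returned
print(explain_results(best, all_results, corpus_stats=corpus_stats))
```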
@@ -0,0 +1,51 @@
+ import os
+ import json
+ from datetime import datetime
+ from typing import Dict, Any, List, Optional
+
+
+ class Leaderboard:
+     def __init__(self, storage_path: Optional[str] = "leaderboard.jsonl"):
+         self.storage_path = storage_path
+         os.makedirs(os.path.dirname(self.storage_path) or ".", exist_ok=True)
+
+         if not os.path.exists(self.storage_path):
+             open(self.storage_path, "w", encoding="utf-8").close()
+
+     def upload(
+         self,
+         run_id: str,
+         best_config: Dict[str, Any],
+         best_score: float,
+         all_results: List[Dict[str, Any]],
+         documents: List[str],
+         model: str,
+         corpus_stats: Optional[Dict[str, Any]] = None,
+     ):
+         """Persist a full experiment run to local leaderboard."""
+         data = {
+             "run_id": run_id,
+             "timestamp": datetime.utcnow().isoformat(),
+             "best_config": best_config,
+             "best_score": best_score,
+             "all_results": all_results,
+             "documents": [os.path.basename(d) for d in documents],
+             "model": model,
+             "corpus_stats": corpus_stats or {},
+         }
+
+         with open(self.storage_path, "a", encoding="utf-8") as f:
+             f.write(json.dumps(data) + "\n")
+
+         return data
+
+     def all_results(self) -> List[Dict[str, Any]]:
+         if not os.path.exists(self.storage_path):
+             return []
+         with open(self.storage_path, "r", encoding="utf-8") as f:
+             return [json.loads(line) for line in f if line.strip()]
+
+     def top_results(self, limit: int = 10) -> List[Dict[str, Any]]:
+         """Return top experiments by score."""
+         results = self.all_results()
+         return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]
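A short usage sketch of the new file-backed Leaderboard; the run data below is made up for illustration.

```python
from ragmint.leaderboard import Leaderboard

lb = Leaderboard(storage_path="experiments/leaderboard.jsonl")

# Persist one invented run; every value here is illustrative
lb.upload(
    run_id="run_demo",
    best_config={"retriever": "FAISS", "embedding_model": "all-MiniLM-L6-v2"},
    best_score=0.91,
    all_results=[{"retriever": "FAISS", "faithfulness": 0.91}],
    documents=["docs/a.txt", "docs/b.txt"],
    model="all-MiniLM-L6-v2",
    corpus_stats={"num_docs": 2, "avg_len": 350},
)

# Entries come back sorted by best_score, highest first
for row in lb.top_results(limit=3):
    print(row["run_id"], row["best_score"])
```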
@@ -0,0 +1,36 @@
+ import pytest
+ import sys
+ import types
+ from ragmint.explainer import explain_results
+
+
+ def test_explain_results_with_claude(monkeypatch):
+     """Claude explanation should use Anthropic API path when ANTHROPIC_API_KEY is set."""
+     monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key")
+     monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
+
+     # Create a fake anthropic module with the required interface
+     mock_anthropic = types.ModuleType("anthropic")
+
+     class MockContent:
+         text = "Claude: The best configuration performs well due to optimized chunk size."
+
+     class MockMessages:
+         def create(self, *args, **kwargs):
+             return type("MockResponse", (), {"content": [MockContent()]})()
+
+     class MockClient:
+         def __init__(self, api_key):
+             self.messages = MockMessages()
+
+     mock_anthropic.Anthropic = MockClient
+     sys.modules["anthropic"] = mock_anthropic  # Inject fake module
+
+     best = {"retriever": "Chroma", "metric": 0.9}
+     all_results = [{"retriever": "FAISS", "metric": 0.85}]
+     corpus_stats = {"size": 10000, "avg_len": 400, "num_docs": 20}
+
+     result = explain_results(best, all_results, corpus_stats, model="claude-3-opus-20240229")
+
+     assert isinstance(result, str)
+     assert "Claude" in result or "claude" in result
@@ -0,0 +1,92 @@
+ import os
+ import json
+ import tempfile
+ import pytest
+ from datetime import datetime
+ from ragmint.leaderboard import Leaderboard
+
+
+ @pytest.fixture
+ def temp_leaderboard():
+     """Create a temporary leaderboard file for testing."""
+     with tempfile.TemporaryDirectory() as tmpdir:
+         path = os.path.join(tmpdir, "leaderboard.jsonl")
+         lb = Leaderboard(storage_path=path)
+         yield lb, path
+
+
+ def test_upload_and_persistence(temp_leaderboard):
+     lb, path = temp_leaderboard
+
+     # --- Mock experiment data ---
+     run_id = "run_001"
+     best_config = {"retriever": "FAISS", "embedding_model": "all-MiniLM"}
+     best_score = 0.92
+     all_results = [
+         {"retriever": "FAISS", "score": 0.92},
+         {"retriever": "BM25", "score": 0.85},
+     ]
+     documents = ["docs/a.txt", "docs/b.txt"]
+     model = "gemini"
+     corpus_stats = {"size": 20000, "avg_len": 400, "num_docs": 10}
+
+     # --- Upload ---
+     record = lb.upload(
+         run_id=run_id,
+         best_config=best_config,
+         best_score=best_score,
+         all_results=all_results,
+         documents=documents,
+         model=model,
+         corpus_stats=corpus_stats,
+     )
+
+     # --- Validate returned record ---
+     assert record["run_id"] == run_id
+     assert record["model"] == "gemini"
+     assert "timestamp" in record
+     assert record["best_score"] == 0.92
+     assert all(doc in record["documents"] for doc in ["a.txt", "b.txt"])
+
+     # --- File should contain JSON line ---
+     with open(path, "r", encoding="utf-8") as f:
+         lines = f.readlines()
+     assert len(lines) == 1
+     parsed = json.loads(lines[0])
+     assert parsed["run_id"] == run_id
+
+
+ def test_top_results_ordering(temp_leaderboard):
+     lb, _ = temp_leaderboard
+
+     # Upload multiple runs with varying scores
+     for i, score in enumerate([0.8, 0.95, 0.7]):
+         lb.upload(
+             run_id=f"run_{i}",
+             best_config={"retriever": "FAISS"},
+             best_score=score,
+             all_results=[],
+             documents=["file.txt"],
+             model="claude",
+         )
+
+     # --- Get top results ---
+     top = lb.top_results(limit=2)
+     assert len(top) == 2
+
+     # --- Ensure ordering descending by score ---
+     assert top[0]["best_score"] >= top[1]["best_score"]
+     assert top[0]["best_score"] == 0.95
+
+
+ def test_all_results_reads_all_entries(temp_leaderboard):
+     lb, _ = temp_leaderboard
+
+     # Add two runs
+     lb.upload("run_a", {}, 0.5, [], ["doc1.txt"], "gemini")
+     lb.upload("run_b", {}, 0.7, [], ["doc2.txt"], "claude")
+
+     results = lb.all_results()
+     assert len(results) == 2
+     run_ids = {r["run_id"] for r in results}
+     assert {"run_a", "run_b"} <= run_ids
@@ -1,5 +1,4 @@
  import os
- import json
  import logging
  from typing import Any, Dict, List, Tuple
  from time import perf_counter
@@ -11,6 +10,8 @@ from .core.reranker import Reranker
  from .core.evaluation import Evaluator
  from .optimization.search import GridSearch, RandomSearch, BayesianSearch
  from .utils.data_loader import load_validation_set
+ from .leaderboard import Leaderboard
+ from uuid import uuid4

  logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

@@ -151,7 +152,7 @@ class RAGMint:
          """Run optimization search over retrievers, embeddings, rerankers, and chunking."""
          validation = load_validation_set(validation_set or "default")

-         # ✅ Add chunking parameters to the search space
+         # search space
          search_space = {
              "retriever": self.retrievers,
              "embedding_model": self.embeddings,
@@ -186,4 +187,24 @@ class RAGMint:
          best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
          logging.info(f"🏆 Best configuration: {best}")

+         # Save to leaderboard
+         run_id = f"run_{uuid4().hex[:8]}"
+         leaderboard = Leaderboard()
+
+         corpus_stats = {
+             "num_docs": len(self.documents),
+             "avg_len": sum(len(d.split()) for d in self.documents) / max(1, len(self.documents)),
+             "corpus_size": sum(len(d) for d in self.documents),
+         }
+
+         leaderboard.upload(
+             run_id=run_id,
+             best_config=best,
+             best_score=best.get(metric, 0.0),
+             all_results=results,
+             documents=os.listdir(self.docs_path),
+             model=best.get("embedding_model", "unknown"),
+             corpus_stats=corpus_stats,
+         )
+
          return best, results
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragmint
- Version: 0.4.0
+ Version: 0.4.2
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
  License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
  Requires-Dist: openai>=1.0.0
  Requires-Dist: google-generativeai>=0.8.0
  Requires-Dist: anthropic>=0.25.0
- Requires-Dist: supabase>=2.4.0
  Requires-Dist: pytest
  Requires-Dist: langchain>=0.2.5
  Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -33,7 +33,6 @@ src/ragmint/tests/test_autotuner.py
  src/ragmint/tests/test_config_adapter.py
  src/ragmint/tests/test_embeddings.py
  src/ragmint/tests/test_explainer.py
- src/ragmint/tests/test_explainer_integration.py
  src/ragmint/tests/test_integration_autotuner_ragmint.py
  src/ragmint/tests/test_langchain_prebuilder.py
  src/ragmint/tests/test_leaderboard.py
@@ -12,7 +12,6 @@ python-dotenv
  openai>=1.0.0
  google-generativeai>=0.8.0
  anthropic>=0.25.0
- supabase>=2.4.0
  pytest
  langchain>=0.2.5
  langchain-community>=0.2.5
@@ -1,63 +0,0 @@
- """
- Interpretability Layer
- ----------------------
- Uses Gemini or Anthropic Claude to explain why one RAG configuration
- outperforms another. Falls back gracefully if no API key is provided.
- """
-
- import os
- import json
- from dotenv import load_dotenv
-
- # Load environment variables from .env file if available
- load_dotenv()
-
- def explain_results(results_a: dict, results_b: dict, model: str = "gemini-2.5-flash-lite") -> str:
-     """
-     Generate a natural-language explanation comparing two RAG experiment results.
-     Priority:
-     1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
-     2. Google Gemini (if GOOGLE_API_KEY is set)
-     3. Fallback text message
-     """
-     prompt = f"""
-     You are an AI evaluation expert.
-     Compare these two RAG experiment results and explain why one performs better.
-     Metrics A: {json.dumps(results_a, indent=2)}
-     Metrics B: {json.dumps(results_b, indent=2)}
-     Provide a concise, human-friendly explanation and practical improvement tips.
-     """
-
-     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-     google_key = os.getenv("GOOGLE_API_KEY")  # fixed var name
-
-     # 1️⃣ Try Anthropic Claude first
-     if anthropic_key:
-         try:
-             from anthropic import Anthropic
-             client = Anthropic(api_key=anthropic_key)
-             response = client.messages.create(
-                 model="claude-3-opus-20240229",
-                 max_tokens=300,
-                 messages=[{"role": "user", "content": prompt}],
-             )
-             return response.content[0].text
-         except Exception as e:
-             return f"[Claude unavailable] {e}"
-
-     # 2️⃣ Fallback to Google Gemini
-     elif google_key:
-         try:
-             import google.generativeai as genai
-             genai.configure(api_key=google_key)
-             response = genai.GenerativeModel(model).generate_content(prompt)
-             return response.text
-         except Exception as e:
-             return f"[Gemini unavailable] {e}"
-
-     # 3️⃣ Fallback if neither key is available
-     else:
-         return (
-             "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
-             "to enable interpretability via Claude or Gemini."
-         )
@@ -1,45 +0,0 @@
- import os
- import json
- from datetime import datetime
- from typing import Dict, Any, Optional
- from supabase import create_client
-
- class Leaderboard:
-     def __init__(self, storage_path: Optional[str] = None):
-         self.storage_path = storage_path
-         url = os.getenv("SUPABASE_URL")
-         key = os.getenv("SUPABASE_KEY")
-         self.client = None
-         if url and key:
-             self.client = create_client(url, key)
-         elif not storage_path:
-             raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")
-
-     def upload(self, run_id: str, config: Dict[str, Any], score: float):
-         data = {
-             "run_id": run_id,
-             "config": config,
-             "score": score,
-             "timestamp": datetime.utcnow().isoformat(),
-         }
-         if self.client:
-             return self.client.table("experiments").insert(data).execute()
-         else:
-             os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
-             with open(self.storage_path, "a", encoding="utf-8") as f:
-                 f.write(json.dumps(data) + "\n")
-             return data
-
-     def top_results(self, limit: int = 10):
-         if self.client:
-             return (
-                 self.client.table("experiments")
-                 .select("*")
-                 .order("score", desc=True)
-                 .limit(limit)
-                 .execute()
-             )
-         else:
-             with open(self.storage_path, "r", encoding="utf-8") as f:
-                 lines = [json.loads(line) for line in f]
-             return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
@@ -1,20 +0,0 @@
- import pytest
- from ragmint.explainer import explain_results
-
-
- def test_explain_results_gemini():
-     """Gemini explanation should contain model-specific phrasing."""
-     config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-     config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-     result = explain_results(config_a, config_b, model="gemini")
-     assert isinstance(result, str)
-     assert "Gemini" in result or "gemini" in result
-
-
- def test_explain_results_claude():
-     """Claude explanation should contain model-specific phrasing."""
-     config_a = {"retriever": "FAISS"}
-     config_b = {"retriever": "Chroma"}
-     result = explain_results(config_a, config_b, model="claude")
-     assert isinstance(result, str)
-     assert "Claude" in result or "claude" in result
@@ -1,18 +0,0 @@
- import os
- import pytest
- from ragmint.explainer import explain_results
-
-
- @pytest.mark.integration
- def test_real_gemini_explanation():
-     """Run real Gemini call if GOOGLE_API_KEY is set."""
-     if not os.getenv("GEMINI_API_KEY"):
-         pytest.skip("GEMINI_API_KEY not set")
-
-     config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-     config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-
-     result = explain_results(config_a, config_b, model="gemini-1.5-pro")
-     assert isinstance(result, str)
-     assert len(result) > 0
-     print("\n[Gemini explanation]:", result[:200], "...")
@@ -1,39 +0,0 @@
- import json
- import tempfile
- from pathlib import Path
- from ragmint.leaderboard import Leaderboard
-
-
- def test_leaderboard_add_and_top(tmp_path):
-     """Ensure local leaderboard persistence works without Supabase."""
-     file_path = tmp_path / "leaderboard.jsonl"
-     lb = Leaderboard(storage_path=str(file_path))
-
-     # Add two runs
-     lb.upload("run1", {"retriever": "FAISS"}, 0.91)
-     lb.upload("run2", {"retriever": "Chroma"}, 0.85)
-
-     # Verify file content
-     assert file_path.exists()
-     with open(file_path, "r", encoding="utf-8") as f:
-         lines = [json.loads(line) for line in f]
-     assert len(lines) == 2
-
-     # Get top results
-     top = lb.top_results(limit=1)
-     assert isinstance(top, list)
-     assert len(top) == 1
-     assert "score" in top[0]
-
-
- def test_leaderboard_append_existing(tmp_path):
-     """Ensure multiple uploads append properly."""
-     file_path = tmp_path / "leaderboard.jsonl"
-     lb = Leaderboard(storage_path=str(file_path))
-
-     for i in range(3):
-         lb.upload(f"run{i}", {"retriever": "BM25"}, 0.8 + i * 0.05)
-
-     top = lb.top_results(limit=2)
-     assert len(top) == 2
-     assert top[0]["score"] >= top[1]["score"]