ragmint 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragmint might be problematic.

Files changed (54)
  1. {ragmint-0.4.0/src/ragmint.egg-info → ragmint-0.4.2}/PKG-INFO +19 -10
  2. {ragmint-0.4.0 → ragmint-0.4.2}/README.md +18 -8
  3. {ragmint-0.4.0 → ragmint-0.4.2}/pyproject.toml +1 -4
  4. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/autotuner.py +33 -15
  5. ragmint-0.4.2/src/ragmint/explainer.py +88 -0
  6. ragmint-0.4.2/src/ragmint/leaderboard.py +51 -0
  7. ragmint-0.4.2/src/ragmint/tests/test_explainer.py +36 -0
  8. ragmint-0.4.2/src/ragmint/tests/test_leaderboard.py +92 -0
  9. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tuner.py +23 -2
  10. {ragmint-0.4.0 → ragmint-0.4.2/src/ragmint.egg-info}/PKG-INFO +19 -10
  11. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/SOURCES.txt +0 -1
  12. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/requires.txt +0 -1
  13. ragmint-0.4.0/src/ragmint/explainer.py +0 -63
  14. ragmint-0.4.0/src/ragmint/leaderboard.py +0 -45
  15. ragmint-0.4.0/src/ragmint/tests/test_explainer.py +0 -20
  16. ragmint-0.4.0/src/ragmint/tests/test_explainer_integration.py +0 -18
  17. ragmint-0.4.0/src/ragmint/tests/test_leaderboard.py +0 -39
  18. {ragmint-0.4.0 → ragmint-0.4.2}/LICENSE +0 -0
  19. {ragmint-0.4.0 → ragmint-0.4.2}/MANIFEST.in +0 -0
  20. {ragmint-0.4.0 → ragmint-0.4.2}/setup.cfg +0 -0
  21. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__init__.py +0 -0
  22. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/__main__.py +0 -0
  23. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/__init__.py +0 -0
  24. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/chunking.py +0 -0
  25. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/embeddings.py +0 -0
  26. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/evaluation.py +0 -0
  27. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/pipeline.py +0 -0
  28. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/reranker.py +0 -0
  29. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/core/retriever.py +0 -0
  30. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/__init__.py +0 -0
  31. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/experiments/validation_qa.json +0 -0
  32. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/__init__.py +0 -0
  33. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/config_adapter.py +0 -0
  34. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/integrations/langchain_prebuilder.py +0 -0
  35. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/__init__.py +0 -0
  36. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/optimization/search.py +0 -0
  37. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/__init__.py +0 -0
  38. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/conftest.py +0 -0
  39. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_autotuner.py +0 -0
  40. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_config_adapter.py +0 -0
  41. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_embeddings.py +0 -0
  42. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_integration_autotuner_ragmint.py +0 -0
  43. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_langchain_prebuilder.py +0 -0
  44. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_pipeline.py +0 -0
  45. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_retriever.py +0 -0
  46. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_search.py +0 -0
  47. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/tests/test_tuner.py +0 -0
  48. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/__init__.py +0 -0
  49. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/caching.py +0 -0
  50. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/data_loader.py +0 -0
  51. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/logger.py +0 -0
  52. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint/utils/metrics.py +0 -0
  53. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/dependency_links.txt +0 -0
  54. {ragmint-0.4.0 → ragmint-0.4.2}/src/ragmint.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragmint
- Version: 0.4.0
+ Version: 0.4.2
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
  License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
  Requires-Dist: openai>=1.0.0
  Requires-Dist: google-generativeai>=0.8.0
  Requires-Dist: anthropic>=0.25.0
- Requires-Dist: supabase>=2.4.0
  Requires-Dist: pytest
  Requires-Dist: langchain>=0.2.5
  Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -229,25 +229,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ragmint"
- version = "0.4.0"
+ version = "0.4.2"
  description = "A modular framework for evaluating and optimizing RAG pipelines."
  readme = "README.md"
  license = { text = "Apache License 2.0" }
@@ -40,9 +40,6 @@ dependencies = [
      "google-generativeai>=0.8.0",
      "anthropic>=0.25.0",

-     # Integration / storage
-     "supabase>=2.4.0",
-
      # Testing
      "pytest",

@@ -63,7 +63,8 @@ class AutoRAGTuner:
      def suggest_chunk_sizes(
          self,
          model_name: Optional[str] = None,
-         num_pairs: Optional[int] = None
+         num_pairs: Optional[int] = None,
+         step: int = 10
      ) -> List[Tuple[int, int]]:
          if num_pairs is None:
              raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
@@ -74,21 +75,27 @@ class AutoRAGTuner:

          model = SentenceTransformer(model_name)
          max_tokens = getattr(model, "max_seq_length", 256)
-
          approx_words = max(1, int(max_tokens * 0.75))
          avg_len = self.corpus_stats.get("avg_len", 400)

-         chunk_sizes = []
-         for _ in range(num_pairs):
-             max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
-             low = max(10, int(max_chunk * 0.5))
-             high = max(low, max_chunk)
-             chunk_size = random.randint(low, high)
-             overlap = random.randint(10, min(300, chunk_size // 2))
-             chunk_sizes.append((chunk_size, overlap))
+         max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+         # Safe chunk and overlap ranges
+         chunk_sizes = list(range(50, max_chunk + 1, step))
+         overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+         if not overlaps:
+             overlaps = [max(1, max_chunk // 4)]
+
+         candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+         # Randomly sample requested number of pairs
+         if num_pairs >= len(candidates):
+             sampled = candidates
+         else:
+             sampled = random.sample(candidates, num_pairs)

-         logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {chunk_sizes}")
-         return chunk_sizes
+         logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+         return sampled

      # -----------------------------
      # Recommendation Logic
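A minimal usage sketch of the revised suggest_chunk_sizes signature follows. It assumes an AutoRAGTuner constructed with docs_path as in the README example shown above; the embedding model name is only an example and is not prescribed by the package.

```python
from ragmint.autotuner import AutoRAGTuner

tuner = AutoRAGTuner(docs_path="data/docs/")

# Ask for 5 (chunk_size, overlap) candidates from a grid stepped in 10-word increments
pairs = tuner.suggest_chunk_sizes(
    model_name="all-MiniLM-L6-v2",  # example SentenceTransformer model, an assumption
    num_pairs=5,
    step=10,
)
print(pairs)  # e.g. [(250, 60), (120, 30), ...], sampled from the candidate grid
```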
@@ -130,7 +137,16 @@ class AutoRAGTuner:
          logging.warning(f"⚠️ Using default embedding model: {embedding_model}")

          # Suggest chunk sizes
-         chunk_candidates = self.suggest_chunk_sizes(embedding_model, num_pairs=num_chunk_pairs)
+         # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+         chunk_candidates = self.suggest_chunk_sizes(
+             model_name=embedding_model,
+             num_pairs=num_chunk_pairs
+         )
+
+         # Safety check
+         if not chunk_candidates:
+             raise RuntimeError("No chunk candidates generated.")
+
          # Pick the first pair as default recommendation
          chunk_size, overlap = chunk_candidates[0]

@@ -176,6 +192,8 @@ class AutoRAGTuner:
          """
          rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)

+         chunk_candidates = rec["chunk_candidates"]
+
          logging.info("🚀 Launching full AutoRAG optimization with RAGMint")

          tuner = RAGMint(
@@ -183,8 +201,8 @@ class AutoRAGTuner:
              retrievers=[rec["retriever"]],
              embeddings=[rec["embedding_model"]],
              rerankers=["mmr"],
-             chunk_sizes=[rec["chunk_size"]],
-             overlaps=[rec["overlap"]],
+             chunk_sizes=[c[0] for c in chunk_candidates],
+             overlaps=[c[1] for c in chunk_candidates],
              strategies=[rec["strategy"]],
          )

@@ -0,0 +1,88 @@
+ """
+ Interpretability Layer
+ ----------------------
+ Uses Gemini or Anthropic Claude to explain why a particular RAG configuration
+ performed best, considering both optimizer results and corpus characteristics.
+ """
+
+ import os
+ import json
+ from dotenv import load_dotenv
+
+ # Load .env if available
+ load_dotenv()
+
+ def explain_results(best_result: dict, all_results: list, corpus_stats: dict = None,
+                     model: str = "gemini-2.5-flash-lite") -> str:
+     """
+     Generate a detailed natural-language explanation for RAG optimization results.
+
+     Parameters:
+     - best_result: dict containing the best configuration and metrics.
+     - all_results: list of all trial results with metrics and configs.
+     - corpus_stats: optional dict with corpus info (size, avg_len, num_docs).
+     - model: LLM model name (Gemini or Claude).
+
+     Returns:
+     A natural-language explanation string.
+     """
+
+     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+     google_key = os.getenv("GOOGLE_API_KEY")
+
+     # Build dynamic context
+     corpus_info = json.dumps(corpus_stats or {}, indent=2)
+     best_json = json.dumps(best_result, indent=2)
+     all_json = json.dumps(list(all_results)[:10], indent=2)  # cap for safety
+
+     prompt = f"""
+     You are an expert AI researcher specializing in Retrieval-Augmented Generation (RAG) optimization.
+
+     A RAG auto-tuner was run on a corpus with these characteristics:
+     {corpus_info}
+
+     The tuner evaluated multiple configurations and metrics. Below are:
+     - The BEST configuration:
+     {best_json}
+
+     - A sample of ALL evaluated configurations:
+     {all_json}
+
+     Please:
+     1. Explain WHY this best configuration likely performs better than others.
+     2. Highlight trade-offs between accuracy, latency, and resource usage.
+     3. Suggest potential improvements (different chunking, embedding, retriever, etc.).
+     4. Provide a concise summary of which setup you recommend for this corpus.
+     Keep it structured, under 300 words, and easy to read.
+     """
+
+     # --- 1️⃣ Anthropic Claude first ---
+     if anthropic_key:
+         try:
+             from anthropic import Anthropic
+             client = Anthropic(api_key=anthropic_key)
+             response = client.messages.create(
+                 model="claude-3-opus-20240229",
+                 max_tokens=500,
+                 messages=[{"role": "user", "content": prompt}],
+             )
+             return response.content[0].text
+         except Exception as e:
+             return f"[Claude unavailable] {e}"
+
+     # --- 2️⃣ Gemini fallback ---
+     elif google_key:
+         try:
+             import google.generativeai as genai
+             genai.configure(api_key=google_key)
+             response = genai.GenerativeModel(model).generate_content(prompt)
+             return response.text
+         except Exception as e:
+             return f"[Gemini unavailable] {e}"
+
+     # --- 3️⃣ Fallback message ---
+     else:
+         return (
+             "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
+             "to enable interpretability via Claude or Gemini."
+         )
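A minimal sketch of how the reworked explain_results signature might be called. The result dictionaries and their keys below are invented for illustration; the function only serializes whatever it receives into the prompt.

```python
from ragmint.explainer import explain_results

# Invented tuner output, shaped like the dicts the optimizer produces
best = {"retriever": "FAISS", "embedding_model": "all-MiniLM-L6-v2",
        "chunk_size": 250, "overlap": 50, "faithfulness": 0.91}
all_results = [
    best,
    {"retriever": "BM25", "embedding_model": "all-MiniLM-L6-v2",
     "chunk_size": 400, "overlap": 80, "faithfulness": 0.84},
]
corpus_stats = {"size": 20000, "avg_len": 400, "num_docs": 10}

# Needs ANTHROPIC_API_KEY or GOOGLE_API_KEY; otherwise a fallback string is returned
print(explain_results(best, all_results, corpus_stats=corpus_stats))
```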
@@ -0,0 +1,51 @@
+ import os
+ import json
+ from datetime import datetime
+ from typing import Dict, Any, List, Optional
+
+
+ class Leaderboard:
+     def __init__(self, storage_path: Optional[str] = "leaderboard.jsonl"):
+         self.storage_path = storage_path
+         os.makedirs(os.path.dirname(self.storage_path) or ".", exist_ok=True)
+
+         if not os.path.exists(self.storage_path):
+             open(self.storage_path, "w", encoding="utf-8").close()
+
+     def upload(
+         self,
+         run_id: str,
+         best_config: Dict[str, Any],
+         best_score: float,
+         all_results: List[Dict[str, Any]],
+         documents: List[str],
+         model: str,
+         corpus_stats: Optional[Dict[str, Any]] = None,
+     ):
+         """Persist a full experiment run to local leaderboard."""
+         data = {
+             "run_id": run_id,
+             "timestamp": datetime.utcnow().isoformat(),
+             "best_config": best_config,
+             "best_score": best_score,
+             "all_results": all_results,
+             "documents": [os.path.basename(d) for d in documents],
+             "model": model,
+             "corpus_stats": corpus_stats or {},
+         }
+
+         with open(self.storage_path, "a", encoding="utf-8") as f:
+             f.write(json.dumps(data) + "\n")
+
+         return data
+
+     def all_results(self) -> List[Dict[str, Any]]:
+         if not os.path.exists(self.storage_path):
+             return []
+         with open(self.storage_path, "r", encoding="utf-8") as f:
+             return [json.loads(line) for line in f if line.strip()]
+
+     def top_results(self, limit: int = 10) -> List[Dict[str, Any]]:
+         """Return top experiments by score."""
+         results = self.all_results()
+         return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]
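A short usage sketch of the new file-backed Leaderboard; the run data below is made up for illustration.

```python
from ragmint.leaderboard import Leaderboard

lb = Leaderboard(storage_path="experiments/leaderboard.jsonl")

# Persist one invented run; every value here is illustrative
lb.upload(
    run_id="run_demo",
    best_config={"retriever": "FAISS", "embedding_model": "all-MiniLM-L6-v2"},
    best_score=0.91,
    all_results=[{"retriever": "FAISS", "faithfulness": 0.91}],
    documents=["docs/a.txt", "docs/b.txt"],
    model="all-MiniLM-L6-v2",
    corpus_stats={"num_docs": 2, "avg_len": 350},
)

# Entries come back sorted by best_score, highest first
for row in lb.top_results(limit=3):
    print(row["run_id"], row["best_score"])
```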
@@ -0,0 +1,36 @@
+ import pytest
+ import sys
+ import types
+ from ragmint.explainer import explain_results
+
+
+ def test_explain_results_with_claude(monkeypatch):
+     """Claude explanation should use Anthropic API path when ANTHROPIC_API_KEY is set."""
+     monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key")
+     monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
+
+     # Create a fake anthropic module with the required interface
+     mock_anthropic = types.ModuleType("anthropic")
+
+     class MockContent:
+         text = "Claude: The best configuration performs well due to optimized chunk size."
+
+     class MockMessages:
+         def create(self, *args, **kwargs):
+             return type("MockResponse", (), {"content": [MockContent()]})()
+
+     class MockClient:
+         def __init__(self, api_key):
+             self.messages = MockMessages()
+
+     mock_anthropic.Anthropic = MockClient
+     sys.modules["anthropic"] = mock_anthropic  # Inject fake module
+
+     best = {"retriever": "Chroma", "metric": 0.9}
+     all_results = [{"retriever": "FAISS", "metric": 0.85}]
+     corpus_stats = {"size": 10000, "avg_len": 400, "num_docs": 20}
+
+     result = explain_results(best, all_results, corpus_stats, model="claude-3-opus-20240229")
+
+     assert isinstance(result, str)
+     assert "Claude" in result or "claude" in result
@@ -0,0 +1,92 @@
+ import os
+ import json
+ import tempfile
+ import pytest
+ from datetime import datetime
+ from ragmint.leaderboard import Leaderboard
+
+
+ @pytest.fixture
+ def temp_leaderboard():
+     """Create a temporary leaderboard file for testing."""
+     with tempfile.TemporaryDirectory() as tmpdir:
+         path = os.path.join(tmpdir, "leaderboard.jsonl")
+         lb = Leaderboard(storage_path=path)
+         yield lb, path
+
+
+ def test_upload_and_persistence(temp_leaderboard):
+     lb, path = temp_leaderboard
+
+     # --- Mock experiment data ---
+     run_id = "run_001"
+     best_config = {"retriever": "FAISS", "embedding_model": "all-MiniLM"}
+     best_score = 0.92
+     all_results = [
+         {"retriever": "FAISS", "score": 0.92},
+         {"retriever": "BM25", "score": 0.85},
+     ]
+     documents = ["docs/a.txt", "docs/b.txt"]
+     model = "gemini"
+     corpus_stats = {"size": 20000, "avg_len": 400, "num_docs": 10}
+
+     # --- Upload ---
+     record = lb.upload(
+         run_id=run_id,
+         best_config=best_config,
+         best_score=best_score,
+         all_results=all_results,
+         documents=documents,
+         model=model,
+         corpus_stats=corpus_stats,
+     )
+
+     # --- Validate returned record ---
+     assert record["run_id"] == run_id
+     assert record["model"] == "gemini"
+     assert "timestamp" in record
+     assert record["best_score"] == 0.92
+     assert all(doc in record["documents"] for doc in ["a.txt", "b.txt"])
+
+     # --- File should contain JSON line ---
+     with open(path, "r", encoding="utf-8") as f:
+         lines = f.readlines()
+     assert len(lines) == 1
+     parsed = json.loads(lines[0])
+     assert parsed["run_id"] == run_id
+
+
+ def test_top_results_ordering(temp_leaderboard):
+     lb, _ = temp_leaderboard
+
+     # Upload multiple runs with varying scores
+     for i, score in enumerate([0.8, 0.95, 0.7]):
+         lb.upload(
+             run_id=f"run_{i}",
+             best_config={"retriever": "FAISS"},
+             best_score=score,
+             all_results=[],
+             documents=["file.txt"],
+             model="claude",
+         )
+
+     # --- Get top results ---
+     top = lb.top_results(limit=2)
+     assert len(top) == 2
+
+     # --- Ensure ordering descending by score ---
+     assert top[0]["best_score"] >= top[1]["best_score"]
+     assert top[0]["best_score"] == 0.95
+
+
+ def test_all_results_reads_all_entries(temp_leaderboard):
+     lb, _ = temp_leaderboard
+
+     # Add two runs
+     lb.upload("run_a", {}, 0.5, [], ["doc1.txt"], "gemini")
+     lb.upload("run_b", {}, 0.7, [], ["doc2.txt"], "claude")
+
+     results = lb.all_results()
+     assert len(results) == 2
+     run_ids = {r["run_id"] for r in results}
+     assert {"run_a", "run_b"} <= run_ids
@@ -1,5 +1,4 @@
  import os
- import json
  import logging
  from typing import Any, Dict, List, Tuple
  from time import perf_counter
@@ -11,6 +10,8 @@ from .core.reranker import Reranker
  from .core.evaluation import Evaluator
  from .optimization.search import GridSearch, RandomSearch, BayesianSearch
  from .utils.data_loader import load_validation_set
+ from .leaderboard import Leaderboard
+ from uuid import uuid4

  logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

@@ -151,7 +152,7 @@ class RAGMint:
          """Run optimization search over retrievers, embeddings, rerankers, and chunking."""
          validation = load_validation_set(validation_set or "default")

-         # ✅ Add chunking parameters to the search space
+         # search space
          search_space = {
              "retriever": self.retrievers,
              "embedding_model": self.embeddings,
@@ -186,4 +187,24 @@ class RAGMint:
          best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
          logging.info(f"🏆 Best configuration: {best}")

+         # Save to leaderboard
+         run_id = f"run_{uuid4().hex[:8]}"
+         leaderboard = Leaderboard()
+
+         corpus_stats = {
+             "num_docs": len(self.documents),
+             "avg_len": sum(len(d.split()) for d in self.documents) / max(1, len(self.documents)),
+             "corpus_size": sum(len(d) for d in self.documents),
+         }
+
+         leaderboard.upload(
+             run_id=run_id,
+             best_config=best,
+             best_score=best.get(metric, 0.0),
+             all_results=results,
+             documents=os.listdir(self.docs_path),
+             model=best.get("embedding_model", "unknown"),
+             corpus_stats=corpus_stats,
+         )
+
          return best, results
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragmint
- Version: 0.4.0
+ Version: 0.4.2
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
  License: Apache License 2.0
@@ -27,7 +27,6 @@ Requires-Dist: python-dotenv
  Requires-Dist: openai>=1.0.0
  Requires-Dist: google-generativeai>=0.8.0
  Requires-Dist: anthropic>=0.25.0
- Requires-Dist: supabase>=2.4.0
  Requires-Dist: pytest
  Requires-Dist: langchain>=0.2.5
  Requires-Dist: langchain-community>=0.2.5
@@ -273,25 +272,35 @@ Track and visualize your best experiments across runs.
  ```python
  from ragmint.leaderboard import Leaderboard

- lb = Leaderboard("experiments/leaderboard.json")
- lb.add_entry({"trial": 1, "faithfulness": 0.87, "latency": 0.12})
- lb.show_top(3)
+ # Initialize local leaderboard
+ leaderboard = Leaderboard(storage_path="leaderboard.jsonl")
+
+ # Retrieve top 5 runs
+ print("\n🏅 Top 5 Experiments:")
+ for result in leaderboard.top_results(limit=5):
+     print(f"{result['run_id']} | Score: {result['best_score']:.2f} | Model: {result['model']}")
  ```

  ---

  ## 🧠 Explainability with Gemini / Claude

- Compare two RAG configurations and receive **natural language insights** on why one performs better.
+ Compare RAG configurations and receive **natural language insights** on why one performs better.

  ```python
+ from ragmint.autotuner import AutoRAGTuner
  from ragmint.explainer import explain_results

- config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
- config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+ tuner = AutoRAGTuner(docs_path="data/docs/")
+ best, results = tuner.auto_tune(
+     validation_set='data/docs/validation_qa.json',
+     metric="faithfulness",
+     trials=5,
+     search_type='bayesian'
+ )

- explanation = explain_results(config_a, config_b, model="gemini")
- print(explanation)
+ analysis = explain_results(best, results, corpus_stats=tuner.corpus_stats)
+ print(analysis)
  ```

  > Set your API keys in a `.env` file or via environment variables:
@@ -33,7 +33,6 @@ src/ragmint/tests/test_autotuner.py
  src/ragmint/tests/test_config_adapter.py
  src/ragmint/tests/test_embeddings.py
  src/ragmint/tests/test_explainer.py
- src/ragmint/tests/test_explainer_integration.py
  src/ragmint/tests/test_integration_autotuner_ragmint.py
  src/ragmint/tests/test_langchain_prebuilder.py
  src/ragmint/tests/test_leaderboard.py
@@ -12,7 +12,6 @@ python-dotenv
  openai>=1.0.0
  google-generativeai>=0.8.0
  anthropic>=0.25.0
- supabase>=2.4.0
  pytest
  langchain>=0.2.5
  langchain-community>=0.2.5
@@ -1,63 +0,0 @@
- """
- Interpretability Layer
- ----------------------
- Uses Gemini or Anthropic Claude to explain why one RAG configuration
- outperforms another. Falls back gracefully if no API key is provided.
- """
-
- import os
- import json
- from dotenv import load_dotenv
-
- # Load environment variables from .env file if available
- load_dotenv()
-
- def explain_results(results_a: dict, results_b: dict, model: str = "gemini-2.5-flash-lite") -> str:
-     """
-     Generate a natural-language explanation comparing two RAG experiment results.
-     Priority:
-     1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
-     2. Google Gemini (if GOOGLE_API_KEY is set)
-     3. Fallback text message
-     """
-     prompt = f"""
-     You are an AI evaluation expert.
-     Compare these two RAG experiment results and explain why one performs better.
-     Metrics A: {json.dumps(results_a, indent=2)}
-     Metrics B: {json.dumps(results_b, indent=2)}
-     Provide a concise, human-friendly explanation and practical improvement tips.
-     """
-
-     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-     google_key = os.getenv("GOOGLE_API_KEY")  # fixed var name
-
-     # 1️⃣ Try Anthropic Claude first
-     if anthropic_key:
-         try:
-             from anthropic import Anthropic
-             client = Anthropic(api_key=anthropic_key)
-             response = client.messages.create(
-                 model="claude-3-opus-20240229",
-                 max_tokens=300,
-                 messages=[{"role": "user", "content": prompt}],
-             )
-             return response.content[0].text
-         except Exception as e:
-             return f"[Claude unavailable] {e}"
-
-     # 2️⃣ Fallback to Google Gemini
-     elif google_key:
-         try:
-             import google.generativeai as genai
-             genai.configure(api_key=google_key)
-             response = genai.GenerativeModel(model).generate_content(prompt)
-             return response.text
-         except Exception as e:
-             return f"[Gemini unavailable] {e}"
-
-     # 3️⃣ Fallback if neither key is available
-     else:
-         return (
-             "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
-             "to enable interpretability via Claude or Gemini."
-         )
@@ -1,45 +0,0 @@
- import os
- import json
- from datetime import datetime
- from typing import Dict, Any, Optional
- from supabase import create_client
-
- class Leaderboard:
-     def __init__(self, storage_path: Optional[str] = None):
-         self.storage_path = storage_path
-         url = os.getenv("SUPABASE_URL")
-         key = os.getenv("SUPABASE_KEY")
-         self.client = None
-         if url and key:
-             self.client = create_client(url, key)
-         elif not storage_path:
-             raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")
-
-     def upload(self, run_id: str, config: Dict[str, Any], score: float):
-         data = {
-             "run_id": run_id,
-             "config": config,
-             "score": score,
-             "timestamp": datetime.utcnow().isoformat(),
-         }
-         if self.client:
-             return self.client.table("experiments").insert(data).execute()
-         else:
-             os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
-             with open(self.storage_path, "a", encoding="utf-8") as f:
-                 f.write(json.dumps(data) + "\n")
-             return data
-
-     def top_results(self, limit: int = 10):
-         if self.client:
-             return (
-                 self.client.table("experiments")
-                 .select("*")
-                 .order("score", desc=True)
-                 .limit(limit)
-                 .execute()
-             )
-         else:
-             with open(self.storage_path, "r", encoding="utf-8") as f:
-                 lines = [json.loads(line) for line in f]
-             return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
@@ -1,20 +0,0 @@
- import pytest
- from ragmint.explainer import explain_results
-
-
- def test_explain_results_gemini():
-     """Gemini explanation should contain model-specific phrasing."""
-     config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-     config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-     result = explain_results(config_a, config_b, model="gemini")
-     assert isinstance(result, str)
-     assert "Gemini" in result or "gemini" in result
-
-
- def test_explain_results_claude():
-     """Claude explanation should contain model-specific phrasing."""
-     config_a = {"retriever": "FAISS"}
-     config_b = {"retriever": "Chroma"}
-     result = explain_results(config_a, config_b, model="claude")
-     assert isinstance(result, str)
-     assert "Claude" in result or "claude" in result
@@ -1,18 +0,0 @@
- import os
- import pytest
- from ragmint.explainer import explain_results
-
-
- @pytest.mark.integration
- def test_real_gemini_explanation():
-     """Run real Gemini call if GOOGLE_API_KEY is set."""
-     if not os.getenv("GEMINI_API_KEY"):
-         pytest.skip("GEMINI_API_KEY not set")
-
-     config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
-     config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
-
-     result = explain_results(config_a, config_b, model="gemini-1.5-pro")
-     assert isinstance(result, str)
-     assert len(result) > 0
-     print("\n[Gemini explanation]:", result[:200], "...")
@@ -1,39 +0,0 @@
- import json
- import tempfile
- from pathlib import Path
- from ragmint.leaderboard import Leaderboard
-
-
- def test_leaderboard_add_and_top(tmp_path):
-     """Ensure local leaderboard persistence works without Supabase."""
-     file_path = tmp_path / "leaderboard.jsonl"
-     lb = Leaderboard(storage_path=str(file_path))
-
-     # Add two runs
-     lb.upload("run1", {"retriever": "FAISS"}, 0.91)
-     lb.upload("run2", {"retriever": "Chroma"}, 0.85)
-
-     # Verify file content
-     assert file_path.exists()
-     with open(file_path, "r", encoding="utf-8") as f:
-         lines = [json.loads(line) for line in f]
-     assert len(lines) == 2
-
-     # Get top results
-     top = lb.top_results(limit=1)
-     assert isinstance(top, list)
-     assert len(top) == 1
-     assert "score" in top[0]
-
-
- def test_leaderboard_append_existing(tmp_path):
-     """Ensure multiple uploads append properly."""
-     file_path = tmp_path / "leaderboard.jsonl"
-     lb = Leaderboard(storage_path=str(file_path))
-
-     for i in range(3):
-         lb.upload(f"run{i}", {"retriever": "BM25"}, 0.8 + i * 0.05)
-
-     top = lb.top_results(limit=2)
-     assert len(top) == 2
-     assert top[0]["score"] >= top[1]["score"]