ragmint 0.2.1__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ragmint/app.py +512 -0
  2. ragmint/autotuner.py +201 -17
  3. ragmint/core/chunking.py +68 -4
  4. ragmint/core/embeddings.py +46 -10
  5. ragmint/core/evaluation.py +33 -14
  6. ragmint/core/pipeline.py +34 -10
  7. ragmint/core/retriever.py +152 -20
  8. ragmint/experiments/validation_qa.json +1 -14
  9. ragmint/explainer.py +47 -20
  10. ragmint/integrations/__init__.py +0 -0
  11. ragmint/integrations/config_adapter.py +96 -0
  12. ragmint/integrations/langchain_prebuilder.py +99 -0
  13. ragmint/leaderboard.py +41 -35
  14. ragmint/qa_generator.py +190 -0
  15. ragmint/tests/test_autotuner.py +52 -30
  16. ragmint/tests/test_config_adapter.py +39 -0
  17. ragmint/tests/test_embeddings.py +46 -0
  18. ragmint/tests/test_explainer.py +28 -12
  19. ragmint/tests/test_integration_autotuner_ragmint.py +39 -52
  20. ragmint/tests/test_langchain_prebuilder.py +82 -0
  21. ragmint/tests/test_leaderboard.py +78 -25
  22. ragmint/tests/test_pipeline.py +3 -2
  23. ragmint/tests/test_qa_generator.py +66 -0
  24. ragmint/tests/test_retriever.py +3 -2
  25. ragmint/tests/test_tuner.py +1 -1
  26. ragmint/tuner.py +109 -22
  27. ragmint-0.4.6.data/data/README.md +485 -0
  28. ragmint-0.4.6.dist-info/METADATA +530 -0
  29. ragmint-0.4.6.dist-info/RECORD +48 -0
  30. ragmint-0.4.6.dist-info/licenses/LICENSE +19 -0
  31. ragmint/tests/test_explainer_integration.py +0 -18
  32. ragmint-0.2.1.dist-info/METADATA +0 -27
  33. ragmint-0.2.1.dist-info/RECORD +0 -38
  34. {ragmint-0.2.1.dist-info/licenses → ragmint-0.4.6.data/data}/LICENSE +0 -0
  35. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/WHEEL +0 -0
  36. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/top_level.txt +0 -0
ragmint/core/retriever.py CHANGED
@@ -1,33 +1,165 @@
- from typing import List, Dict, Any
+ from typing import List, Dict, Any, Optional
  import numpy as np
+ from .embeddings import Embeddings
+
+ # Optional imports
+ try:
+     import faiss
+ except ImportError:
+     faiss = None
+
+ try:
+     import chromadb
+ except ImportError:
+     chromadb = None
+
+ try:
+     from sklearn.neighbors import BallTree
+ except ImportError:
+     BallTree = None
+
+ try:
+     from rank_bm25 import BM25Okapi
+ except ImportError:
+     BM25Okapi = None


  class Retriever:
      """
-     Simple vector retriever using cosine similarity.
+     Multi-backend retriever supporting:
+       - "numpy"  : basic cosine similarity (dense)
+       - "faiss"  : high-performance dense retriever
+       - "chroma" : persistent vector DB
+       - "sklearn": BallTree (cosine or Euclidean)
+       - "bm25"   : lexical retriever using Rank-BM25
+
+     Example:
+         retriever = Retriever(embedder, documents=["A", "B", "C"], backend="bm25")
+         results = retriever.retrieve("example query", top_k=3)
      """

-     def __init__(self, embeddings: List[np.ndarray], documents: List[str]):
-         if len(embeddings) == 0:
-             self.embeddings = np.zeros((1, 768))
-         else:
-             self.embeddings = np.array(embeddings)
-         self.documents = documents or [""]
+     def __init__(
+         self,
+         embedder: Optional[Embeddings] = None,
+         documents: Optional[List[str]] = None,
+         embeddings: Optional[np.ndarray] = None,
+         backend: str = "numpy",
+     ):
+         self.embedder = embedder
+         self.documents = documents or []
+         self.backend = backend.lower()
+         self.embeddings = None
+         self.index = None
+         self.client = None
+         self.bm25 = None
+
+         # Initialize embeddings for dense backends
+         if self.backend not in ["bm25"]:
+             if embeddings is not None:
+                 self.embeddings = np.array(embeddings)
+             elif self.documents and self.embedder:
+                 self.embeddings = self.embedder.encode(self.documents)
+             else:
+                 self.embeddings = np.zeros((0, getattr(self.embedder, "dim", 768)))
+
+             # Normalize for cosine
+             if self.embeddings.size > 0:
+                 self.embeddings = self._normalize(self.embeddings)
+
+         # Initialize backend
+         self._init_backend()
+
+     # ------------------------
+     # Backend Initialization
+     # ------------------------
+     def _init_backend(self):
+         if self.backend == "faiss":
+             if faiss is None:
+                 raise ImportError("faiss not installed. Run `pip install faiss-cpu`.")
+             self.index = faiss.IndexFlatIP(self.embedder.dim)
+             self.index.add(self.embeddings.astype("float32"))
+
+         elif self.backend == "chroma":
+             if chromadb is None:
+                 raise ImportError("chromadb not installed. Run `pip install chromadb`.")
+             self.client = chromadb.Client()
+             self.collection = self.client.create_collection(name="ragmint_retriever")
+             for i, doc in enumerate(self.documents):
+                 self.collection.add(
+                     ids=[str(i)],
+                     documents=[doc],
+                     embeddings=[self.embeddings[i].tolist()],
+                 )

+         elif self.backend == "sklearn":
+             if BallTree is None:
+                 raise ImportError("scikit-learn not installed. Run `pip install scikit-learn`.")
+             self.index = BallTree(self.embeddings)
+
+         elif self.backend == "bm25":
+             if BM25Okapi is None:
+                 raise ImportError("rank-bm25 not installed. Run `pip install rank-bm25`.")
+             tokenized_corpus = [doc.lower().split() for doc in self.documents]
+             self.bm25 = BM25Okapi(tokenized_corpus)
+
+         elif self.backend != "numpy":
+             raise ValueError(f"Unsupported retriever backend: {self.backend}")
+
+     # ------------------------
+     # Retrieval
+     # ------------------------
      def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-         if self.embeddings.size == 0 or len(self.documents) == 0:
+         if len(self.documents) == 0:
+             return [{"text": "", "score": 0.0}]
+
+         # BM25 retrieval (lexical)
+         if self.backend == "bm25":
+             tokenized_query = query.lower().split()
+             scores = self.bm25.get_scores(tokenized_query)
+             top_indices = np.argsort(scores)[::-1][:top_k]
+             return [
+                 {"text": self.documents[i], "score": float(scores[i])}
+                 for i in top_indices
+             ]
+
+         # Dense retrieval (others)
+         if self.embeddings is None or self.embeddings.size == 0:
              return [{"text": "", "score": 0.0}]

-         query_vec = self._embed(query)
-         scores = self._cosine_similarity(query_vec, self.embeddings)
-         top_indices = np.argsort(scores)[::-1][:min(top_k, len(scores))]
-         return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
+         query_vec = self.embedder.encode([query])[0]
+         query_vec = self._normalize(query_vec)

-     def _embed(self, query: str) -> np.ndarray:
-         dim = self.embeddings.shape[1] if len(self.embeddings.shape) > 1 else 768
-         return np.random.rand(dim)
+         if self.backend == "numpy":
+             scores = np.dot(self.embeddings, query_vec)
+             top_indices = np.argsort(scores)[::-1][:top_k]
+             return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
+
+         elif self.backend == "faiss":
+             query_vec = np.expand_dims(query_vec.astype("float32"), axis=0)
+             scores, indices = self.index.search(query_vec, top_k)
+             return [{"text": self.documents[int(i)], "score": float(scores[0][j])} for j, i in enumerate(indices[0])]
+
+         elif self.backend == "chroma":
+             results = self.collection.query(query_texts=[query], n_results=top_k)
+             docs = results["documents"][0]
+             scores = results["distances"][0]
+             return [{"text": d, "score": 1 - s} for d, s in zip(docs, scores)]
+
+         elif self.backend == "sklearn":
+             distances, indices = self.index.query([query_vec], k=top_k)
+             scores = 1 - distances[0]
+             return [{"text": self.documents[int(i)], "score": float(scores[j])} for j, i in enumerate(indices[0])]
+
+         else:
+             raise ValueError(f"Unknown backend: {self.backend}")

-     def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
-         a_norm = a / np.linalg.norm(a)
-         b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
-         return np.dot(b_norm, a_norm)
+     # ------------------------
+     # Utils
+     # ------------------------
+     @staticmethod
+     def _normalize(vectors: np.ndarray) -> np.ndarray:
+         if vectors.ndim == 1:
+             norm = np.linalg.norm(vectors)
+             return vectors / norm if norm > 0 else vectors
+         norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+         return np.divide(vectors, norms, out=np.zeros_like(vectors), where=norms != 0)
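For reference, the reworked constructor can be exercised end to end with the default "numpy" backend and no optional dependencies. A minimal sketch, assuming only the retriever above; DummyEmbedder is a hypothetical stand-in for ragmint's Embeddings wrapper (the dense code paths only touch .encode() and .dim):

import numpy as np
from ragmint.core.retriever import Retriever

class DummyEmbedder:
    # Hypothetical toy embedder; deterministic so the sketch is reproducible
    dim = 8
    def encode(self, texts):
        rng = np.random.default_rng(0)
        return rng.random((len(texts), self.dim))

docs = ["RAG combines retrieval with generation.",
        "Embeddings map text to numerical vectors."]
retriever = Retriever(embedder=DummyEmbedder(), documents=docs, backend="numpy")
print(retriever.retrieve("what is RAG?", top_k=1))
# -> [{"text": ..., "score": ...}] — cosine score against the normalized document matrix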
ragmint/experiments/validation_qa.json CHANGED
@@ -1,14 +1 @@
- [
-   {
-     "query": "What is Retrieval-Augmented Generation?",
-     "expected_answer": "A technique that combines information retrieval with language generation to improve factual accuracy."
-   },
-   {
-     "query": "What is the role of embeddings in a RAG system?",
-     "expected_answer": "They represent text as numerical vectors for similarity-based retrieval."
-   },
-   {
-     "query": "What is Maximal Marginal Relevance used for?",
-     "expected_answer": "To select diverse and relevant documents during reranking."
-   }
- ]
+ []
ragmint/explainer.py CHANGED
@@ -1,49 +1,76 @@
  """
  Interpretability Layer
  ----------------------
- Uses Gemini or Anthropic Claude to explain why one RAG configuration
- outperforms another. Falls back gracefully if no API key is provided.
+ Uses Gemini or Anthropic Claude to explain why a particular RAG configuration
+ performed best, considering both optimizer results and corpus characteristics.
  """

  import os
  import json
+ from dotenv import load_dotenv

+ # Load .env if available
+ load_dotenv()

- def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-pro") -> str:
+ def explain_results(best_result: dict, all_results: list, corpus_stats: dict = None,
+                     model: str = "gemini-2.5-flash-lite") -> str:
      """
-     Generate a natural-language explanation comparing two RAG experiment results.
-     Priority:
-     1. Anthropic Claude (if ANTHROPIC_API_KEY is set)
-     2. Google Gemini (if GOOGLE_API_KEY is set)
-     3. Fallback text message
-     """
-     prompt = f"""
-     You are an AI evaluation expert.
-     Compare these two RAG experiment results and explain why one performs better.
-     Metrics A: {json.dumps(results_a, indent=2)}
-     Metrics B: {json.dumps(results_b, indent=2)}
-     Provide a concise, human-friendly explanation and practical improvement tips.
+     Generate a detailed natural-language explanation for RAG optimization results.
+
+     Parameters:
+         - best_result: dict containing the best configuration and metrics.
+         - all_results: list of all trial results with metrics and configs.
+         - corpus_stats: optional dict with corpus info (size, avg_len, num_docs).
+         - model: LLM model name (Gemini or Claude).
+
+     Returns:
+         A natural-language explanation string.
      """

      anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-     google_key = os.getenv("GEMINI_API_KEY")
+     google_key = os.getenv("GOOGLE_API_KEY")
+
+     # Build dynamic context
+     corpus_info = json.dumps(corpus_stats or {}, indent=2)
+     best_json = json.dumps(best_result, indent=2)
+     all_json = json.dumps(list(all_results)[:10], indent=2)  # cap for safety

+     prompt = f"""
+     You are an expert AI researcher specializing in Retrieval-Augmented Generation (RAG) optimization.
+
+     A RAG auto-tuner was run on a corpus with these characteristics:
+     {corpus_info}
+
+     The tuner evaluated multiple configurations and metrics. Below are:
+     - The BEST configuration:
+     {best_json}
+
+     - A sample of ALL evaluated configurations:
+     {all_json}
+
+     Please:
+     1. Explain WHY this best configuration likely performs better than others.
+     2. Highlight trade-offs between accuracy, latency, and resource usage.
+     3. Suggest potential improvements (different chunking, embedding, retriever, etc.).
+     4. Provide a concise summary of which setup you recommend for this corpus.
+     Keep it structured, under 300 words, and easy to read.
+     """

-     # 1️⃣ Try Anthropic Claude first
+     # --- 1️⃣ Anthropic Claude first ---
      if anthropic_key:
          try:
              from anthropic import Anthropic
              client = Anthropic(api_key=anthropic_key)
              response = client.messages.create(
                  model="claude-3-opus-20240229",
-                 max_tokens=300,
+                 max_tokens=500,
                  messages=[{"role": "user", "content": prompt}],
              )
              return response.content[0].text
          except Exception as e:
              return f"[Claude unavailable] {e}"

-     # 2️⃣ Fallback to Google Gemini
+     # --- 2️⃣ Gemini fallback ---
      elif google_key:
          try:
              import google.generativeai as genai
@@ -53,7 +80,7 @@ def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-p
          except Exception as e:
              return f"[Gemini unavailable] {e}"

-     # 3️⃣ Fallback if neither key is available
+     # --- 3️⃣ Fallback message ---
      else:
          return (
              "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
ragmint/integrations/__init__.py ADDED
File without changes
ragmint/integrations/config_adapter.py ADDED
@@ -0,0 +1,96 @@
+ """
+ RAGMint → LangChain Config Adapter
+ ----------------------------------
+ Takes RAGMint or AutoRAGTuner recommendations and converts them into
+ a normalized, pickle-safe configuration that can be used to build
+ a LangChain RAG pipeline later.
+ """
+
+ import json
+ import pickle
+ from pathlib import Path
+ from typing import Dict, Any
+
+
+ class LangchainConfigAdapter:
+     """
+     Converts RAGMint recommendations into LangChain-compatible configs.
+
+     Example:
+         adapter = LangchainConfigAdapter()
+         cfg = adapter.prepare(recommendation)
+         adapter.save(cfg, "best_config.pkl")
+     """
+
+     DEFAULT_EMBEDDINGS = {
+         "OpenAI": "sentence-transformers/all-MiniLM-L6-v2",
+         "SentenceTransformers": "sentence-transformers/all-MiniLM-L6-v2",
+         "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
+         "InstructorXL": "hkunlp/instructor-xl"
+     }
+
+     SUPPORTED_RETRIEVERS = {"faiss", "chroma", "bm25", "numpy", "sklearn"}
+
+     def __init__(self, recommendation: Dict[str, Any] | None = None):
+         self.recommendation = recommendation
+
+     def prepare(self, recommendation: Dict[str, Any] | None = None) -> Dict[str, Any]:
+         """
+         Normalize and validate configuration for LangChain use.
+
+         Returns:
+             dict with clean retriever, embedding, and chunking settings.
+         """
+         recommendation = recommendation or self.recommendation or {}
+         retriever = recommendation.get("retriever", "faiss").lower()
+         embedding_model = recommendation.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
+         chunk_size = recommendation.get("chunk_size", 400)
+         overlap = recommendation.get("overlap", 100)
+
+         # Normalize embedding model names
+         embedding_model = self.DEFAULT_EMBEDDINGS.get(embedding_model, embedding_model)
+
+         # Validate retriever backend
+         if retriever not in self.SUPPORTED_RETRIEVERS:
+             raise ValueError(f"Unsupported retriever backend: {retriever}")
+
+         config = {
+             "retriever": retriever,
+             "embedding_model": embedding_model,
+             "chunk_size": int(chunk_size),
+             "overlap": int(overlap),
+         }
+
+         return config
+
+     def save(self, config: Dict[str, Any], path: str):
+         """
+         Save configuration to a pickle file.
+         """
+         Path(path).parent.mkdir(parents=True, exist_ok=True)
+         with open(path, "wb") as f:
+             pickle.dump(config, f)
+         print(f"💾 Saved LangChain config → {path}")
+
+     def load(self, path: str) -> Dict[str, Any]:
+         """
+         Load configuration from a pickle file.
+         """
+         with open(path, "rb") as f:
+             cfg = pickle.load(f)
+         print(f"✅ Loaded LangChain config ← {path}")
+         return cfg
+
+     def to_json(self, config: Dict[str, Any], path: str):
+         """
+         Save configuration as JSON (for human readability).
+         """
+         Path(path).parent.mkdir(parents=True, exist_ok=True)
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump(config, f, indent=2)
+         print(f"📝 Exported LangChain config → {path}")
+
+     # Alias for backward compatibility
+     def to_standard_config(self, recommendation: Dict[str, Any] | None = None) -> Dict[str, Any]:
+         """Alias for backward compatibility with older test suites."""
+         return self.prepare(recommendation)
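Putting the adapter together, a minimal round trip might look like the sketch below (the recommendation values are illustrative; note that prepare() lower-cases the retriever name and maps known embedding aliases through DEFAULT_EMBEDDINGS):

from ragmint.integrations.config_adapter import LangchainConfigAdapter

recommendation = {"retriever": "FAISS", "embedding_model": "OpenAI",
                  "chunk_size": 512, "overlap": 64}

adapter = LangchainConfigAdapter(recommendation)
cfg = adapter.prepare()
# {"retriever": "faiss", "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
#  "chunk_size": 512, "overlap": 64}

adapter.to_json(cfg, "artifacts/best_config.json")  # human-readable export
adapter.save(cfg, "artifacts/best_config.pkl")      # pickle for later prebuilding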
ragmint/integrations/langchain_prebuilder.py ADDED
@@ -0,0 +1,99 @@
+ """
+ LangChain Pre-Build Integration
+ -------------------------------
+ This module bridges RAGMint's auto-tuning system with LangChain,
+ returning retriever and embedding components that can plug directly
+ into any LangChain RAG pipeline.
+
+ Example:
+     from ragmint.integrations.langchain_prebuilder import LangchainPrebuilder
+     from langchain.chains import RetrievalQA
+     from langchain_openai import ChatOpenAI
+
+     prebuilder = LangchainPrebuilder(best_cfg)
+     retriever, embeddings = prebuilder.prepare(documents)
+
+     llm = ChatOpenAI(model="gpt-4o-mini")
+     qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
+ """
+
+ from typing import List, Tuple, Dict, Any
+
+ try:
+     from langchain_text_splitters import RecursiveCharacterTextSplitter
+ except ImportError:
+     from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS, Chroma
+ from langchain_community.retrievers import BM25Retriever
+
+
+ class LangchainPrebuilder:
+     """
+     Dynamically builds LangChain retriever and embedding objects
+     based on a RAGMint configuration dictionary.
+     """
+
+     def __init__(self, cfg: Dict[str, Any]):
+         """
+         Args:
+             cfg (dict): RAGMint configuration with keys:
+                 - retriever: "faiss" | "chroma" | "bm25"
+                 - embedding_model: HuggingFace model name
+                 - chunk_size: int (default=500)
+                 - overlap: int (default=100)
+         """
+         self.cfg = cfg
+         self.retriever_backend = cfg.get("retriever", "faiss").lower()
+         self.embedding_model = cfg.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
+         self.chunk_size = int(cfg.get("chunk_size", 500))
+         self.overlap = int(cfg.get("overlap", 100))
+
+     def prepare(self, documents: List[str]) -> Tuple[Any, Any]:
+         """
+         Prepares LangChain-compatible retriever and embeddings.
+
+         Args:
+             documents (list[str]): Corpus texts
+
+         Returns:
+             (retriever, embeddings): Tuple of initialized LangChain retriever and embedding model
+         """
+         # 1️⃣ Split into chunks
+         splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.chunk_size,
+             chunk_overlap=self.overlap
+         )
+         docs = splitter.create_documents(documents)
+
+         # 2️⃣ Create embeddings
+         embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)
+
+         # 3️⃣ Build retriever
+         retriever = self._build_retriever(docs, embeddings)
+         return retriever, embeddings
+
+     def _build_retriever(self, docs, embeddings):
+         """Internal helper for building retriever backend."""
+         backend = self.retriever_backend
+
+         if backend == "faiss":
+             db = FAISS.from_documents(docs, embeddings)
+             return db.as_retriever(search_kwargs={"k": 5})
+
+         elif backend == "chroma":
+             db = Chroma.from_documents(docs, embeddings, collection_name="ragmint_docs")
+             return db.as_retriever(search_kwargs={"k": 5})
+
+         elif backend == "bm25":
+             # Support both Document objects and raw text strings
+             texts = [getattr(d, "page_content", d) for d in docs]
+             retriever = BM25Retriever.from_texts(texts)
+             retriever.k = 5
+             return retriever
+
+         else:
+             raise ValueError(f"Unsupported retriever backend: {backend}")
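A minimal sketch of the bm25 path, assuming langchain-community and rank-bm25 are installed. Note that prepare() still instantiates HuggingFaceEmbeddings even for bm25, so the embedding model is downloaded regardless; retriever.invoke() is the current LangChain retriever entry point:

from ragmint.integrations.langchain_prebuilder import LangchainPrebuilder

cfg = {"retriever": "bm25", "chunk_size": 300, "overlap": 50}
prebuilder = LangchainPrebuilder(cfg)
retriever, embeddings = prebuilder.prepare([
    "RAG systems retrieve supporting passages before generating an answer.",
    "BM25 is a lexical ranking function that needs no embeddings.",
])

docs = retriever.invoke("what does BM25 do?")
print(docs[0].page_content)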
ragmint/leaderboard.py CHANGED
@@ -1,45 +1,51 @@
  import os
  import json
  from datetime import datetime
- from typing import Dict, Any, Optional
- from supabase import create_client
+ from typing import Dict, Any, List, Optional
+

  class Leaderboard:
-     def __init__(self, storage_path: Optional[str] = None):
+     def __init__(self, storage_path: Optional[str] = "leaderboard.jsonl"):
          self.storage_path = storage_path
-         url = os.getenv("SUPABASE_URL")
-         key = os.getenv("SUPABASE_KEY")
-         self.client = None
-         if url and key:
-             self.client = create_client(url, key)
-         elif not storage_path:
-             raise EnvironmentError("Set SUPABASE_URL/SUPABASE_KEY or pass storage_path")
-
-     def upload(self, run_id: str, config: Dict[str, Any], score: float):
+         os.makedirs(os.path.dirname(self.storage_path) or ".", exist_ok=True)
+
+         if not os.path.exists(self.storage_path):
+             open(self.storage_path, "w", encoding="utf-8").close()
+
+     def upload(
+         self,
+         run_id: str,
+         best_config: Dict[str, Any],
+         best_score: float,
+         all_results: List[Dict[str, Any]],
+         documents: List[str],
+         model: str,
+         corpus_stats: Optional[Dict[str, Any]] = None,
+     ):
+         """Persist a full experiment run to local leaderboard."""
          data = {
              "run_id": run_id,
-             "config": config,
-             "score": score,
              "timestamp": datetime.utcnow().isoformat(),
+             "best_config": best_config,
+             "best_score": best_score,
+             "all_results": all_results,
+             "documents": [os.path.basename(d) for d in documents],
+             "model": model,
+             "corpus_stats": corpus_stats or {},
          }
-         if self.client:
-             return self.client.table("experiments").insert(data).execute()
-         else:
-             os.makedirs(os.path.dirname(self.storage_path), exist_ok=True)
-             with open(self.storage_path, "a", encoding="utf-8") as f:
-                 f.write(json.dumps(data) + "\n")
-             return data
-
-     def top_results(self, limit: int = 10):
-         if self.client:
-             return (
-                 self.client.table("experiments")
-                 .select("*")
-                 .order("score", desc=True)
-                 .limit(limit)
-                 .execute()
-             )
-         else:
-             with open(self.storage_path, "r", encoding="utf-8") as f:
-                 lines = [json.loads(line) for line in f]
-             return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
+
+         with open(self.storage_path, "a", encoding="utf-8") as f:
+             f.write(json.dumps(data) + "\n")
+
+         return data
+
+     def all_results(self) -> List[Dict[str, Any]]:
+         if not os.path.exists(self.storage_path):
+             return []
+         with open(self.storage_path, "r", encoding="utf-8") as f:
+             return [json.loads(line) for line in f if line.strip()]
+
+     def top_results(self, limit: int = 10) -> List[Dict[str, Any]]:
+         """Return top experiments by score."""
+         results = self.all_results()
+         return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]
+ return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]