ragmint 0.2.1__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ragmint/app.py +512 -0
  2. ragmint/autotuner.py +201 -17
  3. ragmint/core/chunking.py +68 -4
  4. ragmint/core/embeddings.py +46 -10
  5. ragmint/core/evaluation.py +33 -14
  6. ragmint/core/pipeline.py +34 -10
  7. ragmint/core/retriever.py +152 -20
  8. ragmint/experiments/validation_qa.json +1 -14
  9. ragmint/explainer.py +47 -20
  10. ragmint/integrations/__init__.py +0 -0
  11. ragmint/integrations/config_adapter.py +96 -0
  12. ragmint/integrations/langchain_prebuilder.py +99 -0
  13. ragmint/leaderboard.py +41 -35
  14. ragmint/qa_generator.py +190 -0
  15. ragmint/tests/test_autotuner.py +52 -30
  16. ragmint/tests/test_config_adapter.py +39 -0
  17. ragmint/tests/test_embeddings.py +46 -0
  18. ragmint/tests/test_explainer.py +28 -12
  19. ragmint/tests/test_integration_autotuner_ragmint.py +39 -52
  20. ragmint/tests/test_langchain_prebuilder.py +82 -0
  21. ragmint/tests/test_leaderboard.py +78 -25
  22. ragmint/tests/test_pipeline.py +3 -2
  23. ragmint/tests/test_qa_generator.py +66 -0
  24. ragmint/tests/test_retriever.py +3 -2
  25. ragmint/tests/test_tuner.py +1 -1
  26. ragmint/tuner.py +109 -22
  27. ragmint-0.4.6.data/data/README.md +485 -0
  28. ragmint-0.4.6.dist-info/METADATA +530 -0
  29. ragmint-0.4.6.dist-info/RECORD +48 -0
  30. ragmint-0.4.6.dist-info/licenses/LICENSE +19 -0
  31. ragmint/tests/test_explainer_integration.py +0 -18
  32. ragmint-0.2.1.dist-info/METADATA +0 -27
  33. ragmint-0.2.1.dist-info/RECORD +0 -38
  34. {ragmint-0.2.1.dist-info/licenses → ragmint-0.4.6.data/data}/LICENSE +0 -0
  35. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/WHEEL +0 -0
  36. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/top_level.txt +0 -0
ragmint/autotuner.py CHANGED
@@ -1,33 +1,217 @@
 """
 Auto-RAG Tuner
 --------------
-Recommends retriever–embedding pairs dynamically based on corpus size
-and dataset characteristics. Integrates seamlessly with RAGMint evaluator.
+Automatically recommends and optimizes RAG configurations based on corpus statistics.
+Integrates with RAGMint to perform full end-to-end tuning.
 """

-from .core.evaluation import evaluate_config
+import os
+import logging
+from statistics import mean
+from typing import Dict, Any, Tuple, List, Optional
+import random
+
+from sentence_transformers import SentenceTransformer
+from .tuner import RAGMint
+
+
+logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


 class AutoRAGTuner:
-    def __init__(self, corpus_stats: dict):
+    DEFAULT_EMBEDDINGS = "sentence-transformers/all-MiniLM-L6-v2"
+
+    def __init__(self, docs_path: str):
+        """
+        AutoRAGTuner automatically analyzes a corpus and runs an optimized RAG tuning pipeline.
+
+        Args:
+            docs_path (str): Path to the directory containing documents (.txt, .md, .rst)
         """
-        corpus_stats: dict
-        Example: {'size': 12000, 'avg_len': 240}
+        self.docs_path = docs_path
+        self.corpus_stats = self._analyze_corpus()
+
+    # -----------------------------
+    # Corpus Analysis
+    # -----------------------------
+    def _analyze_corpus(self) -> Dict[str, Any]:
+        """Compute corpus size, average length, and number of documents."""
+        docs = []
+        total_chars = 0
+        num_docs = 0
+
+        if not os.path.exists(self.docs_path):
+            logging.warning(f"⚠️ Corpus path not found: {self.docs_path}")
+            return {"size": 0, "avg_len": 0, "num_docs": 0}
+
+        for file in os.listdir(self.docs_path):
+            if file.endswith((".txt", ".md", ".rst")):
+                with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
+                    content = f.read()
+                    docs.append(content)
+                    total_chars += len(content)
+                    num_docs += 1
+
+        avg_len = int(mean([len(d) for d in docs])) if docs else 0
+        stats = {"size": total_chars, "avg_len": avg_len, "num_docs": num_docs}
+        logging.info(f"📊 Corpus stats: {stats}")
+        return stats
+
+    # -----------------------------
+    # Chunk Size Suggestion
+    # -----------------------------
+    def suggest_chunk_sizes(
+        self,
+        model_name: Optional[str] = None,
+        num_pairs: Optional[int] = None,
+        step: int = 10
+    ) -> List[Tuple[int, int]]:
+        if num_pairs is None:
+            raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
+
+        if model_name is None:
+            model_name = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ No embedding model provided. Using default: {model_name}")
+
+        model = SentenceTransformer(model_name)
+        max_tokens = getattr(model, "max_seq_length", 256)
+        approx_words = max(1, int(max_tokens * 0.75))
+        avg_len = self.corpus_stats.get("avg_len", 400)
+
+        max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+        # Safe chunk and overlap ranges
+        chunk_sizes = list(range(50, max_chunk + 1, step))
+        overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+        if not overlaps:
+            overlaps = [max(1, max_chunk // 4)]
+
+        candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+        # Randomly sample requested number of pairs
+        if num_pairs >= len(candidates):
+            sampled = candidates
+        else:
+            sampled = random.sample(candidates, num_pairs)
+
+        logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+        return sampled
+
+    # -----------------------------
+    # Recommendation Logic
+    # -----------------------------
+    def recommend(
+        self,
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Dict[str, Any]:
         """
-        self.corpus_stats = corpus_stats
+        Recommend retriever, embedding, chunking, and strategy based on corpus stats.
+
+        Args:
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of (chunk_size, overlap) pairs to generate.

-    def recommend(self):
+        Returns:
+            Dict[str, Any]: Recommended RAG configuration
+        """
         size = self.corpus_stats.get("size", 0)
         avg_len = self.corpus_stats.get("avg_len", 0)

-        if size < 1000:
-            return {"retriever": "BM25", "embedding_model": "OpenAI"}
-        elif size < 10000:
-            return {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+        # Determine retriever
+        if size <= 2000:
+            retriever = "BM25"
+            if embedding_model is None:
+                embedding_model = self.DEFAULT_EMBEDDINGS
+        elif size <= 10000:
+            retriever = "Chroma"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
         else:
-            return {"retriever": "FAISS", "embedding_model": "InstructorXL"}
+            retriever = "FAISS"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/all-mpnet-base-v2"
+
+        if embedding_model is None:
+            embedding_model = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ Using default embedding model: {embedding_model}")
+
+        # Suggest chunk sizes
+        # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+        chunk_candidates = self.suggest_chunk_sizes(
+            model_name=embedding_model,
+            num_pairs=num_chunk_pairs
+        )
+
+        # Safety check
+        if not chunk_candidates:
+            raise RuntimeError("No chunk candidates generated.")
+
+        # Pick the first pair as default recommendation
+        chunk_size, overlap = chunk_candidates[0]
+
+        strategy = "fixed" if avg_len < 400 else "sentence"
+
+        recommendation = {
+            "retriever": retriever,
+            "embedding_model": embedding_model,
+            "chunk_size": chunk_size,
+            "overlap": overlap,
+            "strategy": strategy,
+            "chunk_candidates": chunk_candidates,
+        }
+
+        logging.info(f"🔮 AutoRAG Recommendation: {recommendation}")
+        return recommendation
+
+    # -----------------------------
+    # Full Auto-Tuning
+    # -----------------------------
+    def auto_tune(
+        self,
+        validation_set: str = None,
+        metric: str = "faithfulness",
+        trials: int = 5,
+        search_type: str = "random",
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Run a full automatic optimization using RAGMint.
+
+        Args:
+            validation_set (str): Path to validation set.
+            metric (str): Metric to optimize.
+            trials (int): Number of optimization trials.
+            search_type (str): Search strategy.
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of chunk pairs to try.
+
+        Returns:
+            Tuple[Dict[str, Any], List[Dict[str, Any]]]: Best configuration and all trial results.
+        """
+        rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)
+
+        chunk_candidates = rec["chunk_candidates"]
+
+        logging.info("🚀 Launching full AutoRAG optimization with RAGMint")
+
+        tuner = RAGMint(
+            docs_path=self.docs_path,
+            retrievers=[rec["retriever"]],
+            embeddings=[rec["embedding_model"]],
+            rerankers=["mmr"],
+            chunk_sizes=[c[0] for c in chunk_candidates],
+            overlaps=[c[1] for c in chunk_candidates],
+            strategies=[rec["strategy"]],
+        )
+
+        best, results = tuner.optimize(
+            validation_set=validation_set,
+            metric=metric,
+            trials=trials,
+            search_type=search_type,
+        )

-    def auto_tune(self, validation_data):
-        config = self.recommend()
-        results = evaluate_config(config, validation_data)
-        return {"recommended": config, "results": results}
+        logging.info(f"🏁 AutoRAG tuning complete. Best: {best}")
+        return best, results
ragmint/core/chunking.py CHANGED
@@ -1,18 +1,45 @@
 from typing import List
+import re
+
+try:
+    import tiktoken
+except ImportError:
+    tiktoken = None
+
+try:
+    import nltk
+    nltk.download("punkt", quiet=True)
+    from nltk.tokenize import sent_tokenize
+except ImportError:
+    sent_tokenize = None


 class Chunker:
     """
-    Handles text chunking and splitting strategies:
-    - Fixed size chunks
-    - Overlapping windows
+    Handles text chunking strategies:
+    - fixed: character-based
+    - token: token-based (requires tiktoken)
+    - sentence: splits by full sentences (requires nltk)
     """

-    def __init__(self, chunk_size: int = 500, overlap: int = 100):
+    def __init__(self, chunk_size: int = 500, overlap: int = 100, strategy: str = "fixed"):
         self.chunk_size = chunk_size
         self.overlap = overlap
+        self.strategy = strategy

     def chunk_text(self, text: str) -> List[str]:
+        """Dispatches to the correct chunking strategy."""
+        if self.strategy == "token" and tiktoken:
+            return self._chunk_by_tokens(text)
+        elif self.strategy == "sentence" and sent_tokenize:
+            return self._chunk_by_sentences(text)
+        else:
+            return self._chunk_fixed(text)
+
+    # -------------------------------
+    # Fixed-length (default)
+    # -------------------------------
+    def _chunk_fixed(self, text: str) -> List[str]:
         chunks = []
         start = 0
         while start < len(text):
@@ -20,3 +47,40 @@ class Chunker:
             chunks.append(text[start:end])
             start += self.chunk_size - self.overlap
         return chunks
+
+    # -------------------------------
+    # Token-based (for LLM embedding)
+    # -------------------------------
+    def _chunk_by_tokens(self, text: str) -> List[str]:
+        if not tiktoken:
+            raise ImportError("tiktoken is required for token-based chunking.")
+        enc = tiktoken.get_encoding("cl100k_base")
+        tokens = enc.encode(text)
+
+        chunks = []
+        for i in range(0, len(tokens), self.chunk_size - self.overlap):
+            chunk_tokens = tokens[i:i + self.chunk_size]
+            chunks.append(enc.decode(chunk_tokens))
+        return chunks
+
+    # -------------------------------
+    # Sentence-based
+    # -------------------------------
+    def _chunk_by_sentences(self, text: str) -> List[str]:
+        if not sent_tokenize:
+            raise ImportError("nltk is required for sentence-based chunking.")
+        sentences = sent_tokenize(text)
+        chunks = []
+        current_chunk = ""
+
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) <= self.chunk_size:
+                current_chunk += " " + sentence
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
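A small sketch of how the three chunking strategies added here are selected; the sample text is invented. `chunk_text()` quietly falls back to the fixed strategy when tiktoken or nltk is missing, since the dispatch also checks that the optional import succeeded:

```python
from ragmint.core.chunking import Chunker

text = (
    "RAGMint tunes retrieval pipelines. It compares chunk sizes, embeddings, "
    "and retrievers against a validation set."
)

fixed = Chunker(chunk_size=60, overlap=10, strategy="fixed").chunk_text(text)        # character windows
sentences = Chunker(chunk_size=80, overlap=0, strategy="sentence").chunk_text(text)  # requires nltk
tokens = Chunker(chunk_size=20, overlap=5, strategy="token").chunk_text(text)        # requires tiktoken

print(len(fixed), len(sentences), len(tokens))
```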
ragmint/core/embeddings.py CHANGED
@@ -1,19 +1,55 @@
 import numpy as np
+from dotenv import load_dotenv

+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None

-class EmbeddingModel:
+
+class Embeddings:
     """
-    Wrapper for embedding backends (OpenAI, HuggingFace, etc.)
+    Wrapper for embedding backends: HuggingFace (SentenceTransformers) or Dummy.
+
+    Example:
+        model = Embeddings("huggingface", model_name="all-MiniLM-L6-v2")
+        embeddings = model.encode(["example text"])
     """

-    def __init__(self, backend: str = "dummy"):
-        self.backend = backend
+    def __init__(self, backend: str = "huggingface", model_name: str = None):
+        load_dotenv()
+        self.backend = backend.lower()
+        self.model_name = model_name or "all-MiniLM-L6-v2"
+
+        if self.backend == "huggingface":
+            if SentenceTransformer is None:
+                raise ImportError("Please install `sentence-transformers` to use HuggingFace embeddings.")
+            self.model = SentenceTransformer(self.model_name)
+            self.dim = self.model.get_sentence_embedding_dimension()
+
+        elif self.backend == "dummy":
+            self.model = None
+            self.dim = 768  # Default embedding dimension for dummy backend
+
+        else:
+            raise ValueError(f"Unsupported embedding backend: {backend}")

     def encode(self, texts):
-        if self.backend == "openai":
-            # Example placeholder — integrate with actual OpenAI API
-            return [np.random.rand(768) for _ in texts]
-        elif self.backend == "huggingface":
-            return [np.random.rand(768) for _ in texts]
+        if isinstance(texts, str):
+            texts = [texts]
+
+        if self.backend == "huggingface":
+            embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+
+        elif self.backend == "dummy":
+            # Return a NumPy array of shape (len(texts), dim)
+            embeddings = np.random.rand(len(texts), self.dim).astype(np.float32)
+
         else:
-            return [np.random.rand(768) for _ in texts]
+            raise ValueError(f"Unknown embedding backend: {self.backend}")
+
+        # ✅ Always ensure NumPy array output
+        if not isinstance(embeddings, np.ndarray):
+            embeddings = np.array(embeddings, dtype=np.float32)
+
+        return embeddings
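A short sketch of the renamed `Embeddings` wrapper (formerly `EmbeddingModel`). The model names are the defaults from the diff; `encode()` now always returns a NumPy array, even for a single string:

```python
from ragmint.core.embeddings import Embeddings

# Real sentence-transformers backend (downloads the model on first use).
hf = Embeddings(backend="huggingface", model_name="all-MiniLM-L6-v2")
vecs = hf.encode(["What does RAGMint optimize?", "Chunk size and retriever choice."])
print(vecs.shape)  # (2, hf.dim)

# Dummy backend: no model download, random 768-dimensional vectors, handy in tests.
dummy = Embeddings(backend="dummy")
print(dummy.encode("hello").shape)  # (1, 768)
```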
ragmint/core/evaluation.py CHANGED
@@ -1,33 +1,53 @@
 import time
-from typing import Dict, Any
-from difflib import SequenceMatcher
+from typing import Dict, Any, List
+import numpy as np
+from .embeddings import Embeddings


 class Evaluator:
     """
-    Simple evaluation of generated answers:
-    - Faithfulness (similarity between answer and context)
-    - Latency
+    Semantic evaluation of generated answers:
+    - Faithfulness: cosine similarity between answer and context embeddings
+    - Latency: time to compute embeddings and similarity
     """

-    def __init__(self):
-        pass
+    def __init__(self, embeddings: Embeddings = None):
+        self.embeddings = embeddings or Embeddings()  # default to HuggingFace all-MiniLM-L6-v2

     def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
         start = time.time()
-        faithfulness = self._similarity(answer, context)
-        latency = time.time() - start

+        # Compute embeddings
+        emb_answer = self.embeddings.encode(answer)
+        emb_context = self.embeddings.encode(context)
+
+        # Compute cosine similarity
+        faithfulness = self._cosine_similarity(emb_answer, emb_context)
+
+        faithfulness = np.clip(faithfulness, 0.0, 1.0)
+
+        latency = time.time() - start
         return {
             "faithfulness": faithfulness,
             "latency": latency,
         }

-    def _similarity(self, a: str, b: str) -> float:
-        return SequenceMatcher(None, a, b).ratio()
+    @staticmethod
+    def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+        # Ensure vectors are 1D
+        a = a.flatten()
+        b = b.flatten()
+        if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
+            return 0.0
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

-def evaluate_config(config, validation_data):
-    evaluator = Evaluator()
+
+def evaluate_config(config: Dict[str, Any], validation_data: List[Dict[str, str]], embeddings: Embeddings = None) -> \
+        List[Dict[str, Any]]:
+    """
+    Evaluate a set of model outputs against validation data.
+    """
+    evaluator = Evaluator(embeddings=embeddings)
     results = []
     for sample in validation_data:
         query = sample.get("query", "")
@@ -35,4 +55,3 @@ def evaluate_config(config, validation_data):
         context = sample.get("context", "")
         results.append(evaluator.evaluate(query, answer, context))
     return results
-
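A sketch of the new embedding-based faithfulness scoring. The dummy backend is used so the example runs without downloading a model (its score is meaningless, but the call flow matches the code above); with the default HuggingFace backend the score is the clipped cosine similarity between answer and context embeddings:

```python
from ragmint.core.embeddings import Embeddings
from ragmint.core.evaluation import Evaluator

evaluator = Evaluator(embeddings=Embeddings(backend="dummy"))
metrics = evaluator.evaluate(
    query="Which retriever does AutoRAGTuner pick for small corpora?",
    answer="BM25 for corpora up to roughly 2000 characters.",
    context="AutoRAGTuner selects BM25 when the corpus size is at most 2000 characters.",
)
print(metrics)  # {'faithfulness': <0..1>, 'latency': <seconds>}
```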
ragmint/core/pipeline.py CHANGED
@@ -1,33 +1,57 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, Optional
 from .retriever import Retriever
 from .reranker import Reranker
 from .evaluation import Evaluator
+from .chunking import Chunker


 class RAGPipeline:
     """
     Core Retrieval-Augmented Generation pipeline.
-    Simplified (no generator). It retrieves, reranks, and evaluates.
+    Retrieves, reranks, and evaluates a query given the configured backends.
+    Supports text chunking for optimal retrieval performance.
     """

-    def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
+    def __init__(
+        self,
+        retriever: Retriever,
+        reranker: Reranker,
+        evaluator: Evaluator,
+        chunk_size: int = 500,
+        overlap: int = 100,
+        chunking_strategy: str = "fixed"
+    ):
         self.retriever = retriever
         self.reranker = reranker
         self.evaluator = evaluator

-    def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
+        # Initialize chunker for preprocessing
+        self.chunker = Chunker(chunk_size=chunk_size, overlap=overlap, strategy=chunking_strategy)
+
+    def preprocess_docs(self, documents):
+        """Applies the selected chunking strategy to the document set."""
+        all_chunks = []
+        for doc in documents:
+            chunks = self.chunker.chunk_text(doc)
+            all_chunks.extend(chunks)
+        return all_chunks
+
+    def run(self, query: str, top_k: int = 5, use_chunking: bool = True) -> Dict[str, Any]:
+        # Optional preprocessing step
+        if use_chunking and hasattr(self.retriever, "documents") and self.retriever.documents:
+            self.retriever.documents = self.preprocess_docs(self.retriever.documents)
+
         # Retrieve documents
         retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
+
         # Rerank
         reranked_docs = self.reranker.rerank(query, retrieved_docs)

-        # Use top document as pseudo-answer
-        if reranked_docs:
-            answer = reranked_docs[0]["text"]
-        else:
-            answer = ""
-
+        # Construct pseudo-answer
+        answer = reranked_docs[0]["text"] if reranked_docs else ""
         context = "\n".join([d["text"] for d in reranked_docs])
+
+        # Evaluate
         metrics = self.evaluator.evaluate(query, answer, context)

         return {