PyPI - ragmint - Versions diffs - 0.2.3__tar.gz → 0.3.0__tar.gz - Mend

ragmint-0.2.3.tar.gz → ragmint-0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{ragmint-0.2.3/src/ragmint.egg-info → ragmint-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragmint
-Version: 0.2.3
+Version: 0.3.0
 Summary: A modular framework for evaluating and optimizing RAG pipelines.
 Author-email: Andre Oliveira <oandreoliveira@outlook.com>
 License: Apache License 2.0
@@ -11,7 +11,7 @@ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy>=1.23
+Requires-Dist: numpy<2.0.0
 Requires-Dist: pandas>=2.0
 Requires-Dist: scikit-learn>=1.3
 Requires-Dist: openai>=1.0
@@ -24,6 +24,8 @@ Requires-Dist: pytest
 Requires-Dist: colorama
 Requires-Dist: google-generativeai>=0.8.0
 Requires-Dist: supabase>=2.4.0
+Requires-Dist: python-dotenv
+Requires-Dist: sentence-transformers
 Dynamic: license-file
 # Ragmint
@@ -49,8 +51,8 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
 - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
 - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
 - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
-- ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
-- 🧩 **Embeddings** — OpenAI, HuggingFace
+- ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
+- 🧩 **Embeddings** — Hugging Face
 - 💾 **Caching, experiment tracking, and reproducibility** out of the box
 - 🧰 **Clean modular structure** for easy integration in research and production setups
@@ -94,15 +96,69 @@ optimization:
 ### 3️⃣ Manual Pipeline Usage
 ```python
-from ragmint.core.pipeline import RAGPipeline
+from ragmint.tuner import RAGMint
+# Initialize RAGMint with available components
+rag = RAGMint(
+    docs_path="data/docs/",
+    retrievers=["faiss", "chroma", "sklearn"],
+    embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
+    rerankers=["mmr"]
+)
+# Run optimization over 3 trials using the default validation set
+best, results = rag.optimize(
+    validation_set=None,
+    metric="faithfulness",
+    trials=3
+)
+print("Best configuration:", best)
+```
+---
+# 🧩 Embeddings and Retrievers
+**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
+---
+## 🔤 Available Embeddings (Hugging Face)
+You can select from the following models:
+* `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
+* `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
+* `BAAI/bge-base-en-v1.5` — **English-language**, dense embeddings
+* `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
-pipeline = RAGPipeline({
-    "embedding_model": "text-embedding-3-small",
-    "retriever": "faiss",
-})
-result = pipeline.run("What is retrieval-augmented generation?")
-print(result)
+### Configuration Example
+Use the following format in your config file to specify the embedding model:
+```yaml
+embedding_model: sentence-transformers/all-MiniLM-L6-v2
+```
+---
+## 🔍 Available Retrievers
+**Ragmint** integrates multiple **retrieval backends** to suit different needs:
+| Retriever | Description |
+| :--- | :--- |
+| **FAISS** | Fast vector similarity search; efficient for dense embeddings |
+| **Chroma** | Persistent vector DB; works well for incremental updates |
+| **scikit-learn (BallTree)** | Lightweight local retriever; no separate vector database required |
+### Configuration Example
+To specify the retriever in your configuration file, use the following format:
+```yaml
+retriever: faiss
 ```
 ---
@@ -174,8 +230,7 @@ lb.show_top(3)
 ## 🧠 Explainability with Gemini / Claude
-Compare two RAG configurations and receive natural language insights
-on **why** one performs better.
+Compare two RAG configurations and receive **natural language insights** on why one performs better.
 ```python
 from ragmint.explainer import explain_results
@@ -189,7 +244,7 @@ print(explanation)
 > Set your API keys in a `.env` file or via environment variables:
 > ```
-> export GOOGLE_API_KEY="your_gemini_key"
+> export GEMINI_API_KEY="your_gemini_key"
 > export ANTHROPIC_API_KEY="your_claude_key"
 > ```
@@ -240,16 +295,21 @@ Your `pyproject.toml` includes all required dependencies:
 name = "ragmint"
 version = "0.1.0"
 dependencies = [
-    "numpy",
-    "optuna",
-    "scikit-learn",
-    "faiss-cpu",
-    "chromadb",
-    "pytest",
-    "openai",
-    "tqdm",
-    "google-generativeai",
-    "google-genai",
+  "numpy<2.0.0",
+  "pandas>=2.0",
+  "scikit-learn>=1.3",
+  "openai>=1.0",
+  "tqdm",
+  "pyyaml",
+  "chromadb>=0.4",
+  "faiss-cpu; sys_platform != 'darwin'",
+  "optuna>=3.0",
+  "pytest",
+  "colorama",
+  "google-generativeai>=0.8.0",
+  "supabase>=2.4.0",
+  "python-dotenv",
+  "sentence-transformers"
 ]
 ```

{ragmint-0.2.3 → ragmint-0.3.0}/README.md RENAMED Viewed

@@ -21,8 +21,8 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
 - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
 - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
 - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
-- ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
-- 🧩 **Embeddings** — OpenAI, HuggingFace
+- ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
+- 🧩 **Embeddings** — Hugging Face
 - 💾 **Caching, experiment tracking, and reproducibility** out of the box
 - 🧰 **Clean modular structure** for easy integration in research and production setups
@@ -66,15 +66,69 @@ optimization:
 ### 3️⃣ Manual Pipeline Usage
 ```python
-from ragmint.core.pipeline import RAGPipeline
+from ragmint.tuner import RAGMint
+# Initialize RAGMint with available components
+rag = RAGMint(
+    docs_path="data/docs/",
+    retrievers=["faiss", "chroma", "sklearn"],
+    embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
+    rerankers=["mmr"]
+)
+# Run optimization over 3 trials using the default validation set
+best, results = rag.optimize(
+    validation_set=None,
+    metric="faithfulness",
+    trials=3
+)
+print("Best configuration:", best)
+```
+---
+# 🧩 Embeddings and Retrievers
+**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
+---
+## 🔤 Available Embeddings (Hugging Face)
+You can select from the following models:
+* `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
+* `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
+* `BAAI/bge-base-en-v1.5` — **English-language**, dense embeddings
+* `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
-pipeline = RAGPipeline({
-    "embedding_model": "text-embedding-3-small",
-    "retriever": "faiss",
-})
-result = pipeline.run("What is retrieval-augmented generation?")
-print(result)
+### Configuration Example
+Use the following format in your config file to specify the embedding model:
+```yaml
+embedding_model: sentence-transformers/all-MiniLM-L6-v2
+```
+---
+## 🔍 Available Retrievers
+**Ragmint** integrates multiple **retrieval backends** to suit different needs:
+| Retriever | Description |
+| :--- | :--- |
+| **FAISS** | Fast vector similarity search; efficient for dense embeddings |
+| **Chroma** | Persistent vector DB; works well for incremental updates |
+| **scikit-learn (BallTree)** | Lightweight local retriever; no separate vector database required |
+### Configuration Example
+To specify the retriever in your configuration file, use the following format:
+```yaml
+retriever: faiss
 ```
 ---
@@ -146,8 +200,7 @@ lb.show_top(3)
 ## 🧠 Explainability with Gemini / Claude
-Compare two RAG configurations and receive natural language insights
-on **why** one performs better.
+Compare two RAG configurations and receive **natural language insights** on why one performs better.
 ```python
 from ragmint.explainer import explain_results
@@ -161,7 +214,7 @@ print(explanation)
 > Set your API keys in a `.env` file or via environment variables:
 > ```
-> export GOOGLE_API_KEY="your_gemini_key"
+> export GEMINI_API_KEY="your_gemini_key"
 > export ANTHROPIC_API_KEY="your_claude_key"
 > ```
@@ -212,16 +265,21 @@ Your `pyproject.toml` includes all required dependencies:
 name = "ragmint"
 version = "0.1.0"
 dependencies = [
-    "numpy",
-    "optuna",
-    "scikit-learn",
-    "faiss-cpu",
-    "chromadb",
-    "pytest",
-    "openai",
-    "tqdm",
-    "google-generativeai",
-    "google-genai",
+  "numpy<2.0.0",
+  "pandas>=2.0",
+  "scikit-learn>=1.3",
+  "openai>=1.0",
+  "tqdm",
+  "pyyaml",
+  "chromadb>=0.4",
+  "faiss-cpu; sys_platform != 'darwin'",
+  "optuna>=3.0",
+  "pytest",
+  "colorama",
+  "google-generativeai>=0.8.0",
+  "supabase>=2.4.0",
+  "python-dotenv",
+  "sentence-transformers"
 ]
 ```

{ragmint-0.2.3 → ragmint-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ragmint"
-version = "0.2.3"
+version = "0.3.0"
 description = "A modular framework for evaluating and optimizing RAG pipelines."
 readme = "README.md"
 license = { text = "Apache License 2.0" }
@@ -14,7 +14,7 @@ authors = [
 keywords = ["RAG", "LLM", "retrieval", "optimization", "AI", "evaluation"]
 requires-python = ">=3.9"
 dependencies = [
-  "numpy>=1.23",
+  "numpy<2.0.0",
   "pandas>=2.0",
   "scikit-learn>=1.3",
   "openai>=1.0",
@@ -26,7 +26,9 @@ dependencies = [
   "pytest",
   "colorama",
   "google-generativeai>=0.8.0",
-  "supabase>=2.4.0"
+  "supabase>=2.4.0",
+  "python-dotenv",
+  "sentence-transformers"
 ]
 [project.urls]

ragmint-0.3.0/src/ragmint/core/embeddings.py ADDED Viewed

@@ -0,0 +1,55 @@
+import numpy as np
+from dotenv import load_dotenv
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+class Embeddings:
+    """
+    Wrapper for embedding backends: HuggingFace (SentenceTransformers) or Dummy.
+    Example:
+        model = Embeddings("huggingface", model_name="all-MiniLM-L6-v2")
+        embeddings = model.encode(["example text"])
+    """
+    def __init__(self, backend: str = "huggingface", model_name: str = None):
+        load_dotenv()
+        self.backend = backend.lower()
+        self.model_name = model_name or "all-MiniLM-L6-v2"
+        if self.backend == "huggingface":
+            if SentenceTransformer is None:
+                raise ImportError("Please install `sentence-transformers` to use HuggingFace embeddings.")
+            self.model = SentenceTransformer(self.model_name)
+            self.dim = self.model.get_sentence_embedding_dimension()
+        elif self.backend == "dummy":
+            self.model = None
+            self.dim = 768  # Default embedding dimension for dummy backend
+        else:
+            raise ValueError(f"Unsupported embedding backend: {backend}")
+    def encode(self, texts):
+        if isinstance(texts, str):
+            texts = [texts]
+        if self.backend == "huggingface":
+            embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        elif self.backend == "dummy":
+            # Return a NumPy array of shape (len(texts), dim)
+            embeddings = np.random.rand(len(texts), self.dim).astype(np.float32)
+        else:
+            raise ValueError(f"Unknown embedding backend: {self.backend}")
+        # ✅ Always ensure NumPy array output
+        if not isinstance(embeddings, np.ndarray):
+            embeddings = np.array(embeddings, dtype=np.float32)
+        return embeddings

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/core/pipeline.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict
 from .retriever import Retriever
 from .reranker import Reranker
 from .evaluation import Evaluator
@@ -7,7 +7,7 @@ from .evaluation import Evaluator
 class RAGPipeline:
     """
     Core Retrieval-Augmented Generation pipeline.
-    Simplified (no generator). It retrieves, reranks, and evaluates.
+    Retrieves, reranks, and evaluates a query given the configured backends.
     """
     def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
@@ -16,18 +16,17 @@ class RAGPipeline:
         self.evaluator = evaluator
     def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
-        # Retrieve documents
+        # Retrieve
         retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
         # Rerank
         reranked_docs = self.reranker.rerank(query, retrieved_docs)
-        # Use top document as pseudo-answer
-        if reranked_docs:
-            answer = reranked_docs[0]["text"]
-        else:
-            answer = ""
+        # Construct pseudo-answer from top doc
+        answer = reranked_docs[0]["text"] if reranked_docs else ""
         context = "\n".join([d["text"] for d in reranked_docs])
+        # Evaluate
         metrics = self.evaluator.evaluate(query, answer, context)
         return {

ragmint-0.3.0/src/ragmint/core/retriever.py ADDED Viewed

@@ -0,0 +1,148 @@
+from typing import List, Dict, Any, Optional
+import numpy as np
+from .embeddings import Embeddings
+# Optional imports
+try:
+    import faiss
+except ImportError:
+    faiss = None
+try:
+    import chromadb
+except ImportError:
+    chromadb = None
+try:
+    from sklearn.neighbors import BallTree
+except ImportError:
+    BallTree = None
+class Retriever:
+    """
+    Multi-backend retriever supporting NumPy, FAISS, Chroma, and Scikit-learn BallTree.
+    Backends:
+        - "numpy"  : basic cosine similarity using NumPy (default)
+        - "faiss"  : fast dense vector search (in-memory)
+        - "chroma" : persistent local vector database
+        - "sklearn": BallTree for cosine or Euclidean distance
+    Example:
+        retriever = Retriever(embedder, documents=["A", "B", "C"], backend="faiss")
+        retriever.retrieve("example query", top_k=3)
+    """
+    def __init__(
+        self,
+        embedder: Embeddings,
+        documents: Optional[List[str]] = None,
+        embeddings: Optional[np.ndarray] = None,
+        backend: str = "numpy",
+    ):
+        self.embedder = embedder
+        self.documents = documents or []
+        self.backend = backend.lower()
+        self.embeddings = None
+        self.index = None
+        self.client = None
+        # Initialize embeddings
+        if embeddings is not None:
+            self.embeddings = np.array(embeddings)
+        elif self.documents:
+            self.embeddings = self.embedder.encode(self.documents)
+        else:
+            self.embeddings = np.zeros((0, self.embedder.dim))
+        # Normalize for cosine
+        if self.embeddings.size > 0:
+            self.embeddings = self._normalize(self.embeddings)
+        # Initialize backend
+        self._init_backend()
+    # ------------------------
+    # Backend Initialization
+    # ------------------------
+    def _init_backend(self):
+        if self.backend == "faiss":
+            if faiss is None:
+                raise ImportError("faiss not installed. Run `pip install faiss-cpu`.")
+            self.index = faiss.IndexFlatIP(self.embedder.dim)
+            self.index.add(self.embeddings.astype("float32"))
+        elif self.backend == "chroma":
+            if chromadb is None:
+                raise ImportError("chromadb not installed. Run `pip install chromadb`.")
+            self.client = chromadb.Client()
+            self.collection = self.client.create_collection(name="ragmint_retriever")
+            for i, doc in enumerate(self.documents):
+                self.collection.add(
+                    ids=[str(i)],
+                    documents=[doc],
+                    embeddings=[self.embeddings[i].tolist()],
+                )
+        elif self.backend == "sklearn":
+            if BallTree is None:
+                raise ImportError("scikit-learn not installed. Run `pip install scikit-learn`.")
+            self.index = BallTree(self.embeddings)
+        elif self.backend != "numpy":
+            raise ValueError(f"Unsupported retriever backend: {self.backend}")
+    # ------------------------
+    # Retrieval
+    # ------------------------
+    def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
+        if len(self.documents) == 0 or self.embeddings.size == 0:
+            return [{"text": "", "score": 0.0}]
+        query_vec = self.embedder.encode([query])[0]
+        query_vec = self._normalize(query_vec)
+        if self.backend == "numpy":
+            scores = np.dot(self.embeddings, query_vec)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [
+                {"text": self.documents[i], "score": float(scores[i])}
+                for i in top_indices
+            ]
+        elif self.backend == "faiss":
+            query_vec = np.expand_dims(query_vec.astype("float32"), axis=0)
+            scores, indices = self.index.search(query_vec, top_k)
+            return [
+                {"text": self.documents[int(i)], "score": float(scores[0][j])}
+                for j, i in enumerate(indices[0])
+            ]
+        elif self.backend == "chroma":
+            results = self.collection.query(query_texts=[query], n_results=top_k)
+            docs = results["documents"][0]
+            scores = results["distances"][0]
+            return [{"text": d, "score": 1 - s} for d, s in zip(docs, scores)]
+        elif self.backend == "sklearn":
+            distances, indices = self.index.query([query_vec], k=top_k)
+            scores = 1 - distances[0]
+            return [
+                {"text": self.documents[int(i)], "score": float(scores[j])}
+                for j, i in enumerate(indices[0])
+            ]
+        else:
+            raise ValueError(f"Unknown backend: {self.backend}")
+    # ------------------------
+    # Utils
+    # ------------------------
+    @staticmethod
+    def _normalize(vectors: np.ndarray) -> np.ndarray:
+        if vectors.ndim == 1:
+            norm = np.linalg.norm(vectors)
+            return vectors / norm if norm > 0 else vectors
+        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+        return np.divide(vectors, norms, out=np.zeros_like(vectors), where=norms != 0)

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/explainer.py RENAMED Viewed

@@ -7,9 +7,12 @@ outperforms another. Falls back gracefully if no API key is provided.
 import os
 import json
+from dotenv import load_dotenv
+# Load environment variables from .env file if available
+load_dotenv()
-def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-pro") -> str:
+def explain_results(results_a: dict, results_b: dict, model: str = "gemini-2.5-flash-lite") -> str:
     """
     Generate a natural-language explanation comparing two RAG experiment results.
     Priority:
@@ -26,8 +29,7 @@ def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-p
     """
     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-    google_key = os.getenv("GEMINI_API_KEY")
+    google_key = os.getenv("GOOGLE_API_KEY")  # fixed var name
     # 1️⃣ Try Anthropic Claude first
     if anthropic_key:

ragmint-0.3.0/src/ragmint/tests/test_embeddings.py ADDED Viewed

@@ -0,0 +1,46 @@
+import numpy as np
+import pytest
+from ragmint.core.embeddings import Embeddings
+def test_dummy_backend_output_shape():
+    model = Embeddings(backend="dummy")
+    texts = ["hello", "world"]
+    embeddings = model.encode(texts)
+    # Expect 2x768 array
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.shape == (2, 768)
+    assert embeddings.dtype == np.float32
+def test_dummy_backend_single_string():
+    model = Embeddings(backend="dummy")
+    text = "test"
+    embeddings = model.encode(text)
+    assert embeddings.shape == (1, 768)
+    assert isinstance(embeddings, np.ndarray)
+'''@pytest.mark.skipif(
+    not hasattr(__import__('importlib').util.find_spec("sentence_transformers"), "loader"),
+    reason="sentence-transformers not installed"
+)
+def test_huggingface_backend_output_shape():
+    model = Embeddings(backend="huggingface", model_name="all-MiniLM-L6-v2")
+    texts = ["This is a test.", "Another sentence."]
+    embeddings = model.encode(texts)
+    # Expect 2x384 for MiniLM-L6-v2
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.ndim == 2
+    assert embeddings.shape[0] == len(texts)
+    assert embeddings.dtype == np.float32
+'''
+def test_invalid_backend():
+    try:
+        Embeddings(backend="unknown")
+    except ValueError as e:
+        assert "Unsupported embedding backend" in str(e)

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/tests/test_explainer_integration.py RENAMED Viewed

@@ -7,7 +7,7 @@ from ragmint.explainer import explain_results
 def test_real_gemini_explanation():
     """Run real Gemini call if GOOGLE_API_KEY is set."""
     if not os.getenv("GEMINI_API_KEY"):
-        pytest.skip("GOOGLE_API_KEY not set")
+        pytest.skip("GEMINI_API_KEY not set")
     config_a = {"retriever": "FAISS", "embedding_model": "OpenAI"}
     config_b = {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/tests/test_pipeline.py RENAMED Viewed

@@ -1,14 +1,15 @@
 import numpy as np
 from ragmint.core.pipeline import RAGPipeline
 from ragmint.core.retriever import Retriever
+from ragmint.core.embeddings import Embeddings
 from ragmint.core.reranker import Reranker
 from ragmint.core.evaluation import Evaluator
 def test_pipeline_run():
     docs = ["doc1 text", "doc2 text"]
-    embeddings = [np.random.rand(4) for _ in range(2)]
-    retriever = Retriever(embeddings, docs)
+    embedder = Embeddings(backend="dummy")
+    retriever = Retriever(embedder=embedder, documents=docs)
     reranker = Reranker("mmr")
     evaluator = Evaluator()
     pipeline = RAGPipeline(retriever, reranker, evaluator)

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/tests/test_retriever.py RENAMED Viewed

@@ -1,11 +1,12 @@
 import numpy as np
 from ragmint.core.retriever import Retriever
+from ragmint.core.embeddings import Embeddings
 def test_retrieve_basic():
-    embeddings = [np.random.rand(5) for _ in range(3)]
     docs = ["doc A", "doc B", "doc C"]
-    retriever = Retriever(embeddings, docs)
+    embedder = Embeddings(backend="dummy")
+    retriever = Retriever(embedder=embedder, documents=docs)
     results = retriever.retrieve("sample query", top_k=2)
     assert isinstance(results, list)

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/tests/test_tuner.py RENAMED Viewed

@@ -46,7 +46,7 @@ def test_optimize_ragmint(tmp_path, validation_mode, monkeypatch):
     rag = RAGMint(
         docs_path=docs_path,
         retrievers=["faiss"],
-        embeddings=["text-embedding-3-small"],
+        embeddings=["all-MiniLM-L6-v2"],
         rerankers=["mmr"]
     )

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint/tuner.py RENAMED Viewed

@@ -1,16 +1,15 @@
 import os
 import json
 import logging
-from typing import Any, Dict, List, Tuple, Optional
+from typing import Any, Dict, List, Tuple
 from time import perf_counter
 from .core.pipeline import RAGPipeline
-from .core.embeddings import EmbeddingModel
+from .core.embeddings import Embeddings
 from .core.retriever import Retriever
 from .core.reranker import Reranker
 from .core.evaluation import Evaluator
 from .optimization.search import GridSearch, RandomSearch, BayesianSearch
 from .utils.data_loader import load_validation_set
 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
@@ -19,6 +18,7 @@ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
 class RAGMint:
     """
     Main RAG pipeline optimizer and evaluator.
+    Runs combinations of retrievers, embeddings, and rerankers to find the best setup.
     """
     def __init__(
@@ -36,53 +36,91 @@ class RAGMint:
         self.documents: List[str] = self._load_docs()
         self.embeddings_cache: Dict[str, Any] = {}
+    # -------------------------
+    # Document Loading
+    # -------------------------
     def _load_docs(self) -> List[str]:
         if not os.path.exists(self.docs_path):
             logging.warning(f"Corpus path not found: {self.docs_path}")
             return []
         docs = []
         for file in os.listdir(self.docs_path):
-            if file.endswith(".txt") or file.endswith(".md") or file.endswith(".rst"):
+            if file.endswith((".txt", ".md", ".rst")):
                 with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
                     docs.append(f.read())
-        logging.info(f"Loaded {len(docs)} documents from {self.docs_path}")
+        logging.info(f"📚 Loaded {len(docs)} documents from {self.docs_path}")
         return docs
-    def _embed_docs(self, model_name: str):
+    # -------------------------
+    # Embedding Cache
+    # -------------------------
+    def _embed_docs(self, model_name: str) -> Any:
+        """Compute and cache document embeddings."""
         if model_name in self.embeddings_cache:
             return self.embeddings_cache[model_name]
-        model = EmbeddingModel(model_name)
+        model = Embeddings(backend="huggingface", model_name=model_name)
         embeddings = model.encode(self.documents)
         self.embeddings_cache[model_name] = embeddings
         return embeddings
+    # -------------------------
+    # Build Pipeline
+    # -------------------------
     def _build_pipeline(self, config: Dict[str, str]) -> RAGPipeline:
-        emb_model = EmbeddingModel(config["embedding_model"])
-        embeddings = self._embed_docs(config["embedding_model"])
-        retriever = Retriever(embeddings, self.documents)
-        reranker = Reranker(config["reranker"])
+        """Builds a pipeline from one configuration."""
+        retriever_backend = config["retriever"]
+        model_name = config["embedding_model"]
+        reranker_name = config["reranker"]
+        # Load embeddings (cached)
+        embeddings = self._embed_docs(model_name)
+        embedder = Embeddings(backend="huggingface", model_name=model_name)
+        # Initialize retriever with backend
+        logging.info(f"⚙️ Initializing retriever backend: {retriever_backend}")
+        retriever = Retriever(
+            embedder=embedder,
+            documents=self.documents,
+            embeddings=embeddings,
+            backend=retriever_backend,
+        )
+        reranker = Reranker(reranker_name)
         evaluator = Evaluator()
         return RAGPipeline(retriever, reranker, evaluator)
+    # -------------------------
+    # Evaluate Configuration
+    # -------------------------
     def _evaluate_config(
         self, config: Dict[str, Any], validation: List[Dict[str, str]], metric: str
     ) -> Dict[str, float]:
+        """Evaluates a single configuration."""
         pipeline = self._build_pipeline(config)
         scores = []
         start = perf_counter()
         for sample in validation:
-            query = sample.get("question") or sample.get("query")
-            reference = sample.get("answer")
+            query = sample.get("question") or sample.get("query") or ""
             result = pipeline.run(query)
             score = result["metrics"].get(metric, 0.0)
             scores.append(score)
-        elapsed = perf_counter() - start
+        elapsed = perf_counter() - start
         avg_score = sum(scores) / len(scores) if scores else 0.0
-        return {metric: avg_score, "latency": elapsed / max(1, len(validation))}
+        return {
+            metric: avg_score,
+            "latency": elapsed / max(1, len(validation)),
+        }
+    # -------------------------
+    # Optimize
+    # -------------------------
     def optimize(
         self,
         validation_set: str,
@@ -90,6 +128,7 @@ class RAGMint:
         search_type: str = "random",
         trials: int = 10,
     ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """Run optimization search over retrievers/embeddings/rerankers."""
         validation = load_validation_set(validation_set or "default")
         search_space = {
@@ -98,8 +137,9 @@ class RAGMint:
             "reranker": self.rerankers,
         }
-        logging.info(f"Starting {search_type} optimization with {trials} trials")
+        logging.info(f"🚀 Starting {search_type} optimization with {trials} trials")
+        # Select search strategy
         try:
             if search_type == "grid":
                 searcher = GridSearch(search_space)
@@ -108,16 +148,18 @@ class RAGMint:
             else:
                 searcher = RandomSearch(search_space, n_trials=trials)
         except Exception as e:
-            logging.warning(f"Falling back to RandomSearch due to missing deps: {e}")
+            logging.warning(f"⚠️ Fallback to RandomSearch due to missing deps: {e}")
             searcher = RandomSearch(search_space, n_trials=trials)
+        # Run trials
         results = []
         for config in searcher:
             metrics = self._evaluate_config(config, validation, metric)
             result = {**config, **metrics}
             results.append(result)
-            logging.info(f"Tested config: {config} -> {metrics}")
+            logging.info(f"🔹 Tested config: {config} -> {metrics}")
         best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
-        logging.info(f"✅ Best configuration found: {best}")
+        logging.info(f"🏆 Best configuration: {best}")
         return best, results

{ragmint-0.2.3 → ragmint-0.3.0/src/ragmint.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragmint
-Version: 0.2.3
+Version: 0.3.0
 Summary: A modular framework for evaluating and optimizing RAG pipelines.
 Author-email: Andre Oliveira <oandreoliveira@outlook.com>
 License: Apache License 2.0
@@ -11,7 +11,7 @@ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy>=1.23
+Requires-Dist: numpy<2.0.0
 Requires-Dist: pandas>=2.0
 Requires-Dist: scikit-learn>=1.3
 Requires-Dist: openai>=1.0
@@ -24,6 +24,8 @@ Requires-Dist: pytest
 Requires-Dist: colorama
 Requires-Dist: google-generativeai>=0.8.0
 Requires-Dist: supabase>=2.4.0
+Requires-Dist: python-dotenv
+Requires-Dist: sentence-transformers
 Dynamic: license-file
 # Ragmint
@@ -49,8 +51,8 @@ It provides a complete toolkit for **retriever selection**, **embedding model tu
 - 🧠 **Explainability Layer** — interprets RAG performance via Gemini or Claude APIs
 - 🏆 **Leaderboard Tracking** — stores and ranks experiment runs via JSON or external DB
 - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
-- ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
-- 🧩 **Embeddings** — OpenAI, HuggingFace
+- ⚙️ **Retrievers** — FAISS, Chroma, scikit-learn
+- 🧩 **Embeddings** — Hugging Face
 - 💾 **Caching, experiment tracking, and reproducibility** out of the box
 - 🧰 **Clean modular structure** for easy integration in research and production setups
@@ -94,15 +96,69 @@ optimization:
 ### 3️⃣ Manual Pipeline Usage
 ```python
-from ragmint.core.pipeline import RAGPipeline
+from ragmint.tuner import RAGMint
+# Initialize RAGMint with available components
+rag = RAGMint(
+    docs_path="data/docs/",
+    retrievers=["faiss", "chroma", "sklearn"],
+    embeddings=["all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"],
+    rerankers=["mmr"]
+)
+# Run optimization over 3 trials using the default validation set
+best, results = rag.optimize(
+    validation_set=None,
+    metric="faithfulness",
+    trials=3
+)
+print("Best configuration:", best)
+```
+---
+# 🧩 Embeddings and Retrievers
+**Ragmint** supports a flexible set of embeddings and retrievers, allowing you to adapt easily to various **RAG architectures**.
+---
+## 🔤 Available Embeddings (Hugging Face / OpenAI)
+You can select from the following models:
+* `sentence-transformers/all-MiniLM-L6-v2` — **lightweight**, general-purpose
+* `sentence-transformers/all-mpnet-base-v2` — **higher accuracy**, slower
+* `BAAI/bge-base-en-v1.5` — **multilingual**, dense embeddings
+* `intfloat/multilingual-e5-base` — ideal for **multilingual corpora**
-pipeline = RAGPipeline({
-    "embedding_model": "text-embedding-3-small",
-    "retriever": "faiss",
-})
-result = pipeline.run("What is retrieval-augmented generation?")
-print(result)
+### Configuration Example
+Use the following format in your config file to specify the embedding model:
+```yaml
+embedding_model: sentence-transformers/all-MiniLM-L6-v2
+```
+---
+## 🔍 Available Retrievers
+**Ragmint** integrates multiple **retrieval backends** to suit different needs:
+| Retriever | Description |
+| :--- | :--- |
+| **FAISS** | Fast vector similarity search; efficient for dense embeddings |
+| **Chroma** | Persistent vector DB; works well for incremental updates |
+| **scikit-learn (NearestNeighbors)** | Lightweight, zero-dependency local retriever |
+### Configuration Example
+To specify the retriever in your configuration file, use the following format:
+```yaml
+retriever: faiss
 ```
 ---
@@ -174,8 +230,7 @@ lb.show_top(3)
 ## 🧠 Explainability with Gemini / Claude
-Compare two RAG configurations and receive natural language insights
-on **why** one performs better.
+Compare two RAG configurations and receive **natural language insights** on why one performs better.
 ```python
 from ragmint.explainer import explain_results
@@ -189,7 +244,7 @@ print(explanation)
 > Set your API keys in a `.env` file or via environment variables:
 > ```
-> export GOOGLE_API_KEY="your_gemini_key"
+> export GEMINI_API_KEY="your_gemini_key"
 > export ANTHROPIC_API_KEY="your_claude_key"
 > ```
@@ -240,16 +295,21 @@ Your `pyproject.toml` includes all required dependencies:
 name = "ragmint"
 version = "0.1.0"
 dependencies = [
-    "numpy",
-    "optuna",
-    "scikit-learn",
-    "faiss-cpu",
-    "chromadb",
-    "pytest",
-    "openai",
-    "tqdm",
-    "google-generativeai",
-    "google-genai",
+  "numpy<2.0.0",
+  "pandas>=2.0",
+  "scikit-learn>=1.3",
+  "openai>=1.0",
+  "tqdm",
+  "pyyaml",
+  "chromadb>=0.4",
+  "faiss-cpu; sys_platform != 'darwin'",
+  "optuna>=3.0",
+  "pytest",
+  "colorama",
+  "google-generativeai>=0.8.0",
+  "supabase>=2.4.0",
+  "python-dotenv",
+  "sentence-transformers"
 ]
 ```

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint.egg-info/SOURCES.txt RENAMED Viewed

@@ -27,6 +27,7 @@ src/ragmint/optimization/search.py
 src/ragmint/tests/__init__.py
 src/ragmint/tests/conftest.py
 src/ragmint/tests/test_autotuner.py
+src/ragmint/tests/test_embeddings.py
 src/ragmint/tests/test_explainer.py
 src/ragmint/tests/test_explainer_integration.py
 src/ragmint/tests/test_integration_autotuner_ragmint.py

{ragmint-0.2.3 → ragmint-0.3.0}/src/ragmint.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,4 @@
-numpy>=1.23
+numpy<2.0.0
 pandas>=2.0
 scikit-learn>=1.3
 openai>=1.0
@@ -10,6 +10,8 @@ pytest
 colorama
 google-generativeai>=0.8.0
 supabase>=2.4.0
+python-dotenv
+sentence-transformers
 [:sys_platform != "darwin"]
 faiss-cpu

ragmint-0.2.3/src/ragmint/core/embeddings.py DELETED Viewed

@@ -1,19 +0,0 @@
-import numpy as np
-class EmbeddingModel:
-    """
-    Wrapper for embedding backends (OpenAI, HuggingFace, etc.)
-    """
-    def __init__(self, backend: str = "dummy"):
-        self.backend = backend
-    def encode(self, texts):
-        if self.backend == "openai":
-            # Example placeholder — integrate with actual OpenAI API
-            return [np.random.rand(768) for _ in texts]
-        elif self.backend == "huggingface":
-            return [np.random.rand(768) for _ in texts]
-        else:
-            return [np.random.rand(768) for _ in texts]

ragmint-0.2.3/src/ragmint/core/retriever.py DELETED Viewed

@@ -1,33 +0,0 @@
-from typing import List, Dict, Any
-import numpy as np
-class Retriever:
-    """
-    Simple vector retriever using cosine similarity.
-    """
-    def __init__(self, embeddings: List[np.ndarray], documents: List[str]):
-        if len(embeddings) == 0:
-            self.embeddings = np.zeros((1, 768))
-        else:
-            self.embeddings = np.array(embeddings)
-        self.documents = documents or [""]
-    def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-        if self.embeddings.size == 0 or len(self.documents) == 0:
-            return [{"text": "", "score": 0.0}]
-        query_vec = self._embed(query)
-        scores = self._cosine_similarity(query_vec, self.embeddings)
-        top_indices = np.argsort(scores)[::-1][:min(top_k, len(scores))]
-        return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
-    def _embed(self, query: str) -> np.ndarray:
-        dim = self.embeddings.shape[1] if len(self.embeddings.shape) > 1 else 768
-        return np.random.rand(dim)
-    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
-        a_norm = a / np.linalg.norm(a)
-        b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
-        return np.dot(b_norm, a_norm)