ragmint-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


ragmint/__init__.py ADDED
File without changes
ragmint/__main__.py ADDED
@@ -0,0 +1,28 @@
+ from pathlib import Path
+ from ragmint.tuner import RAGMint
+
+ def main():
+     # Dynamically resolve the path to the installed ragmint package
+     base_dir = Path(__file__).resolve().parent
+
+     docs_path = base_dir / "experiments" / "corpus"
+     validation_file = base_dir / "experiments" / "validation_qa.json"
+
+     rag = RAGMint(
+         docs_path=str(docs_path),
+         retrievers=["faiss"],
+         embeddings=["openai/text-embedding-3-small"],
+         rerankers=["mmr"],
+     )
+
+     best, results = rag.optimize(
+         validation_set=str(validation_file),
+         metric="faithfulness",
+         search_type="bayesian",
+         trials=10,
+     )
+
+     print("Best config found:\n", best)
+
+ if __name__ == "__main__":
+     main()
ragmint/core/__init__.py ADDED
File without changes
ragmint/core/chunking.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List
+
+
+ class Chunker:
+     """
+     Handles text chunking and splitting strategies:
+     - Fixed size chunks
+     - Overlapping windows
+     """
+
+     def __init__(self, chunk_size: int = 500, overlap: int = 100):
+         self.chunk_size = chunk_size
+         self.overlap = overlap
+
+     def chunk_text(self, text: str) -> List[str]:
+         chunks = []
+         start = 0
+         while start < len(text):
+             end = start + self.chunk_size
+             chunks.append(text[start:end])
+             start += self.chunk_size - self.overlap  # assumes overlap < chunk_size, else start never advances
+         return chunks
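To illustrate the overlap arithmetic above, a small usage sketch (hypothetical snippet, not part of the package; it assumes the `ragmint/core/chunking.py` module path recorded in this wheel):

```python
from ragmint.core.chunking import Chunker

# 10-character windows; each window re-reads the last 3 characters of the previous one
chunker = Chunker(chunk_size=10, overlap=3)
print(chunker.chunk_text("abcdefghijklmnopqrstuvwxyz"))
# ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']
```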
ragmint/core/embeddings.py ADDED
@@ -0,0 +1,19 @@
+ import numpy as np
+
+
+ class EmbeddingModel:
+     """
+     Wrapper for embedding backends (OpenAI, HuggingFace, etc.)
+     """
+
+     def __init__(self, backend: str = "dummy"):
+         self.backend = backend
+
+     def encode(self, texts):
+         if self.backend == "openai":
+             # Example placeholder — integrate with actual OpenAI API
+             return [np.random.rand(768) for _ in texts]
+         elif self.backend == "huggingface":
+             return [np.random.rand(768) for _ in texts]  # placeholder, same dummy vectors
+         else:
+             return [np.random.rand(768) for _ in texts]  # default dummy backend
ragmint/core/evaluation.py ADDED
@@ -0,0 +1,27 @@
+ import time
+ from typing import Dict, Any
+ from difflib import SequenceMatcher
+
+
+ class Evaluator:
+     """
+     Simple evaluation of generated answers:
+     - Faithfulness (similarity between answer and context)
+     - Latency
+     """
+
+     def __init__(self):
+         pass
+
+     def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
+         start = time.time()
+         faithfulness = self._similarity(answer, context)
+         latency = time.time() - start
+
+         return {
+             "faithfulness": faithfulness,
+             "latency": latency,
+         }
+
+     def _similarity(self, a: str, b: str) -> float:
+         return SequenceMatcher(None, a, b).ratio()
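Since faithfulness here is just `SequenceMatcher.ratio()` between answer and context, a quick sketch of what `evaluate` returns (illustrative strings):

```python
from ragmint.core.evaluation import Evaluator

metrics = Evaluator().evaluate(
    query="What is RAG?",
    answer="RAG combines retrieval with generation.",
    context="RAG combines retrieval with generation. It grounds answers in documents.",
)
print(metrics)  # {"faithfulness": <ratio in [0, 1]>, "latency": <seconds spent on the ratio call>}
```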
ragmint/core/pipeline.py ADDED
@@ -0,0 +1,38 @@
+ from typing import Any, Dict
+ from .retriever import Retriever
+ from .reranker import Reranker
+ from .evaluation import Evaluator
+
+
+ class RAGPipeline:
+     """
+     Core Retrieval-Augmented Generation pipeline.
+     Simplified (no generator): it retrieves, reranks, and evaluates.
+     """
+
+     def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
+         self.retriever = retriever
+         self.reranker = reranker
+         self.evaluator = evaluator
+
+     def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
+         # Retrieve documents
+         retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
+         # Rerank
+         reranked_docs = self.reranker.rerank(query, retrieved_docs)
+
+         # Use top document as pseudo-answer
+         if reranked_docs:
+             answer = reranked_docs[0]["text"]
+         else:
+             answer = ""
+
+         context = "\n".join([d["text"] for d in reranked_docs])
+         metrics = self.evaluator.evaluate(query, answer, context)
+
+         return {
+             "query": query,
+             "answer": answer,
+             "docs": reranked_docs,
+             "metrics": metrics,
+         }
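A minimal wiring sketch for the three components, with dummy vectors (note that `Retriever._embed` is random in this release, so retrieval order is not meaningful yet):

```python
import numpy as np
from ragmint.core.pipeline import RAGPipeline
from ragmint.core.retriever import Retriever
from ragmint.core.reranker import Reranker
from ragmint.core.evaluation import Evaluator

docs = ["RAG joins retrieval and generation.", "FAISS is a vector index."]
embeddings = [np.random.rand(768) for _ in docs]  # stand-in document vectors

pipeline = RAGPipeline(Retriever(embeddings, docs), Reranker("mmr"), Evaluator())
result = pipeline.run("What is RAG?", top_k=2)
print(result["answer"], result["metrics"])
```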
ragmint/core/reranker.py ADDED
@@ -0,0 +1,62 @@
+ from typing import List, Dict, Any
+ import numpy as np
+
+
+ class Reranker:
+     """
+     Supports:
+     - MMR (Maximal Marginal Relevance)
+     - Dummy CrossEncoder (for demonstration)
+     """
+
+     def __init__(self, mode: str = "mmr", lambda_param: float = 0.5, seed: int = 42):
+         self.mode = mode
+         self.lambda_param = lambda_param
+         np.random.seed(seed)
+
+     def rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         if not docs:
+             return []
+
+         if self.mode == "crossencoder":
+             return self._crossencoder_rerank(query, docs)
+         return self._mmr_rerank(query, docs)
+
+     def _mmr_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """Perform MMR reranking using dummy similarity scores."""
+         selected = []
+         remaining = docs.copy()
+
+         while remaining and len(selected) < len(docs):
+             if not selected:
+                 # pick doc with highest base score
+                 best = max(remaining, key=lambda d: d["score"])
+             else:
+                 # MMR balancing between relevance and diversity
+                 mmr_scores = []
+                 for d in remaining:
+                     max_div = max(
+                         [self._similarity(d["text"], s["text"]) for s in selected],
+                         default=0,
+                     )
+                     mmr_score = (
+                         self.lambda_param * d["score"]
+                         - (1 - self.lambda_param) * max_div
+                     )
+                     mmr_scores.append(mmr_score)
+                 best = remaining[int(np.argmax(mmr_scores))]
+             selected.append(best)
+             remaining.remove(best)
+
+         return selected
+
+     def _crossencoder_rerank(self, query: str, docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """Adds a small random perturbation to simulate crossencoder reranking."""
+         for d in docs:
+             d["score"] += np.random.uniform(0, 0.1)
+         return sorted(docs, key=lambda d: d["score"], reverse=True)
+
+     def _similarity(self, a: str, b: str) -> float:
+         """Dummy similarity function between two strings."""
+         # Pseudo-similarity from hash(); stable within a run, but varies across runs unless PYTHONHASHSEED is fixed
+         return abs(hash(a + b)) % 100 / 100.0
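A short sketch of both modes on hand-made scored docs (texts and scores are illustrative). With `lambda_param=1.0` MMR reduces to plain score order; lower values penalize documents similar to ones already selected:

```python
from ragmint.core.reranker import Reranker

docs = [
    {"text": "vector databases", "score": 0.9},
    {"text": "vector databases and indexes", "score": 0.8},
    {"text": "espresso brewing", "score": 0.5},
]

mmr = Reranker(mode="mmr", lambda_param=0.7)
print([d["text"] for d in mmr.rerank("query", docs)])

# "crossencoder" mode perturbs the scores in place and re-sorts
ce = Reranker(mode="crossencoder")
print([d["text"] for d in ce.rerank("query", docs)])
```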
ragmint/core/retriever.py ADDED
@@ -0,0 +1,33 @@
+ from typing import List, Dict, Any
+ import numpy as np
+
+
+ class Retriever:
+     """
+     Simple vector retriever using cosine similarity.
+     """
+
+     def __init__(self, embeddings: List[np.ndarray], documents: List[str]):
+         if len(embeddings) == 0:
+             self.embeddings = np.zeros((1, 768))
+         else:
+             self.embeddings = np.array(embeddings)
+         self.documents = documents or [""]
+
+     def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
+         if self.embeddings.size == 0 or len(self.documents) == 0:
+             return [{"text": "", "score": 0.0}]
+
+         query_vec = self._embed(query)
+         scores = self._cosine_similarity(query_vec, self.embeddings)
+         top_indices = np.argsort(scores)[::-1][:min(top_k, len(scores))]
+         return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
+
+     def _embed(self, query: str) -> np.ndarray:
+         dim = self.embeddings.shape[1] if len(self.embeddings.shape) > 1 else 768
+         return np.random.rand(dim)  # placeholder query embedding (random vector)
+
+     def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
+         a_norm = a / np.linalg.norm(a)
+         b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
+         return np.dot(b_norm, a_norm)
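Usage sketch; each hit is a `{"text", "score"}` dict whose score is the cosine similarity against the (currently random) query vector:

```python
import numpy as np
from ragmint.core.retriever import Retriever

embeddings = [np.random.rand(8) for _ in range(3)]
retriever = Retriever(embeddings, ["doc A", "doc B", "doc C"])
print(retriever.retrieve("any query", top_k=2))
```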
ragmint/experiments/__init__.py ADDED
File without changes
ragmint/optimization/__init__.py ADDED
File without changes
ragmint/optimization/search.py ADDED
@@ -0,0 +1,48 @@
+ import itertools
+ import random
+ import logging
+ from typing import Dict, List, Iterator, Any
+
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
+
+
+ class GridSearch:
+     def __init__(self, search_space: Dict[str, List[Any]]):
+         keys = list(search_space.keys())
+         values = list(search_space.values())
+         self.combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
+
+     def __iter__(self) -> Iterator[Dict[str, Any]]:
+         for combo in self.combinations:
+             yield combo
+
+
+ class RandomSearch:
+     def __init__(self, search_space: Dict[str, List[Any]], n_trials: int = 10):
+         self.search_space = search_space
+         self.n_trials = n_trials
+
+     def __iter__(self) -> Iterator[Dict[str, Any]]:
+         keys = list(self.search_space.keys())
+         for _ in range(self.n_trials):
+             yield {k: random.choice(self.search_space[k]) for k in keys}
+
+
+ class BayesianSearch:
+     def __init__(self, search_space: Dict[str, List[Any]]):
+         try:
+             import optuna
+             self.optuna = optuna
+         except ImportError:
+             raise RuntimeError("Optuna not installed; use GridSearch or RandomSearch instead.")
+         self.search_space = search_space
+
+     def __iter__(self) -> Iterator[Dict[str, Any]]:
+         keys = list(self.search_space.keys())
+
+         def objective(trial):
+             return {k: trial.suggest_categorical(k, self.search_space[k]) for k in keys}
+
+         # Placeholder: yields 5 random configs; `objective` is unused until real Optuna optimization is wired in
+         for _ in range(5):
+             yield {k: random.choice(self.search_space[k]) for k in keys}
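A sketch of the two dependency-free strategies: `GridSearch` enumerates the full cross-product, while `RandomSearch` samples independently per trial (so combinations can repeat):

```python
from ragmint.optimization.search import GridSearch, RandomSearch

space = {"retriever": ["faiss", "bm25"], "reranker": ["mmr", "crossencoder"]}

print(list(GridSearch(space)))  # all 4 combinations
for cfg in RandomSearch(space, n_trials=3):
    print(cfg)                  # 3 randomly drawn combinations
```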
ragmint/tests/__init__.py ADDED
File without changes
ragmint/tests/test_pipeline.py ADDED
@@ -0,0 +1,19 @@
+ import numpy as np
+ from ragmint.core.pipeline import RAGPipeline
+ from ragmint.core.retriever import Retriever
+ from ragmint.core.reranker import Reranker
+ from ragmint.core.evaluation import Evaluator
+
+
+ def test_pipeline_run():
+     docs = ["doc1 text", "doc2 text"]
+     embeddings = [np.random.rand(4) for _ in range(2)]
+     retriever = Retriever(embeddings, docs)
+     reranker = Reranker("mmr")
+     evaluator = Evaluator()
+     pipeline = RAGPipeline(retriever, reranker, evaluator)
+
+     result = pipeline.run("what is doc1?")
+     assert "query" in result
+     assert "answer" in result
+     assert "metrics" in result
ragmint/tests/test_retriever.py ADDED
@@ -0,0 +1,14 @@
+ import numpy as np
+ from ragmint.core.retriever import Retriever
+
+
+ def test_retrieve_basic():
+     embeddings = [np.random.rand(5) for _ in range(3)]
+     docs = ["doc A", "doc B", "doc C"]
+     retriever = Retriever(embeddings, docs)
+
+     results = retriever.retrieve("sample query", top_k=2)
+     assert isinstance(results, list)
+     assert len(results) == 2
+     assert "text" in results[0]
+     assert "score" in results[0]
ragmint/tests/test_search.py ADDED
@@ -0,0 +1,17 @@
+ from ragmint.optimization.search import GridSearch, RandomSearch
+
+
+ def test_grid_search_iterates():
+     space = {"retriever": ["faiss"], "embedding_model": ["openai"], "reranker": ["mmr"]}
+     search = GridSearch(space)
+     combos = list(search)
+     assert len(combos) == 1
+     assert "retriever" in combos[0]
+
+
+ def test_random_search_n_trials():
+     space = {"retriever": ["faiss", "bm25"], "embedding_model": ["openai", "st"], "reranker": ["mmr"]}
+     search = RandomSearch(space, n_trials=5)
+     combos = list(search)
+     assert len(combos) == 5
+     assert all("retriever" in c for c in combos)
ragmint/tests/test_tuner.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ import json
+ from ragmint.tuner import RAGMint
+
+
+ def setup_validation_file(tmp_path):
+     data = [
+         {"question": "What is AI?", "answer": "Artificial Intelligence"},
+         {"question": "Define ML", "answer": "Machine Learning"}
+     ]
+     file = tmp_path / "validation_qa.json"
+     with open(file, "w", encoding="utf-8") as f:
+         json.dump(data, f)
+     return str(file)
+
+
+ def setup_docs(tmp_path):
+     corpus = tmp_path / "corpus"
+     corpus.mkdir()
+     (corpus / "doc1.txt").write_text("This is about Artificial Intelligence.")
+     (corpus / "doc2.txt").write_text("This text explains Machine Learning.")
+     return str(corpus)
+
+
+ def test_optimize_random(tmp_path):
+     docs_path = setup_docs(tmp_path)
+     val_file = setup_validation_file(tmp_path)
+
+     rag = RAGMint(
+         docs_path=docs_path,
+         retrievers=["faiss"],
+         embeddings=["openai/text-embedding-3-small"],
+         rerankers=["mmr"]
+     )
+
+     best, results = rag.optimize(validation_set=val_file, metric="faithfulness", trials=2)
+     assert isinstance(best, dict)
+     assert isinstance(results, list)
ragmint/tuner.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import json
+ import logging
+ from typing import Any, Dict, List, Tuple, Optional
+ from time import perf_counter
+
+ from .core.pipeline import RAGPipeline
+ from .core.embeddings import EmbeddingModel
+ from .core.retriever import Retriever
+ from .core.reranker import Reranker
+ from .core.evaluation import Evaluator
+ from .optimization.search import GridSearch, RandomSearch, BayesianSearch
+
+ from .utils.data_loader import load_validation_set
+
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
+
+
+ class RAGMint:
+     """
+     Main RAG pipeline optimizer and evaluator.
+     """
+
+     def __init__(
+         self,
+         docs_path: str,
+         retrievers: List[str],
+         embeddings: List[str],
+         rerankers: List[str],
+     ):
+         self.docs_path = docs_path
+         self.retrievers = retrievers
+         self.embeddings = embeddings
+         self.rerankers = rerankers
+
+         self.documents: List[str] = self._load_docs()
+         self.embeddings_cache: Dict[str, Any] = {}
+
+     def _load_docs(self) -> List[str]:
+         if not os.path.exists(self.docs_path):
+             logging.warning(f"Corpus path not found: {self.docs_path}")
+             return []
+         docs = []
+         for file in os.listdir(self.docs_path):
+             if file.endswith(".txt") or file.endswith(".md") or file.endswith(".rst"):
+                 with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
+                     docs.append(f.read())
+         logging.info(f"Loaded {len(docs)} documents from {self.docs_path}")
+         return docs
+
+     def _embed_docs(self, model_name: str):
+         if model_name in self.embeddings_cache:
+             return self.embeddings_cache[model_name]
+
+         model = EmbeddingModel(model_name)
+         embeddings = model.encode(self.documents)
+         self.embeddings_cache[model_name] = embeddings
+         return embeddings
+
+     def _build_pipeline(self, config: Dict[str, str]) -> RAGPipeline:
+         emb_model = EmbeddingModel(config["embedding_model"])  # unused here; _embed_docs constructs its own model
+         embeddings = self._embed_docs(config["embedding_model"])
+         retriever = Retriever(embeddings, self.documents)
+         reranker = Reranker(config["reranker"])
+         evaluator = Evaluator()
+         return RAGPipeline(retriever, reranker, evaluator)
+
+     def _evaluate_config(
+         self, config: Dict[str, Any], validation: List[Dict[str, str]], metric: str
+     ) -> Dict[str, float]:
+         pipeline = self._build_pipeline(config)
+
+         scores = []
+         start = perf_counter()
+         for sample in validation:
+             query = sample.get("question") or sample.get("query")
+             reference = sample.get("answer")  # currently unused; reserved for reference-based metrics
+             result = pipeline.run(query)
+             score = result["metrics"].get(metric, 0.0)
+             scores.append(score)
+         elapsed = perf_counter() - start
+
+         avg_score = sum(scores) / len(scores) if scores else 0.0
+         return {metric: avg_score, "latency": elapsed / max(1, len(validation))}
+
+     def optimize(
+         self,
+         validation_set: str,
+         metric: str = "faithfulness",
+         search_type: str = "random",
+         trials: int = 10,
+     ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+         validation = load_validation_set(validation_set)
+
+         search_space = {
+             "retriever": self.retrievers,
+             "embedding_model": self.embeddings,
+             "reranker": self.rerankers,
+         }
+
+         logging.info(f"Starting {search_type} optimization with {trials} trials")
+
+         try:
+             if search_type == "grid":
+                 searcher = GridSearch(search_space)
+             elif search_type == "bayesian":
+                 searcher = BayesianSearch(search_space)
+             else:
+                 searcher = RandomSearch(search_space, n_trials=trials)
+         except Exception as e:
+             logging.warning(f"Falling back to RandomSearch due to missing deps: {e}")
+             searcher = RandomSearch(search_space, n_trials=trials)
+
+         results = []
+         for config in searcher:
+             metrics = self._evaluate_config(config, validation, metric)
+             result = {**config, **metrics}
+             results.append(result)
+             logging.info(f"Tested config: {config} -> {metrics}")
+
+         best = max(results, key=lambda r: r.get(metric, 0.0)) if results else {}
+         logging.info(f"✅ Best configuration found: {best}")
+         return best, results
ragmint/utils/__init__.py ADDED
File without changes
ragmint/utils/caching.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import json
+ import hashlib
+ import pickle
+ from typing import Any
+
+
+ class Cache:
+     """
+     Simple file-based cache for embeddings or retrievals.
+     """
+
+     def __init__(self, cache_dir: str = ".ragmint_cache"):
+         self.cache_dir = cache_dir
+         os.makedirs(cache_dir, exist_ok=True)
+
+     def _hash_key(self, key: str) -> str:
+         return hashlib.md5(key.encode()).hexdigest()
+
+     def exists(self, key: str) -> bool:
+         return os.path.exists(os.path.join(self.cache_dir, self._hash_key(key)))
+
+     def get(self, key: str) -> Any:
+         path = os.path.join(self.cache_dir, self._hash_key(key))
+         if not os.path.exists(path):
+             return None
+         with open(path, "rb") as f:
+             return pickle.load(f)
+
+     def set(self, key: str, value: Any):
+         path = os.path.join(self.cache_dir, self._hash_key(key))
+         with open(path, "wb") as f:
+             pickle.dump(value, f)
+
+     def clear(self):
+         for file in os.listdir(self.cache_dir):
+             os.remove(os.path.join(self.cache_dir, file))
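A minimal round-trip with the cache (values go through `pickle`, so only load cache directories you trust):

```python
from ragmint.utils.caching import Cache

cache = Cache(".ragmint_cache")
if not cache.exists("emb:model-x"):
    cache.set("emb:model-x", [0.1, 0.2, 0.3])  # e.g., an embedding vector
print(cache.get("emb:model-x"))  # [0.1, 0.2, 0.3]
cache.clear()
```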
ragmint/utils/data_loader.py ADDED
@@ -0,0 +1,35 @@
+ import json
+ import csv
+ from typing import List, Dict
+ from pathlib import Path
+
+
+ def load_json(path: str) -> List[Dict]:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def load_csv(path: str) -> List[Dict]:
+     with open(path, newline="", encoding="utf-8") as csvfile:
+         reader = csv.DictReader(csvfile)
+         return list(reader)
+
+
+ def save_json(path: str, data: Dict):
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(data, f, ensure_ascii=False, indent=2)
+
+ def load_validation_set(path: str) -> List[Dict]:
+     """
+     Loads a validation dataset (QA pairs) from JSON or CSV.
+     """
+     p = Path(path)
+     if not p.exists():
+         raise FileNotFoundError(f"Validation file not found: {path}")
+
+     if p.suffix.lower() == ".json":
+         return load_json(path)
+     elif p.suffix.lower() in [".csv", ".tsv"]:
+         return load_csv(path)  # note: .tsv files are still parsed with the default comma delimiter
+     else:
+         raise ValueError("Unsupported validation set format. Use JSON or CSV.")
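The validation format `load_validation_set` expects is a JSON (or CSV) list of QA records with the `question`/`answer` keys that `RAGMint._evaluate_config` reads; a sketch with a hypothetical file name:

```python
import json
from ragmint.utils.data_loader import load_validation_set

with open("validation_qa.json", "w", encoding="utf-8") as f:
    json.dump([{"question": "What is AI?", "answer": "Artificial Intelligence"}], f)

print(load_validation_set("validation_qa.json"))
```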
ragmint/utils/logger.py ADDED
@@ -0,0 +1,36 @@
+ import logging
+ from tqdm import tqdm
+
+
+ class Logger:
+     """
+     Centralized logger with optional tqdm integration and color formatting.
+     """
+
+     def __init__(self, name: str = "ragmint", level: int = logging.INFO):
+         self.logger = logging.getLogger(name)
+         self.logger.setLevel(level)
+
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             formatter = logging.Formatter(
+                 "\033[96m[%(asctime)s]\033[0m \033[93m%(levelname)s\033[0m: %(message)s",
+                 "%H:%M:%S",
+             )
+             handler.setFormatter(formatter)
+             self.logger.addHandler(handler)
+
+     def info(self, msg: str):
+         self.logger.info(msg)
+
+     def warning(self, msg: str):
+         self.logger.warning(msg)
+
+     def error(self, msg: str):
+         self.logger.error(msg)
+
+     def progress(self, iterable, desc="Processing", total=None):
+         return tqdm(iterable, desc=desc, total=total)
+
+ def get_logger(name: str = "ragmint") -> Logger:
+     return Logger(name)
ragmint/utils/metrics.py ADDED
@@ -0,0 +1,27 @@
+ from typing import List
+ import numpy as np
+ from difflib import SequenceMatcher
+
+
+ def bleu_score(reference: str, candidate: str) -> float:
+     """
+     Simple BLEU-like precision approximation.
+     """
+     ref_tokens = reference.split()
+     cand_tokens = candidate.split()
+     if not cand_tokens:
+         return 0.0
+
+     matches = sum(1 for token in cand_tokens if token in ref_tokens)
+     return matches / len(cand_tokens)
+
+
+ def rouge_l(reference: str, candidate: str) -> float:
+     """
+     Approximation of ROUGE-L using sequence matcher ratio.
+     """
+     return SequenceMatcher(None, reference, candidate).ratio()
+
+
+ def mean_score(scores: List[float]) -> float:
+     return float(np.mean(scores)) if scores else 0.0
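A worked example of the two approximations (the BLEU stand-in is unigram precision: matched candidate tokens over candidate length):

```python
from ragmint.utils.metrics import bleu_score, rouge_l, mean_score

ref = "the cat sat on the mat"
cand = "the cat sat here"
# 3 of the 4 candidate tokens ("the", "cat", "sat") occur in the reference -> 0.75
print(bleu_score(ref, cand))    # 0.75
print(rouge_l(ref, cand))       # SequenceMatcher ratio, roughly 0.74 here
print(mean_score([0.75, 0.5]))  # 0.625
```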
ragmint-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,218 @@
+ Metadata-Version: 2.4
+ Name: ragmint
+ Version: 0.1.0
+ Summary: A modular framework for evaluating and optimizing RAG pipelines.
+ Author-email: Andre Oliveira <oandreoliveira@outlook.com>
+ License: Apache License 2.0
+ Project-URL: Homepage, https://github.com/andyolivers/ragmint
+ Project-URL: Documentation, https://andyolivers.com
+ Project-URL: Issues, https://github.com/andyolivers/ragmint/issues
+ Keywords: RAG,LLM,retrieval,optimization,AI,evaluation
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy>=1.23
+ Requires-Dist: pandas>=2.0
+ Requires-Dist: scikit-learn>=1.3
+ Requires-Dist: openai>=1.0
+ Requires-Dist: tqdm
+ Requires-Dist: pyyaml
+ Requires-Dist: chromadb>=0.4
+ Requires-Dist: faiss-cpu; sys_platform != "darwin"
+ Requires-Dist: optuna>=3.0
+ Requires-Dist: pytest
+ Requires-Dist: colorama
+ Dynamic: license-file
+
+ # Ragmint
+
+ ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
+ ![License](https://img.shields.io/badge/license-Apache%202.0-green)
+ ![Tests](https://github.com/andyolivers/ragmint/actions/workflows/tests.yml/badge.svg)
+ ![Optuna](https://img.shields.io/badge/Optuna-Integrated-orange)
+ ![Status](https://img.shields.io/badge/Status-Active-success)
+
+ ![](/assets/images/ragmint-banner.png)
+
+ **Ragmint** (Retrieval-Augmented Generation Model Inspection & Tuning) is a modular, developer-friendly Python library for **evaluating, optimizing, and tuning RAG (Retrieval-Augmented Generation) pipelines**.
+
+ It provides a complete toolkit for **retriever selection**, **embedding model tuning**, and **automated RAG evaluation** with support for **Optuna-based Bayesian optimization**.
+
+ ---
+
+ ## ✨ Features
+
+ - ✅ **Automated hyperparameter optimization** (Grid, Random, Bayesian via Optuna)
+ - 🔍 **Built-in RAG evaluation metrics** — faithfulness, recall, BLEU, ROUGE, latency
+ - ⚙️ **Retrievers** — FAISS, Chroma, ElasticSearch
+ - 🧩 **Embeddings** — OpenAI, HuggingFace
+ - 🧠 **Rerankers** — MMR, CrossEncoder (extensible via plugin interface)
+ - 💾 **Caching, experiment tracking, and reproducibility** out of the box
+ - 🧰 **Clean modular structure** for easy integration in research and production setups
+
+ ---
+
+ ## 🚀 Quick Start
+
+ ### 1️⃣ Installation
+
+ ```bash
+ git clone https://github.com/andyolivers/ragmint.git
+ cd ragmint
+ pip install -e .
+ ```
+
+ > The `-e` flag installs Ragmint in editable (development) mode.
+ > Requires **Python ≥ 3.9**.
+
+ ---
+
+ ### 2️⃣ Run a RAG Optimization Experiment
+
+ ```bash
+ python ragmint/main.py --config configs/default.yaml --search bayesian
+ ```
+
+ Example `configs/default.yaml`:
+ ```yaml
+ retriever: faiss
+ embedding_model: text-embedding-3-small
+ reranker:
+   mode: mmr
+   lambda_param: 0.5
+ optimization:
+   search_method: bayesian
+   n_trials: 20
+ ```
+
+ ---
+
+ ### 3️⃣ Manual Pipeline Usage
+
+ ```python
+ from ragmint.core.pipeline import RAGPipeline
+ from ragmint.core.retriever import Retriever
+ from ragmint.core.reranker import Reranker
+ from ragmint.core.evaluation import Evaluator
+
+ # `embeddings` and `docs` are your corpus vectors and texts (see RAGMint for an end-to-end run)
+ pipeline = RAGPipeline(Retriever(embeddings, docs), Reranker("mmr"), Evaluator())
+ result = pipeline.run("What is retrieval-augmented generation?")
+ print(result)
+ ```
+
+ ---
+
+ ## 🧩 Folder Structure
+
+ ```
+ ragmint/
+ ├── core/
+ │   ├── pipeline.py        # RAGPipeline implementation
+ │   ├── retriever.py       # Retriever logic (FAISS, Chroma)
+ │   ├── reranker.py        # MMR + CrossEncoder rerankers
+ │   └── embeddings.py      # Embedding backends
+ ├── tuner.py               # RAGMint optimizer (search strategies live in optimization/search.py)
+ ├── utils/                 # Metrics, logging, caching helpers
+ ├── configs/               # Default experiment configs
+ ├── experiments/           # Saved experiment results
+ ├── tests/                 # Unit tests for all components
+ ├── main.py                # CLI entrypoint for tuning
+ └── pyproject.toml         # Project dependencies & build metadata
+ ```
+
+ ---
+
+ ## 🧪 Running Tests
+
+ To verify your setup:
+
+ ```bash
+ pytest -v
+ ```
+
+ Or to test a specific component (e.g., reranker):
+
+ ```bash
+ pytest tests/test_reranker.py -v
+ ```
+
+ All tests are designed for **Pytest** and run with lightweight mock data.
+
+ ---
+
+ ## ⚙️ Configuration via `pyproject.toml`
+
+ The project's `pyproject.toml` declares the core dependencies:
+
+ ```toml
+ [project]
+ name = "ragmint"
+ version = "0.1.0"
+ dependencies = [
+     "numpy",
+     "optuna",
+     "scikit-learn",
+     "faiss-cpu",
+     "chromadb",
+     "pytest",
+     "openai",
+     "tqdm",
+ ]
+ ```
+
+ ---
+
+ ## 📊 Example Experiment Workflow
+
+ 1. Define your retriever and reranker configuration in YAML
+ 2. Launch an optimization search (Grid, Random, or Bayesian)
+ 3. Ragmint evaluates combinations automatically and reports top results
+ 4. Export best parameters for production pipelines (see the sketch below)
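+
+ A minimal end-to-end sketch of this workflow, assuming the `RAGMint` API shipped in this package (paths are hypothetical, and the YAML step is collapsed into keyword arguments here):
+
+ ```python
+ from ragmint.tuner import RAGMint
+
+ rag = RAGMint(
+     docs_path="experiments/corpus",
+     retrievers=["faiss"],
+     embeddings=["openai/text-embedding-3-small"],
+     rerankers=["mmr"],
+ )
+ best, results = rag.optimize(
+     validation_set="validation_qa.json",
+     metric="faithfulness",
+     search_type="bayesian",
+     trials=10,
+ )
+ print(best)  # export these parameters to your production pipeline
+ ```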
+
+ ---
+
+ ## 🧬 Architecture Overview
+
+ ```mermaid
+ flowchart TD
+     A[Query] --> B[Embedder]
+     B --> C[Retriever]
+     C --> D[Reranker]
+     D --> E[Generator]
+     E --> F[Evaluation]
+     F --> G[Optuna Tuner]
+     G -->|Best Params| B
+ ```
+
+ ---
+
+ ## 📘 Example Output
+
+ ```
+ [INFO] Starting Bayesian optimization with Optuna
+ [INFO] Trial 7 finished: recall=0.83, latency=0.42s
+ [INFO] Best parameters: {'lambda_param': 0.6, 'retriever': 'faiss'}
+ ```
+
+ ---
+
+ ## 🧠 Why Ragmint?
+
+ - Built for **RAG researchers**, **AI engineers**, and **LLM ops**
+ - Works with **LangChain**, **LlamaIndex**, or standalone RAG setups
+ - Designed for **extensibility** — plug in your own models, retrievers, or metrics
+
+ ---
+
+ ## ⚖️ License
+
+ Licensed under the **Apache License 2.0** — free for personal, research, and commercial use.
+
+ ---
+
+ ## 👤 Author
+
+ **André Oliveira**
+ [andyolivers.com](https://andyolivers.com)
+ Data Scientist | AI Engineer
ragmint-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,28 @@
+ ragmint/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/__main__.py,sha256=q7hBn56Z1xAckbs03i8ynsuOzJVUXmod2qHddX7gkpc,729
+ ragmint/tuner.py,sha256=sCUb-qGqk-lz4nUJboomwXFt3us7mYf3oJhwWV9Kzo4,4429
+ ragmint/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/core/chunking.py,sha256=Dy9RYyapGSS6ik6Vg9lqbUPCFqSraU1JKpHbYUTkaFo,576
+ ragmint/core/embeddings.py,sha256=6wJjfZ5ukr8G5bJJ1evjIqj0_FMbs_gq4xC-sBBqNlA,566
+ ragmint/core/evaluation.py,sha256=LcR9AIsL9OyoENrUVSu0hhKzAItcBvEOy33V4i-0DtI,682
+ ragmint/core/pipeline.py,sha256=2qwGKuG0Du7gtIpieLFn71h_RcwBpjcV-h9PQz2ZOsc,1169
+ ragmint/core/reranker.py,sha256=B2-NDExqpd9jdXHkEHOXC0B_6-FMJm5vdi-_ZbxC3Os,2303
+ ragmint/core/retriever.py,sha256=jbpKy_fGdDq736y0es_utQuLqY9eiWNd71Q8JbU0Sko,1259
+ ragmint/experiments/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/optimization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/optimization/search.py,sha256=uiLJeoO_jaLCQEw99L6uI1rnqHHx_rTY81WxfMmlALs,1623
+ ragmint/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/tests/test_pipeline.py,sha256=MIMkEKelh-POlbXzbCc4ClMk8XCGzfuj569xXltziic,615
+ ragmint/tests/test_retriever.py,sha256=Ag0uGW8-iMzKA4nJNnsjuzlQHa79sN-T-K1g1cdin-A,421
+ ragmint/tests/test_search.py,sha256=FcC-DEnw9veAEyMnFoRw9DAwzqJC9F6-r63Nqo2nO58,598
+ ragmint/tests/test_tuner.py,sha256=VFZ23og0dOypBpr3TxkRmSngilkNgyboZc6u9qB0pME,1101
+ ragmint/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ragmint/utils/caching.py,sha256=LPE2JorOQ90BgVf6NUiS0-bdt-FGpNxDy7FnuwEHzy0,1060
+ ragmint/utils/data_loader.py,sha256=Q3pBO77XZ1rl4fuMn3TK7x3mSM2eLdV_OJTyy_eL3Ys,988
+ ragmint/utils/logger.py,sha256=X7hTNb3st3fUeQIzSghuoV5B8FWXzm_O3DRkSfJvhmI,1033
+ ragmint/utils/metrics.py,sha256=DR8mrdumHtQerK0VrugwYKIG1oNptEcsFqodXq3i2kY,717
+ ragmint-0.1.0.dist-info/licenses/LICENSE,sha256=ahkhYfFLI8tGrdxdO2_GaT6OJW2eNwyFT3kYi85QQhc,692
+ ragmint-0.1.0.dist-info/METADATA,sha256=BgMj5BxH2C2_5GweYpClkopepUBCVen5tWAFcOby8o8,5643
+ ragmint-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ ragmint-0.1.0.dist-info/top_level.txt,sha256=K2ulzMHuvFm6xayvvJdGABeRJAvKDBn6M3EI-3SbYLw,8
+ ragmint-0.1.0.dist-info/RECORD,,
ragmint-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
ragmint-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ Copyright 2025 André Oliveira
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
ragmint-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ ragmint