PyPI - rag-python - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

rag-python 0.2.0.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{rag_python-0.2.0/src/rag_python.egg-info → rag_python-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rag-python
-Version: 0.2.0
+Version: 0.3.0
 Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
 Author-email: Raghav Singla <04raghavsingla28@gmail.com>
 License: MIT
@@ -35,6 +35,8 @@ Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
 Requires-Dist: torch>=2.0.0; extra == "rerank"
 Provides-Extra: local
 Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
+Provides-Extra: hybrid
+Requires-Dist: rank-bm25>=0.2.2; extra == "hybrid"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: gemini
@@ -44,8 +46,9 @@ Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: build; extra == "dev"
 Requires-Dist: twine; extra == "dev"
+Requires-Dist: rank-bm25>=0.2.2; extra == "dev"
 Provides-Extra: all
-Requires-Dist: rag-python[anthropic,gemini,local,rerank]; extra == "all"
+Requires-Dist: rag-python[anthropic,gemini,hybrid,local,rerank]; extra == "all"
 # rag-python
@@ -67,10 +70,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
 ## Features
 - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
-- Query pipeline: rewriting → multi-query retrieval → reranking
+- Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
 - Generation with guardrails (prompt injection + hallucination checks)
 - Evaluation scores + self-correction retry loop
 - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
+- **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
 ---
@@ -81,7 +85,7 @@ pip install rag-python
 # or from source
 pip install -e .
 # with reranking + extra providers
-pip install -e ".[rerank,local,anthropic,gemini,all]"
+pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
 ```
 ---
@@ -103,12 +107,26 @@ answer = rag.query("How many days of annual leave?")
 print(answer.text)
 ```
+### Hybrid search + metadata filter
+```python
+from rag_python import RAG, SearchConfig
+rag = RAG(
+    retriever="hybrid",  # pip install rag-python[hybrid]
+    metadata_filter={"filename": "leave-policy.pdf"},
+)
+rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
+answer = rag.query("How many days of annual leave?")
+```
 ### CLI
 ```bash
 export OPENAI_API_KEY=sk-...
 rag-python ingest ./data --reindex
 rag-python query "How many days of annual leave?" -v
+rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
 ```
 ---

{rag_python-0.2.0 → rag_python-0.3.0}/README.md RENAMED Viewed

@@ -18,10 +18,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
 ## Features
 - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
-- Query pipeline: rewriting → multi-query retrieval → reranking
+- Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
 - Generation with guardrails (prompt injection + hallucination checks)
 - Evaluation scores + self-correction retry loop
 - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
+- **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
 ---
@@ -32,7 +33,7 @@ pip install rag-python
 # or from source
 pip install -e .
 # with reranking + extra providers
-pip install -e ".[rerank,local,anthropic,gemini,all]"
+pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
 ```
 ---
@@ -54,12 +55,26 @@ answer = rag.query("How many days of annual leave?")
 print(answer.text)
 ```
+### Hybrid search + metadata filter
+```python
+from rag_python import RAG, SearchConfig
+rag = RAG(
+    retriever="hybrid",  # pip install rag-python[hybrid]
+    metadata_filter={"filename": "leave-policy.pdf"},
+)
+rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
+answer = rag.query("How many days of annual leave?")
+```
 ### CLI
 ```bash
 export OPENAI_API_KEY=sk-...
 rag-python ingest ./data --reindex
 rag-python query "How many days of annual leave?" -v
+rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
 ```
 ---

{rag_python-0.2.0 → rag_python-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "rag-python"
-version = "0.2.0"
+version = "0.3.0"
 description = "Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation."
 readme = "README.md"
 license = { text = "MIT" }
@@ -39,10 +39,11 @@ dependencies = [
 [project.optional-dependencies]
 rerank = ["sentence-transformers>=2.2.0", "torch>=2.0.0"]
 local = ["sentence-transformers>=2.2.0"]
+hybrid = ["rank-bm25>=0.2.2"]
 anthropic = ["anthropic>=0.20.0"]
 gemini = ["google-genai>=0.3.0"]
-dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine"]
-all = ["rag-python[rerank,local,anthropic,gemini]"]
+dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine", "rank-bm25>=0.2.2"]
+all = ["rag-python[rerank,local,hybrid,anthropic,gemini]"]
 [project.scripts]
 rag-python = "rag_python.cli:main"

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ Quick start::
     print(rag.query("What is our leave policy?").text)
 """
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 from .client import RAG, RAGAnswer
 from .rag_pipeline import ingest, query, RAGResponse

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/cli.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """rag-python command-line interface."""
 import argparse
+import json
 from dataclasses import replace
 from . import __version__
@@ -7,7 +8,7 @@ from .client import RAG
 def _build_rag(args: argparse.Namespace) -> RAG:
-    return RAG(
+    kwargs: dict = dict(
         llm_provider=args.llm_provider,
         llm_model=args.llm_model,
         embedding_provider=args.embedding_provider,
@@ -20,6 +21,20 @@ def _build_rag(args: argparse.Namespace) -> RAG:
         gemini_api_key=args.gemini_api_key,
         ollama_base_url=args.ollama_base_url,
     )
+    if getattr(args, "retriever", None):
+        kwargs["retriever"] = args.retriever
+    if getattr(args, "metadata_filter", None):
+        kwargs["metadata_filter"] = args.metadata_filter
+    return RAG(**kwargs)
+def _parse_metadata_filter(raw: str | None) -> dict | None:
+    if not raw:
+        return None
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise argparse.ArgumentTypeError(f"Invalid JSON for metadata filter: {e}") from e
 def _add_provider_args(parser: argparse.ArgumentParser) -> None:
@@ -44,6 +59,21 @@ def _add_provider_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument("--gemini-api-key", default=None)
+def _add_search_args(parser: argparse.ArgumentParser) -> None:
+    parser.add_argument(
+        "--retriever",
+        choices=["vector", "multi_query", "hybrid"],
+        default=None,
+        help="Retrieval strategy (default: multi_query; hybrid needs pip install rag-python[hybrid])",
+    )
+    parser.add_argument(
+        "--metadata-filter",
+        type=_parse_metadata_filter,
+        default=None,
+        help='Chroma metadata filter as JSON, e.g. \'{"filename": "policy.pdf"}\'',
+    )
 def main() -> None:
     parser = argparse.ArgumentParser(
         prog="rag-python",
@@ -59,9 +89,10 @@ def main() -> None:
     q = sub.add_parser("query", help="Ask a question against ingested documents")
     q.add_argument("question", nargs="+", help="Question text")
-    q.add_argument("--no-multi-query", action="store_true")
+    q.add_argument("--no-multi-query", action="store_true", help="Use vector retriever only")
     q.add_argument("-v", "--verbose", action="store_true")
     _add_provider_args(q)
+    _add_search_args(q)
     args = parser.parse_args()
@@ -74,9 +105,13 @@ def main() -> None:
     if args.command == "query":
         rag = _build_rag(args)
         question = " ".join(args.question)
+        retriever = args.retriever
+        if retriever is None and args.no_multi_query:
+            retriever = "vector"
         search = replace(
             rag.config.search,
-            retriever="vector" if args.no_multi_query else "multi_query",
+            retriever=retriever or rag.config.search.retriever,
+            metadata_filter=args.metadata_filter or rag.config.search.metadata_filter,
         )
         ans = rag.query(question, search=search)
         print(ans.text)

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/client.py RENAMED Viewed

@@ -60,6 +60,7 @@ class RAG:
         chunk_size: int | None = None,
         chunk_overlap: int | None = None,
         retriever: str | None = None,
+        metadata_filter: dict | None = None,
         top_k_retrieve: int | None = None,
         top_k_rerank: int | None = None,
         multi_query_n: int | None = None,
@@ -104,6 +105,8 @@ class RAG:
             self.config.search = replace(self.config.search, rerank_enabled=rerank_enabled)
         if document_extensions is not None:
             self.config.documents = replace(self.config.documents, extensions=document_extensions)
+        if metadata_filter is not None:
+            self.config.search = replace(self.config.search, metadata_filter=metadata_filter)
         self.llm = make_llm_provider(
             llm_provider,  # type: ignore[arg-type]

rag_python-0.3.0/src/rag_python/document_loaders.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""Document loaders: raw data → structured text + metadata."""
+import csv
+import json
+from html.parser import HTMLParser
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Iterator
+try:
+    from pypdf import PdfReader
+except ImportError:
+    PdfReader = None
+try:
+    from docx import Document as DocxDocument
+except ImportError:
+    DocxDocument = None
+@dataclass
+class LoadedDocument:
+    """Single document with content and metadata."""
+    content: str
+    source: str
+    metadata: dict
+class _HTMLTextExtractor(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.parts: list[str] = []
+    def handle_data(self, data: str) -> None:
+        text = data.strip()
+        if text:
+            self.parts.append(text)
+def _html_to_text(html: str) -> str:
+    parser = _HTMLTextExtractor()
+    parser.feed(html)
+    return "\n".join(parser.parts)
+def _load_csv(path: Path, metadata: dict) -> LoadedDocument | None:
+    rows: list[str] = []
+    with path.open(encoding="utf-8", errors="replace", newline="") as f:
+        reader = csv.DictReader(f)
+        if reader.fieldnames:
+            for row in reader:
+                rows.append(", ".join(f"{k}: {v}" for k, v in row.items() if v))
+        else:
+            f.seek(0)
+            for row in csv.reader(f):
+                rows.append(", ".join(row))
+    content = "\n".join(rows)
+    metadata["rows"] = len(rows)
+    return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None
+def _load_json(path: Path, metadata: dict) -> LoadedDocument | None:
+    data = json.loads(path.read_text(encoding="utf-8", errors="replace"))
+    if isinstance(data, list):
+        parts = []
+        for item in data:
+            if isinstance(item, dict) and "text" in item:
+                parts.append(str(item["text"]))
+            else:
+                parts.append(json.dumps(item, ensure_ascii=False))
+        content = "\n\n".join(parts)
+    elif isinstance(data, dict):
+        if "text" in data:
+            content = str(data["text"])
+        else:
+            content = json.dumps(data, ensure_ascii=False, indent=2)
+    else:
+        content = str(data)
+    return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None
+def load_file(path: Path) -> LoadedDocument | None:
+    """Load a single file (PDF, TXT, DOCX, MD, CSV, JSON, HTML) into text + metadata."""
+    path = Path(path)
+    if not path.exists():
+        return None
+    suffix = path.suffix.lower()
+    metadata = {"source": str(path), "filename": path.name}
+    if suffix in (".txt", ".md"):
+        content = path.read_text(encoding="utf-8", errors="replace")
+        return LoadedDocument(content=content, source=str(path), metadata=metadata)
+    if suffix == ".html":
+        html = path.read_text(encoding="utf-8", errors="replace")
+        content = _html_to_text(html)
+        return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None
+    if suffix == ".csv":
+        return _load_csv(path, metadata)
+    if suffix == ".json":
+        try:
+            return _load_json(path, metadata)
+        except json.JSONDecodeError:
+            return None
+    if suffix == ".pdf" and PdfReader:
+        try:
+            reader = PdfReader(path)
+            parts = []
+            for i, page in enumerate(reader.pages):
+                text = page.extract_text() or ""
+                parts.append(text)
+                metadata.setdefault("page_numbers", []).append(i + 1)
+            content = "\n\n".join(parts)
+            metadata["pages"] = len(parts)
+            return LoadedDocument(content=content, source=str(path), metadata=metadata)
+        except Exception:
+            return None
+    if suffix in (".docx", ".doc") and DocxDocument:
+        try:
+            doc = DocxDocument(path)
+            parts = [p.text for p in doc.paragraphs]
+            content = "\n\n".join(parts)
+            metadata["paragraphs"] = len(parts)
+            return LoadedDocument(content=content, source=str(path), metadata=metadata)
+        except Exception:
+            return None
+    return None
+def load_directory(
+    dir_path: Path,
+    extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
+) -> Iterator[LoadedDocument]:
+    """Yield LoadedDocument for each supported file under dir_path."""
+    dir_path = Path(dir_path)
+    if not dir_path.is_dir():
+        return
+    for f in dir_path.rglob("*"):
+        if f.is_file() and f.suffix.lower() in extensions:
+            doc = load_file(f)
+            if doc and doc.content.strip():
+                yield doc

rag_python-0.3.0/src/rag_python/hybrid_search.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""BM25 + vector fusion via reciprocal rank fusion (RRF)."""
+from __future__ import annotations
+from typing import Any
+def reciprocal_rank_fusion(
+    rankings: list[list[tuple[str, dict[str, Any], float]]],
+    *,
+    rrf_k: int = 60,
+) -> list[tuple[str, dict[str, Any], float]]:
+    """Merge ranked lists with RRF. Higher score is better."""
+    scores: dict[tuple[str, str], float] = {}
+    doc_map: dict[tuple[str, str], tuple[str, dict[str, Any]]] = {}
+    for ranking in rankings:
+        for rank, (doc, meta, _score) in enumerate(ranking):
+            key = (doc[:200], str(meta.get("source", "")))
+            doc_map[key] = (doc, meta)
+            scores[key] = scores.get(key, 0.0) + 1.0 / (rrf_k + rank + 1)
+    merged = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+    return [(doc_map[key][0], doc_map[key][1], score) for key, score in merged]
+def bm25_retrieve(
+    query: str,
+    documents: list[str],
+    metadatas: list[dict[str, Any]],
+    *,
+    top_k: int = 20,
+) -> list[tuple[str, dict[str, Any], float]]:
+    """Keyword retrieval with BM25. Requires ``pip install rag-python[hybrid]``."""
+    if not documents:
+        return []
+    try:
+        from rank_bm25 import BM25Okapi
+    except ImportError as e:
+        raise ImportError(
+            "Hybrid search requires optional dependencies. Install with: pip install rag-python[hybrid]"
+        ) from e
+    tokenized_corpus = [doc.lower().split() for doc in documents]
+    bm25 = BM25Okapi(tokenized_corpus)
+    scores = bm25.get_scores(query.lower().split())
+    ranked = sorted(
+        ((documents[i], metadatas[i], float(scores[i])) for i in range(len(documents))),
+        key=lambda item: item[2],
+        reverse=True,
+    )
+    return ranked[:top_k]

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/options.py RENAMED Viewed

@@ -16,7 +16,7 @@ from .config import (
 )
 ChunkStrategy = Literal["recursive", "structure_aware", "semantic"]
-RetrieverStrategy = Literal["vector", "multi_query"]
+RetrieverStrategy = Literal["vector", "multi_query", "hybrid"]
 @dataclass
@@ -37,13 +37,14 @@ class SearchConfig:
     top_k_rerank: int = TOP_K_RERANK
     multi_query_n: int = MULTI_QUERY_N
     rerank_enabled: bool = RERANK_ENABLED
+    metadata_filter: dict | None = None
 @dataclass
 class DocumentConfig:
     """Which files to load and how to preprocess them."""
-    extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx")
+    extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
     clean: bool = True
     copy_to_data_dir: bool = True

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/rag_pipeline.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Full RAG pipeline: Query → Understanding/Rewrite → Retrieval (multi-query) → Rerank → LLM → Guardrails → Eval/Retry."""
+import logging
 from dataclasses import dataclass
 from pathlib import Path
@@ -14,6 +15,8 @@ from .providers import LLMProvider, EmbeddingProvider, make_llm_provider, make_e
 from .config import DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, CHUNK_STRATEGY
 from .options import QueryConfig, SearchConfig
+logger = logging.getLogger(__name__)
 @dataclass
 class RAGResponse:
@@ -34,7 +37,7 @@ def _load_documents(
     paths: list[Path] | None = None,
     data_path: Path | None = None,
     *,
-    extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx"),
+    extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
 ) -> list[LoadedDocument]:
     """Load documents from explicit paths and/or a data directory."""
     docs: list[LoadedDocument] = []
@@ -136,12 +139,13 @@ def ingest(
     strategy = chunk_strategy or CHUNK_STRATEGY
     size = chunk_size or CHUNK_SIZE
     overlap = chunk_overlap or CHUNK_OVERLAP
-    ext = extensions or (".txt", ".md", ".pdf", ".docx")
+    ext = extensions or (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
     embedder = embedder or make_embedding_provider("openai")
     path_list = [Path(p) for p in paths] if paths else None
     root = Path(data_path) if data_path else (None if path_list else Path(DATA_DIR))
     docs = _load_documents(path_list, root, extensions=ext)
+    logger.info("Loaded %s documents for ingest", len(docs))
     return _ingest_documents(
         docs,
         clean=clean,
@@ -202,11 +206,13 @@ def query(
         top_k_retrieve=search_cfg.top_k_retrieve,
         top_k_rerank=search_cfg.top_k_rerank,
         rerank_enabled=search_cfg.rerank_enabled,
+        metadata_filter=search_cfg.metadata_filter,
         embedder=embedder,
         embedding_model=embedding_model,
         llm=llm,
         llm_model=llm_model,
     )
+    logger.info("Retrieved %s chunks (retriever=%s)", len(hits), search_cfg.retriever)
     context_chunks = [h[0] for h in hits]
     sources = [{"text": h[0][:200], "metadata": h[1], "score": h[2]} for h in hits]
     context_str = "\n\n".join(context_chunks)

rag_python-0.3.0/src/rag_python/retrieval.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Retrieval: vector, multi-query, hybrid (BM25+vector), and reranking."""
+from typing import Any
+from .vector_store import retrieve as chroma_retrieve, list_documents
+from .query_rewriting import rewrite_for_retrieval
+from .reranker import rerank_with_metadata
+from .hybrid_search import bm25_retrieve, reciprocal_rank_fusion
+from .providers import EmbeddingProvider, LLMProvider
+from .options import RetrieverStrategy
+from .config import TOP_K_RETRIEVE, TOP_K_RERANK, MULTI_QUERY_N
+def _dedupe_candidates(candidates: list[tuple[str, dict, float]]) -> list[tuple[str, dict, float]]:
+    seen: set[tuple[str, str]] = set()
+    out: list[tuple[str, dict, float]] = []
+    for doc, meta, score in candidates:
+        key = (doc[:200], str(meta.get("source", "")))
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append((doc, meta, score))
+    return out
+def _vector_candidates(
+    queries: list[str],
+    *,
+    embedder: EmbeddingProvider,
+    embedding_model: str | None,
+    top_k_retrieve: int,
+    where: dict | None,
+) -> list[tuple[str, dict, float]]:
+    seen_docs: set[tuple[str, str]] = set()
+    all_candidates: list[tuple[str, dict, float]] = []
+    for q in queries:
+        emb = embedder.embed([q], model=embedding_model)[0]
+        hits = chroma_retrieve(emb, top_k=top_k_retrieve, where=where)
+        for doc, meta, dist in hits:
+            key = (doc[:200], str(meta.get("source", "")))
+            if key in seen_docs:
+                continue
+            seen_docs.add(key)
+            all_candidates.append((doc, meta, -dist))
+    return all_candidates
+def retrieve(
+    query: str,
+    *,
+    embedder: EmbeddingProvider,
+    embedding_model: str | None = None,
+    retriever: RetrieverStrategy = "multi_query",
+    multi_query: bool | None = None,
+    n_queries: int | None = None,
+    top_k_retrieve: int | None = None,
+    top_k_rerank: int | None = None,
+    rerank_enabled: bool | None = None,
+    metadata_filter: dict | None = None,
+    llm: LLMProvider | None = None,
+    llm_model: str | None = None,
+) -> list[tuple[str, dict[str, Any], float]]:
+    """
+    Retrieve relevant chunks using vector, multi-query, or hybrid search, then rerank.
+    Returns list of (document_text, metadata, rerank_score).
+    """
+    top_k_retrieve = top_k_retrieve or TOP_K_RETRIEVE
+    top_k_rerank = top_k_rerank or TOP_K_RERANK
+    n_queries = n_queries or MULTI_QUERY_N
+    if retriever == "hybrid":
+        emb = embedder.embed([query], model=embedding_model)[0]
+        vector_hits = chroma_retrieve(emb, top_k=top_k_retrieve, where=metadata_filter)
+        vector_ranked = [(d, m, -dist) for d, m, dist in vector_hits]
+        docs, metas = list_documents(where=metadata_filter)
+        bm25_ranked = bm25_retrieve(query, docs, metas, top_k=top_k_retrieve)
+        fused = reciprocal_rank_fusion([vector_ranked, bm25_ranked])[:top_k_retrieve]
+        all_candidates = _dedupe_candidates(fused)
+    else:
+        use_multi_query = retriever == "multi_query" if multi_query is None else multi_query
+        queries = [query]
+        if use_multi_query and n_queries > 1:
+            rewritten = rewrite_for_retrieval(query, n_queries=n_queries, llm=llm, llm_model=llm_model)
+            if rewritten:
+                queries = rewritten
+        all_candidates = _vector_candidates(
+            queries,
+            embedder=embedder,
+            embedding_model=embedding_model,
+            top_k_retrieve=top_k_retrieve,
+            where=metadata_filter,
+        )
+    if not all_candidates:
+        return []
+    docs = [c[0] for c in all_candidates]
+    metas = [c[1] for c in all_candidates]
+    return rerank_with_metadata(
+        query, list(zip(docs, metas)), top_k=top_k_rerank, rerank_enabled=rerank_enabled
+    )

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python/vector_store.py RENAMED Viewed

@@ -85,6 +85,19 @@ def retrieve(
     return list(zip(docs, metas, dists))
+def list_documents(
+    *,
+    where: dict | None = None,
+    limit: int | None = None,
+) -> tuple[list[str], list[dict[str, Any]]]:
+    """Return all stored chunk texts and metadata (for BM25 indexing)."""
+    coll = get_collection()
+    res = coll.get(where=where, include=["documents", "metadatas"], limit=limit)
+    docs = res.get("documents") or []
+    metas = res.get("metadatas") or []
+    return docs, metas
 def delete_all() -> None:
     """Remove all documents from the collection (for re-ingestion)."""
     _get_client().delete_collection(COLLECTION_NAME)

{rag_python-0.2.0 → rag_python-0.3.0/src/rag_python.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rag-python
-Version: 0.2.0
+Version: 0.3.0
 Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
 Author-email: Raghav Singla <04raghavsingla28@gmail.com>
 License: MIT
@@ -35,6 +35,8 @@ Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
 Requires-Dist: torch>=2.0.0; extra == "rerank"
 Provides-Extra: local
 Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
+Provides-Extra: hybrid
+Requires-Dist: rank-bm25>=0.2.2; extra == "hybrid"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: gemini
@@ -44,8 +46,9 @@ Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: build; extra == "dev"
 Requires-Dist: twine; extra == "dev"
+Requires-Dist: rank-bm25>=0.2.2; extra == "dev"
 Provides-Extra: all
-Requires-Dist: rag-python[anthropic,gemini,local,rerank]; extra == "all"
+Requires-Dist: rag-python[anthropic,gemini,hybrid,local,rerank]; extra == "all"
 # rag-python
@@ -67,10 +70,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
 ## Features
 - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
-- Query pipeline: rewriting → multi-query retrieval → reranking
+- Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
 - Generation with guardrails (prompt injection + hallucination checks)
 - Evaluation scores + self-correction retry loop
 - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
+- **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
 ---
@@ -81,7 +85,7 @@ pip install rag-python
 # or from source
 pip install -e .
 # with reranking + extra providers
-pip install -e ".[rerank,local,anthropic,gemini,all]"
+pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
 ```
 ---
@@ -103,12 +107,26 @@ answer = rag.query("How many days of annual leave?")
 print(answer.text)
 ```
+### Hybrid search + metadata filter
+```python
+from rag_python import RAG, SearchConfig
+rag = RAG(
+    retriever="hybrid",  # pip install rag-python[hybrid]
+    metadata_filter={"filename": "leave-policy.pdf"},
+)
+rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
+answer = rag.query("How many days of annual leave?")
+```
 ### CLI
 ```bash
 export OPENAI_API_KEY=sk-...
 rag-python ingest ./data --reindex
 rag-python query "How many days of annual leave?" -v
+rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
 ```
 ---

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python.egg-info/SOURCES.txt RENAMED Viewed

@@ -11,6 +11,7 @@ src/rag_python/document_loaders.py
 src/rag_python/evaluation.py
 src/rag_python/generation.py
 src/rag_python/guardrails.py
+src/rag_python/hybrid_search.py
 src/rag_python/options.py
 src/rag_python/py.typed
 src/rag_python/query_rewriting.py
@@ -35,8 +36,10 @@ src/rag_python/providers/ollama_provider.py
 src/rag_python/providers/openai_provider.py
 tests/test_chunking.py
 tests/test_config.py
+tests/test_hybrid_search.py
 tests/test_import.py
 tests/test_loaders.py
 tests/test_package.py
 tests/test_pipeline.py
-tests/test_providers.py
+tests/test_providers.py
+tests/test_retrieval.py

{rag_python-0.2.0 → rag_python-0.3.0}/src/rag_python.egg-info/requires.txt RENAMED Viewed

@@ -9,7 +9,7 @@ python-dotenv>=1.0.0
 requests>=2.31.0
 [all]
-rag-python[anthropic,gemini,local,rerank]
+rag-python[anthropic,gemini,hybrid,local,rerank]
 [anthropic]
 anthropic>=0.20.0
@@ -19,10 +19,14 @@ pytest>=7.0
 ruff>=0.1.0
 build
 twine
+rank-bm25>=0.2.2
 [gemini]
 google-genai>=0.3.0
+[hybrid]
+rank-bm25>=0.2.2
 [local]
 sentence-transformers>=2.2.0

rag_python-0.3.0/tests/test_hybrid_search.py ADDED Viewed

@@ -0,0 +1,35 @@
+import pytest
+from rag_python.hybrid_search import bm25_retrieve, reciprocal_rank_fusion
+def test_reciprocal_rank_fusion_prefers_shared_docs():
+    vector = [
+        ("doc a", {"source": "a"}, 0.9),
+        ("doc b", {"source": "b"}, 0.8),
+    ]
+    bm25 = [
+        ("doc b", {"source": "b"}, 0.95),
+        ("doc c", {"source": "c"}, 0.7),
+    ]
+    merged = reciprocal_rank_fusion([vector, bm25])
+    assert len(merged) == 3
+    assert merged[0][0] == "doc b"
+def test_bm25_retrieve_ranks_relevant_doc():
+    docs = [
+        "annual leave policy grants twenty days per year",
+        "office cafeteria menu and lunch hours",
+    ]
+    metas = [{"source": "policy.txt"}, {"source": "cafe.txt"}]
+    try:
+        hits = bm25_retrieve("annual leave days", docs, metas, top_k=1)
+    except ImportError:
+        pytest.skip("rank_bm25 not installed")
+    assert hits[0][0] == docs[0]
+    assert hits[0][1]["source"] == "policy.txt"
+def test_bm25_retrieve_empty_corpus():
+    assert bm25_retrieve("query", [], [], top_k=5) == []

{rag_python-0.2.0 → rag_python-0.3.0}/tests/test_loaders.py RENAMED Viewed

@@ -20,6 +20,32 @@ def test_load_markdown_file(tmp_path: Path):
     assert "Title" in doc.content
+def test_load_csv_file(tmp_path: Path):
+    f = tmp_path / "data.csv"
+    f.write_text("name,days\nAlice,20\nBob,15\n", encoding="utf-8")
+    doc = load_file(f)
+    assert doc is not None
+    assert "Alice" in doc.content
+    assert doc.metadata.get("rows") == 2
+def test_load_json_file(tmp_path: Path):
+    f = tmp_path / "data.json"
+    f.write_text('[{"text": "Annual leave is twenty days."}]', encoding="utf-8")
+    doc = load_file(f)
+    assert doc is not None
+    assert "twenty days" in doc.content
+def test_load_html_file(tmp_path: Path):
+    f = tmp_path / "page.html"
+    f.write_text("<html><body><h1>Policy</h1><p>Twenty days leave.</p></body></html>", encoding="utf-8")
+    doc = load_file(f)
+    assert doc is not None
+    assert "Policy" in doc.content
+    assert "Twenty days" in doc.content
 def test_load_directory_skips_empty_files(tmp_path: Path):
     (tmp_path / "a.txt").write_text("content a", encoding="utf-8")
     (tmp_path / "empty.txt").write_text("   ", encoding="utf-8")

{rag_python-0.2.0 → rag_python-0.3.0}/tests/test_package.py RENAMED Viewed

@@ -4,7 +4,7 @@ import importlib.metadata
 def test_package_metadata():
     dist = importlib.metadata.metadata("rag-python")
     assert dist["Name"] == "rag-python"
-    assert dist["Version"] == "0.2.0"
+    assert dist["Version"] == "0.3.0"
     author = dist.get("Author") or dist.get("Author-email") or ""
     assert "Raghav Singla" in author or "RaghavOG" in author

rag_python-0.3.0/tests/test_retrieval.py ADDED Viewed

@@ -0,0 +1,52 @@
+from unittest.mock import MagicMock, patch
+from rag_python.retrieval import retrieve
+def test_hybrid_retriever_fuses_vector_and_bm25():
+    embedder = MagicMock()
+    embedder.embed.return_value = [[0.1, 0.2]]
+    vector_hits = [("vector doc", {"source": "v.txt"}, 0.1)]
+    bm25_hits = [("bm25 doc", {"source": "b.txt"}, 1.5)]
+    fused = [("vector doc", {"source": "v.txt"}, 0.5), ("bm25 doc", {"source": "b.txt"}, 0.4)]
+    with (
+        patch("rag_python.retrieval.chroma_retrieve", return_value=vector_hits) as mock_chroma,
+        patch("rag_python.retrieval.list_documents", return_value=(["bm25 doc"], [{"source": "b.txt"}])),
+        patch("rag_python.retrieval.bm25_retrieve", return_value=bm25_hits) as mock_bm25,
+        patch("rag_python.retrieval.reciprocal_rank_fusion", return_value=fused) as mock_rrf,
+        patch("rag_python.retrieval.rerank_with_metadata", side_effect=lambda q, pairs, **kw: pairs),
+    ):
+        hits = retrieve(
+            "leave policy",
+            embedder=embedder,
+            retriever="hybrid",
+            rerank_enabled=False,
+            metadata_filter={"filename": "policy.txt"},
+        )
+    mock_chroma.assert_called_once()
+    assert mock_chroma.call_args.kwargs["where"] == {"filename": "policy.txt"}
+    mock_bm25.assert_called_once()
+    mock_rrf.assert_called_once()
+    assert len(hits) == 2
+def test_vector_retriever_passes_metadata_filter():
+    embedder = MagicMock()
+    embedder.embed.return_value = [[0.5, 0.5]]
+    with (
+        patch("rag_python.retrieval.chroma_retrieve", return_value=[]) as mock_chroma,
+        patch("rag_python.retrieval.rerank_with_metadata", return_value=[]),
+    ):
+        retrieve(
+            "question",
+            embedder=embedder,
+            retriever="vector",
+            metadata_filter={"source": "/data/policy.txt"},
+        )
+    mock_chroma.assert_called_once()
+    assert mock_chroma.call_args.kwargs["where"] == {"source": "/data/policy.txt"}

rag_python-0.2.0/src/rag_python/document_loaders.py DELETED Viewed

@@ -1,74 +0,0 @@
-"""Document loaders: raw data → structured text + metadata."""
-from pathlib import Path
-from dataclasses import dataclass
-from typing import Iterator
-try:
-    from pypdf import PdfReader
-except ImportError:
-    PdfReader = None
-try:
-    from docx import Document as DocxDocument
-except ImportError:
-    DocxDocument = None
-@dataclass
-class LoadedDocument:
-    """Single document with content and metadata."""
-    content: str
-    source: str
-    metadata: dict
-def load_file(path: Path) -> LoadedDocument | None:
-    """Load a single file (PDF, TXT, DOCX, MD) into text + metadata."""
-    path = Path(path)
-    if not path.exists():
-        return None
-    suffix = path.suffix.lower()
-    metadata = {"source": str(path), "filename": path.name}
-    if suffix == ".txt" or suffix == ".md":
-        content = path.read_text(encoding="utf-8", errors="replace")
-        return LoadedDocument(content=content, source=str(path), metadata=metadata)
-    if suffix == ".pdf" and PdfReader:
-        try:
-            reader = PdfReader(path)
-            parts = []
-            for i, page in enumerate(reader.pages):
-                text = page.extract_text() or ""
-                parts.append(text)
-                metadata.setdefault("page_numbers", []).append(i + 1)
-            content = "\n\n".join(parts)
-            metadata["pages"] = len(parts)
-            return LoadedDocument(content=content, source=str(path), metadata=metadata)
-        except Exception:
-            return None
-    if suffix in (".docx", ".doc") and DocxDocument:
-        try:
-            doc = DocxDocument(path)
-            parts = [p.text for p in doc.paragraphs]
-            content = "\n\n".join(parts)
-            metadata["paragraphs"] = len(parts)
-            return LoadedDocument(content=content, source=str(path), metadata=metadata)
-        except Exception:
-            return None
-    return None
-def load_directory(dir_path: Path, extensions: tuple = (".txt", ".md", ".pdf", ".docx")) -> Iterator[LoadedDocument]:
-    """Yield LoadedDocument for each supported file under dir_path."""
-    dir_path = Path(dir_path)
-    if not dir_path.is_dir():
-        return
-    for f in dir_path.rglob("*"):
-        if f.is_file() and f.suffix.lower() in extensions:
-            doc = load_file(f)
-            if doc and doc.content.strip():
-                yield doc

rag_python-0.2.0/src/rag_python/retrieval.py DELETED Viewed

@@ -1,61 +0,0 @@
-"""Retrieval: multi-query retrieval + reranking."""
-from typing import Any
-from .vector_store import retrieve as chroma_retrieve
-from .query_rewriting import rewrite_for_retrieval
-from .reranker import rerank_with_metadata
-from .providers import EmbeddingProvider, LLMProvider
-from .options import RetrieverStrategy
-from .config import TOP_K_RETRIEVE, TOP_K_RERANK, MULTI_QUERY_N
-def retrieve(
-    query: str,
-    *,
-    embedder: EmbeddingProvider,
-    embedding_model: str | None = None,
-    retriever: RetrieverStrategy = "multi_query",
-    multi_query: bool | None = None,
-    n_queries: int | None = None,
-    top_k_retrieve: int | None = None,
-    top_k_rerank: int | None = None,
-    rerank_enabled: bool | None = None,
-    llm: LLMProvider | None = None,
-    llm_model: str | None = None,
-) -> list[tuple[str, dict[str, Any], float]]:
-    """
-    Retrieve relevant chunks using vector or multi-query search, then rerank.
-    Returns list of (document_text, metadata, rerank_score).
-    """
-    top_k_retrieve = top_k_retrieve or TOP_K_RETRIEVE
-    top_k_rerank = top_k_rerank or TOP_K_RERANK
-    n_queries = n_queries or MULTI_QUERY_N
-    use_multi_query = retriever == "multi_query" if multi_query is None else multi_query
-    queries = [query]
-    if use_multi_query and n_queries > 1:
-        rewritten = rewrite_for_retrieval(query, n_queries=n_queries, llm=llm, llm_model=llm_model)
-        if rewritten:
-            queries = rewritten
-    seen_docs: set[str] = set()
-    all_candidates: list[tuple[str, dict, float]] = []
-    for q in queries:
-        emb = embedder.embed([q], model=embedding_model)[0]
-        hits = chroma_retrieve(emb, top_k=top_k_retrieve)
-        for doc, meta, dist in hits:
-            key = (doc[:200], meta.get("source", ""))
-            if key in seen_docs:
-                continue
-            seen_docs.add(key)
-            all_candidates.append((doc, meta, -dist))
-    if not all_candidates:
-        return []
-    docs = [c[0] for c in all_candidates]
-    metas = [c[1] for c in all_candidates]
-    reranked = rerank_with_metadata(
-        query, list(zip(docs, metas)), top_k=top_k_rerank, rerank_enabled=rerank_enabled
-    )
-    return reranked