PyPI - rag-python - Versions diffs - 0.1.0__py3-none-any.whl - Mend

rag-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

rag_python/__init__.py +39 -0
rag_python/chunking.py +181 -0
rag_python/cleaning.py +102 -0
rag_python/cli.py +77 -0
rag_python/client.py +190 -0
rag_python/config.py +37 -0
rag_python/document_loaders.py +74 -0
rag_python/evaluation.py +105 -0
rag_python/generation.py +35 -0
rag_python/guardrails.py +66 -0
rag_python/options.py +68 -0
rag_python/providers/__init__.py +5 -0
rag_python/providers/anthropic_provider.py +41 -0
rag_python/providers/azure_openai_provider.py +62 -0
rag_python/providers/base.py +24 -0
rag_python/providers/factory.py +53 -0
rag_python/providers/gemini_provider.py +45 -0
rag_python/providers/ollama_provider.py +56 -0
rag_python/providers/openai_provider.py +46 -0
rag_python/py.typed +0 -0
rag_python/query_rewriting.py +65 -0
rag_python/rag_pipeline.py +241 -0
rag_python/reranker.py +64 -0
rag_python/retrieval.py +61 -0
rag_python/vector_store.py +91 -0
rag_python-0.1.0.dist-info/LICENSE +22 -0
rag_python-0.1.0.dist-info/METADATA +158 -0
rag_python-0.1.0.dist-info/RECORD +31 -0
rag_python-0.1.0.dist-info/WHEEL +5 -0
rag_python-0.1.0.dist-info/entry_points.txt +2 -0
rag_python-0.1.0.dist-info/top_level.txt +1 -0

rag_python/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""rag-python — production-grade RAG for Python.
+Quick start::
+    from rag_python import RAG
+    rag = RAG(llm_model="gpt-4o-mini")
+    rag.ingest(["./docs"], reindex=True)
+    print(rag.query("What is our leave policy?").text)
+"""
+__version__ = "0.1.0"
+from .client import RAG, RAGAnswer
+from .rag_pipeline import ingest, query, RAGResponse
+from .providers import make_llm_provider, make_embedding_provider
+from .options import (
+    ChunkingConfig,
+    DocumentConfig,
+    QueryConfig,
+    RAGConfig,
+    SearchConfig,
+)
+# Public API of the package: every name listed here is imported above.
+__all__ = [
+    "__version__",
+    # High-level client and its answer type (from .client)
+    "RAG",
+    "RAGAnswer",
+    # Configuration objects (from .options)
+    "RAGConfig",
+    "ChunkingConfig",
+    "SearchConfig",
+    "DocumentConfig",
+    "QueryConfig",
+    # Pipeline entry points and response type (from .rag_pipeline)
+    "ingest",
+    "query",
+    "RAGResponse",
+    # Provider factories (from .providers)
+    "make_llm_provider",
+    "make_embedding_provider",
+]

rag_python/chunking.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Chunking: recursive, structure-aware (headings/sections), and semantic (embedding-based)."""
+import re
+from dataclasses import dataclass
+from typing import Callable
+try:
+    import tiktoken
+except ImportError:
+    tiktoken = None
+@dataclass
+class Chunk:
+    """Single chunk with text and metadata."""
+    # Chunk content as produced by one of the chunk_* functions in this module.
+    text: str
+    # Free-form metadata. The chunkers in this module always set "chunk_strategy";
+    # structure-aware chunks also carry "section" (and "section_part" when a
+    # section had to be split further).
+    metadata: dict
+# --- Recursive: split by section → paragraph → sentence → tokens ---
+RECURSIVE_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", " ", ""]
+def _split_by_tokens(text: str, chunk_size: int, overlap: int, encoding_name: str = "cl100k_base") -> list[str]:
+    """Split ``text`` into fixed-size windows that overlap by ``overlap`` units.
+    Sizes are counted in tiktoken tokens when tiktoken is importable. Otherwise a
+    rough 4-characters-per-token estimate is used and the text is sliced by
+    characters.
+    Args:
+        text: Text to split.
+        chunk_size: Window size in tokens; must be positive.
+        overlap: Tokens shared by consecutive windows. Clamped to
+            ``0 <= overlap < chunk_size`` so the window always moves forward.
+        encoding_name: tiktoken encoding used when tiktoken is available.
+    Returns:
+        The windows in order; an empty list for empty ``text``.
+    Raises:
+        ValueError: If ``chunk_size`` is not positive.
+    """
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+    # An overlap >= chunk_size would give a zero or negative stride (endless
+    # loop); a negative overlap would silently skip text between windows.
+    overlap = max(0, min(overlap, chunk_size - 1))
+    def _windows(units, size, step):
+        for start in range(0, len(units), step):
+            yield units[start:start + size]
+            if start + size >= len(units):
+                # This window already reached the end; do not emit a pure-overlap tail.
+                return
+    if not tiktoken:
+        chars_per_token = 4  # heuristic used when no tokenizer is installed
+        return list(_windows(text, chunk_size * chars_per_token, (chunk_size - overlap) * chars_per_token))
+    enc = tiktoken.get_encoding(encoding_name)
+    return [enc.decode(window) for window in _windows(enc.encode(text), chunk_size, chunk_size - overlap)]
+def _recursive_split(text: str, separators: list[str], chunk_size: int, overlap: int) -> list[str]:
+    """Split ``text`` on the coarsest separator and pack the parts into chunks.
+    The first entry of ``separators`` is tried first. If it does not occur in
+    ``text`` the next one is tried, and the empty separator falls back to
+    fixed-size token windows. Parts are packed greedily up to a budget of
+    ``chunk_size * 4`` characters (about 4 characters per token).
+    Args:
+        text: Text to split.
+        separators: Separators ordered from coarse to fine; "" means "split by tokens".
+        chunk_size: Target chunk size in tokens.
+        overlap: Tokens of the previous chunk repeated at the start of the next one.
+    Returns:
+        Stripped, non-empty chunk strings; an empty list for blank input.
+    """
+    if not text.strip():
+        return []
+    sep = separators[0] if separators else ""
+    if sep == "":
+        return _split_by_tokens(text, chunk_size, overlap)
+    parts = text.split(sep)
+    if len(parts) == 1:
+        return _recursive_split(text, separators[1:], chunk_size, overlap)
+    budget = chunk_size * 4
+    tail_len = max(0, overlap) * 4
+    last_index = len(parts) - 1
+    chunks = []
+    current = ""
+    for index, part in enumerate(parts):
+        # Re-attach the separator between parts only: the text is reproduced
+        # faithfully, and nothing is appended after the final part.
+        piece = part if index == last_index else part + sep
+        if len(current) + len(piece) <= budget:
+            current += piece
+            continue
+        if current.strip():
+            chunks.append(current.strip())
+        if len(piece) > budget:
+            # A single part larger than the budget is split with the finer separators.
+            chunks.extend(_recursive_split(piece, separators[1:], chunk_size, overlap))
+            current = ""
+        else:
+            # The overlap comes from the end of the chunk that was just closed.
+            current = (current[-tail_len:] if tail_len else "") + piece
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
+def chunk_recursive(
+    text: str,
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+) -> list[Chunk]:
+    """Chunk ``text`` with the recursive separator cascade.
+    Each returned chunk gets its own copy of ``metadata`` with
+    ``"chunk_strategy"`` set to ``"recursive"``; blank pieces are dropped.
+    """
+    base_meta = {**(metadata or {}), "chunk_strategy": "recursive"}
+    chunks: list[Chunk] = []
+    for piece in _recursive_split(text, RECURSIVE_SEPARATORS, chunk_size, overlap):
+        if not piece.strip():
+            continue
+        chunks.append(Chunk(text=piece, metadata=dict(base_meta)))
+    return chunks
+HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
+def _structure_sections(text: str) -> list[tuple[str, str]]:
+    """Split ``text`` into ``(title, content)`` pairs at markdown-style headings.
+    Text before the first heading is filed under the title ``"Document"``. A
+    heading that is directly followed by another heading contributes no
+    section. Lines inside ``` fenced code blocks are never treated as headings,
+    so ``# comment`` lines in code stay with their section.
+    Returns:
+        Sections in document order; content lines are joined with "\\n".
+    """
+    sections = []
+    current_title = "Document"
+    current_content = []
+    in_fence = False
+    for line in text.splitlines():
+        if line.lstrip().startswith("```"):
+            # Opening and closing fences both toggle the state; the fence line
+            # itself can never match the heading pattern.
+            in_fence = not in_fence
+        heading = None if in_fence else HEADING_PATTERN.match(line)
+        if heading is None:
+            current_content.append(line)
+            continue
+        if current_content:
+            sections.append((current_title, "\n".join(current_content)))
+        current_title = heading.group(2).strip()
+        current_content = []
+    if current_content:
+        sections.append((current_title, "\n".join(current_content)))
+    return sections
+def chunk_structure_aware(
+    text: str,
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+) -> list[Chunk]:
+    """Chunk ``text`` section by section, following its headings.
+    A section whose stripped content fits in ``chunk_size * 4`` characters
+    becomes one chunk. A longer section is split recursively, starting from
+    the second entry of ``RECURSIVE_SEPARATORS``, and each part records its
+    index as ``"section_part"``. Every chunk text is prefixed with
+    ``## <title>`` and carries ``"section"`` in its metadata.
+    """
+    base_meta = {**(metadata or {}), "chunk_strategy": "structure_aware"}
+    char_budget = chunk_size * 4
+    result: list[Chunk] = []
+    for title, raw_content in _structure_sections(text):
+        body = raw_content.strip()
+        if not body:
+            continue
+        if len(body) <= char_budget:
+            result.append(Chunk(text=f"## {title}\n\n{body}", metadata={**base_meta, "section": title}))
+            continue
+        pieces = _recursive_split(body, RECURSIVE_SEPARATORS[1:], chunk_size, overlap)
+        for part_index, piece in enumerate(pieces):
+            if not piece.strip():
+                continue
+            result.append(Chunk(
+                text=f"## {title}\n\n{piece.strip()}",
+                metadata={**base_meta, "section": title, "section_part": part_index},
+            ))
+    return result
+def chunk_semantic(
+    text: str,
+    embed_fn: Callable[[list[str]], list[list[float]]],
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+    similarity_threshold: float = 0.7,
+) -> list[Chunk]:
+    """Group sentences into chunks of roughly ``chunk_size * 3`` characters.
+    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
+    Sentences are accumulated until the size threshold is reached; the last
+    quarter of the sentences (at least one) is carried into the next chunk as
+    overlap. Text with at most one sentence is delegated to ``chunk_recursive``.
+    NOTE(review): ``embed_fn`` and ``similarity_threshold`` are accepted but not
+    used by this implementation, and ``overlap`` is only forwarded to the
+    ``chunk_recursive`` fallback. No embeddings are computed here.
+    Returns:
+        Chunks whose metadata has ``"chunk_strategy"`` set to ``"semantic"``
+        (or ``"recursive"`` when the fallback is taken).
+    """
+    segments = re.split(r"(?<=[.!?])\s+", text)
+    if len(segments) <= 1:
+        return chunk_recursive(text, chunk_size, overlap, metadata)
+    meta = dict(metadata or {})
+    meta["chunk_strategy"] = "semantic"
+    threshold = chunk_size * 3
+    chunks = []
+    current = []
+    current_len = 0
+    fresh = 0  # sentences added since the last emitted chunk
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+        current.append(seg)
+        current_len += len(seg)
+        fresh += 1
+        if current_len < threshold:
+            continue
+        chunks.append(Chunk(text=" ".join(current), metadata={**meta}))
+        overlap_segs = max(1, len(current) // 4)
+        current = current[-overlap_segs:]
+        current_len = sum(len(s) for s in current)
+        fresh = 0
+        if current_len >= threshold:
+            # The carried overlap alone already fills a chunk (one oversized
+            # sentence); carrying it would only duplicate it in the next chunk.
+            current = []
+            current_len = 0
+    # Emit the remainder only if it holds sentences not already in a chunk;
+    # otherwise it would be a duplicate made purely of overlap.
+    if current and fresh:
+        chunks.append(Chunk(text=" ".join(current), metadata={**meta}))
+    return chunks
+def chunk_text(
+    text: str,
+    strategy: str = "recursive",
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+    embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
+) -> list[Chunk]:
+    """Dispatch to the chunker selected by ``strategy``.
+    ``"structure_aware"`` and ``"semantic"`` select their chunkers; semantic
+    chunking is used only when ``embed_fn`` is supplied. Any other value,
+    including ``"semantic"`` without ``embed_fn``, uses recursive chunking.
+    """
+    if strategy == "structure_aware":
+        return chunk_structure_aware(text, chunk_size, overlap, metadata)
+    wants_semantic = strategy == "semantic"
+    if wants_semantic and embed_fn:
+        return chunk_semantic(
+            text,
+            embed_fn,
+            chunk_size,
+            overlap,
+            metadata,
+            similarity_threshold=0.7,
+        )
+    return chunk_recursive(text, chunk_size, overlap, metadata)

rag_python/cleaning.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""Text cleaning & normalization. Garbage in → hallucination out."""
+import re
+try:
+    from langdetect import detect, LangDetectException
+except ImportError:
+    detect = None
+    LangDetectException = Exception
+def normalize_whitespace(text: str) -> str:
+    """Replace every run of whitespace with one space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip()
+def remove_header_footer_candidates(text: str, min_line_len: int = 10) -> str:
+    """Trim header/footer-looking lines from the top and bottom of ``text``.
+    A line is a candidate when its stripped form is shorter than
+    ``min_line_len`` or consists only of digits, whitespace, ``-``, ``.`` and
+    ``/`` (page numbers, dates). Only leading and trailing runs of candidates
+    are removed; lines in the middle are untouched.
+    Texts with fewer than five lines are returned unchanged, and so are texts
+    in which every line is a candidate: there is no body to anchor on, and
+    trimming would discard the whole document.
+    Returns:
+        The remaining lines joined with "\\n", or ``text`` itself when nothing
+        can safely be trimmed.
+    """
+    lines = text.splitlines()
+    if len(lines) < 5:
+        return text
+    def is_likely_header_footer(line: str) -> bool:
+        s = line.strip()
+        if len(s) < min_line_len:
+            return True
+        return re.match(r"^[\d\s\-\.\/]+$", s) is not None  # page numbers, dates
+    start = 0
+    while start < len(lines) and is_likely_header_footer(lines[start]):
+        start += 1
+    if start == len(lines):
+        # Every line looks like boilerplate (e.g. a list of short items):
+        # keep the text rather than returning an empty document.
+        return text
+    end = len(lines)
+    while end > start and is_likely_header_footer(lines[end - 1]):
+        end -= 1
+    return "\n".join(lines[start:end])
+def deduplicate_sentences(text: str) -> str:
+    """Drop repeated lines, keeping the first occurrence of each.
+    Blank lines are discarded and every kept line is whitespace-normalized.
+    Two lines count as duplicates when the first 200 characters of their
+    lower-cased, normalized forms are equal, wherever they occur in the text.
+    """
+    seen_keys: set[str] = set()
+    kept: list[str] = []
+    for raw_line in text.splitlines():
+        if not raw_line.strip():
+            continue
+        line = normalize_whitespace(raw_line)
+        key = line.lower()[:200]
+        if key not in seen_keys:
+            seen_keys.add(key)
+            kept.append(line)
+    return "\n".join(kept)
+def preserve_blocks(text: str) -> str:
+    """Normalize whitespace outside ``` fenced code blocks; keep fenced content verbatim.
+    The text is split at code fences. Prose between fences is collapsed with
+    ``normalize_whitespace``, while everything inside a fence is copied
+    unchanged. A line break is kept between prose and an adjacent fence so the
+    fence markers stay at the start of a line instead of being glued to the
+    surrounding text.
+    NOTE(review): only ``` fences are recognised; markdown tables receive no
+    special handling here.
+    """
+    out = []
+    in_code = False
+    for part in re.split(r"(```[\w]*\n?|```)", text):
+        if part.startswith("```"):
+            if not in_code and out and not out[-1].endswith("\n"):
+                # Opening fence directly after prose: start it on its own line.
+                out.append("\n")
+            in_code = not in_code
+            out.append(part)
+            continue
+        if in_code:
+            out.append(part)
+            continue
+        cleaned = normalize_whitespace(part)
+        if not cleaned:
+            continue
+        if out and not out[-1].endswith("\n"):
+            # Prose directly after a closing fence: keep the fence line intact.
+            out.append("\n")
+        out.append(cleaned)
+    return "".join(out)
+def detect_language(text: str) -> str | None:
+    """Detect the language of ``text`` from its first 2000 characters.
+    Returns the value produced by ``langdetect.detect``, or ``None`` when
+    langdetect is not installed or detection raises ``LangDetectException``.
+    """
+    if not detect:
+        return None
+    sample = text[:2000]
+    try:
+        language = detect(sample)
+    except LangDetectException:
+        return None
+    return language
+def clean_document(
+    text: str,
+    *,
+    normalize_ws: bool = True,
+    remove_headers_footers: bool = True,
+    dedupe: bool = True,
+    preserve_code_tables: bool = True,
+    min_lang_length: int = 50,
+) -> str:
+    """Run the cleaning steps in order and return the stripped result.
+    Steps: whitespace handling (``preserve_blocks`` when
+    ``preserve_code_tables`` is set, otherwise ``normalize_whitespace`` when
+    ``normalize_ws`` is set), header/footer trimming, line de-duplication, and,
+    when both ``normalize_ws`` and ``preserve_code_tables`` are set, merging of
+    single newlines into spaces.
+    NOTE(review): ``min_lang_length`` is not read anywhere in this function.
+    NOTE(review): the final newline merge runs over the whole text, fenced
+    code blocks included. Confirm this is intended.
+    """
+    if preserve_code_tables:
+        text = preserve_blocks(text)
+    elif normalize_ws:
+        text = normalize_whitespace(text)
+    if remove_headers_footers:
+        text = remove_header_footer_candidates(text)
+    if dedupe:
+        text = deduplicate_sentences(text)
+    merge_single_newlines = normalize_ws and preserve_code_tables
+    if merge_single_newlines:
+        # A newline with no newline on either side becomes a space; runs of
+        # spaces are then collapsed.
+        text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
+        text = re.sub(r" +", " ", text)
+    return text.strip()

rag_python/cli.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""rag-python command-line interface."""
+import argparse
+from .client import RAG
+def _build_rag(args: argparse.Namespace) -> RAG:
+    """Construct a ``RAG`` client from the provider options parsed on the command line."""
+    option_names = (
+        "llm_provider",
+        "llm_model",
+        "embedding_provider",
+        "embedding_model",
+        "openai_api_key",
+        "azure_endpoint",
+        "azure_api_key",
+        "azure_api_version",
+        "anthropic_api_key",
+        "gemini_api_key",
+        "ollama_base_url",
+    )
+    provider_options = {name: getattr(args, name) for name in option_names}
+    return RAG(**provider_options)
+def _add_provider_args(parser: argparse.ArgumentParser) -> None:
+    """Register the provider/model/credential options shared by all sub-commands."""
+    llm_choices = ["openai", "azure_openai", "anthropic", "gemini", "ollama"]
+    embedding_choices = ["openai", "azure_openai", "ollama"]
+    parser.add_argument("--llm-provider", default="openai", choices=llm_choices)
+    parser.add_argument("--llm-model", default=None)
+    parser.add_argument("--embedding-provider", default="openai", choices=embedding_choices)
+    parser.add_argument("--embedding-model", default=None)
+    # Optional connection settings and credentials; all default to None.
+    for flag in (
+        "--ollama-base-url",
+        "--azure-endpoint",
+        "--azure-api-key",
+        "--azure-api-version",
+        "--openai-api-key",
+        "--anthropic-api-key",
+        "--gemini-api-key",
+    ):
+        parser.add_argument(flag, default=None)
+def main() -> None:
+    """Entry point of the ``rag-python`` command.
+    Sub-commands:
+        ingest PATH [PATH ...] [--reindex]: ingest files/folders and print the chunk count.
+        query WORD [WORD ...] [--no-multi-query] [-v]: print the answer; with
+            ``-v`` also print the evaluation and up to five sources.
+    Both sub-commands accept the provider options added by ``_add_provider_args``.
+    """
+    parser = argparse.ArgumentParser(
+        prog="rag-python",
+        description="rag-python — modular RAG with query rewriting, reranking, guardrails, and multi-LLM support.",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+    ing = sub.add_parser("ingest", help="Ingest files/folders into the vector store")
+    ing.add_argument("paths", nargs="+", help="Files or folders to ingest")
+    ing.add_argument("--reindex", action="store_true", help="Clear vector store and re-ingest")
+    _add_provider_args(ing)
+    q = sub.add_parser("query", help="Ask a question against ingested documents")
+    q.add_argument("question", nargs="+", help="Question text")
+    q.add_argument("--no-multi-query", action="store_true")
+    q.add_argument("-v", "--verbose", action="store_true")
+    _add_provider_args(q)
+    args = parser.parse_args()
+    if args.command == "ingest":
+        rag = _build_rag(args)
+        n = rag.ingest(args.paths, reindex=args.reindex)
+        print(f"Ingested {n} chunks.")
+        return
+    if args.command == "query":
+        rag = _build_rag(args)
+        question = " ".join(args.question)
+        # RAG.query() only accepts `search` / `query_config` overrides; passing
+        # a `multi_query=` keyword raises TypeError. The flag is therefore
+        # translated into a SearchConfig override.
+        if args.no_multi_query:
+            from dataclasses import replace
+            # NOTE(review): assumes multi_query_n=1 restricts retrieval to a
+            # single query. Confirm against rag_pipeline / SearchConfig.
+            ans = rag.query(question, search=replace(rag.config.search, multi_query_n=1))
+        else:
+            ans = rag.query(question)
+        print(ans.text)
+        if args.verbose:
+            print("\n--- evaluation ---")
+            print(ans.evaluation)
+            print("\n--- sources ---")
+            for s in ans.sources[:5]:
+                print(s.get("metadata", {}).get("source", ""), "score:", s.get("score"))
+if __name__ == "__main__":
+    main()

rag_python/client.py ADDED Viewed

@@ -0,0 +1,190 @@
+"""High-level RAG client API.
+This wraps the full RAG pipeline behind a simple interface:
+    from rag_python import RAG, RAGConfig, ChunkingConfig, SearchConfig
+    rag = RAG(
+        llm_model="gpt-4o-mini",
+        embedding_provider="openai",
+        embedding_model="text-embedding-3-small",
+        config=RAGConfig(
+            chunking=ChunkingConfig(strategy="recursive", chunk_size=512),
+            search=SearchConfig(retriever="multi_query", top_k_retrieve=20),
+        ),
+    )
+    rag.ingest(["./docs", "./policies.pdf", "README.md"])
+    answer = rag.query("What is our leave policy?")
+    print(answer.text)
+"""
+from dataclasses import dataclass, replace
+from pathlib import Path
+from typing import Iterable
+from .config import CHROMA_PERSIST_DIR, DATA_DIR, EMBEDDING_MODEL, LLM_MODEL
+from .options import (
+    ChunkingConfig,
+    DocumentConfig,
+    QueryConfig,
+    RAGConfig,
+    SearchConfig,
+)
+from .providers import make_llm_provider, make_embedding_provider
+from .rag_pipeline import ingest as _ingest, query as _query, RAGResponse
+from .vector_store import set_persist_dir
+@dataclass
+class RAGAnswer:
+    """Result object returned by ``RAG.query``, copied field by field from the pipeline's ``RAGResponse``."""
+    # Answer text (``RAGResponse.answer``).
+    text: str
+    # Source entries (``RAGResponse.sources``); the CLI reads "metadata" and "score" from each dict.
+    sources: list[dict]
+    # Evaluation data (``RAGResponse.evaluation``).
+    evaluation: dict
+    # Retry flag (``RAGResponse.retried``).
+    retried: bool
+class RAG:
+    """User-facing RAG client with configurable chunking, retrieval, and embeddings.
+    Construction resolves models and directories, merges the shorthand keyword
+    overrides into a ``RAGConfig``, and builds the LLM and embedding providers.
+    ``ingest`` and ``query`` delegate to the pipeline functions with that state.
+    """
+    def __init__(
+        self,
+        *,
+        llm_provider: str = "openai",
+        llm_model: str | None = None,
+        embedding_provider: str = "openai",
+        embedding_model: str | None = None,
+        data_dir: str | Path | None = None,
+        chroma_dir: str | Path | None = None,
+        config: RAGConfig | None = None,
+        # Shorthand overrides (merged into ``config`` when provided)
+        chunk_strategy: str | None = None,
+        chunk_size: int | None = None,
+        chunk_overlap: int | None = None,
+        retriever: str | None = None,
+        top_k_retrieve: int | None = None,
+        top_k_rerank: int | None = None,
+        multi_query_n: int | None = None,
+        rerank_enabled: bool | None = None,
+        document_extensions: tuple[str, ...] | None = None,
+        # Provider kwargs (optional)
+        openai_api_key: str | None = None,
+        azure_endpoint: str | None = None,
+        azure_api_key: str | None = None,
+        azure_api_version: str | None = None,
+        anthropic_api_key: str | None = None,
+        gemini_api_key: str | None = None,
+        ollama_base_url: str | None = None,
+    ) -> None:
+        """Create a client.
+        Args:
+            llm_provider, embedding_provider: Provider names passed to the provider factories.
+            llm_model, embedding_model: Model names; default to ``LLM_MODEL`` / ``EMBEDDING_MODEL``.
+            data_dir: Directory used when documents are copied before ingestion; defaults to ``DATA_DIR``.
+            chroma_dir: Vector-store directory; defaults to ``CHROMA_PERSIST_DIR``.
+            config: Base configuration; a default ``RAGConfig`` is used when omitted.
+            chunk_strategy ... document_extensions: Shorthand overrides applied on top of
+                ``config``. The caller's ``config`` object is left unmodified.
+            openai_api_key ... ollama_base_url: Credentials and endpoints for the providers.
+                The key matching the selected provider is preferred.
+        """
+        self.llm_provider_name = llm_provider
+        self.embedding_provider_name = embedding_provider
+        self.llm_model = llm_model or LLM_MODEL
+        self.embedding_model = embedding_model or EMBEDDING_MODEL
+        self.data_dir = Path(data_dir) if data_dir else Path(DATA_DIR)
+        if chroma_dir:
+            set_persist_dir(chroma_dir)
+        elif CHROMA_PERSIST_DIR:
+            set_persist_dir(CHROMA_PERSIST_DIR)
+        self.config = config or RAGConfig()
+        chunking_overrides = self._given(
+            strategy=chunk_strategy,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+        search_overrides = self._given(
+            retriever=retriever,
+            top_k_retrieve=top_k_retrieve,
+            top_k_rerank=top_k_rerank,
+            multi_query_n=multi_query_n,
+            rerank_enabled=rerank_enabled,
+        )
+        document_overrides = self._given(extensions=document_extensions)
+        if chunking_overrides or search_overrides or document_overrides:
+            import copy
+            # Sub-configs are replaced, never mutated, so a shallow copy is
+            # enough to keep a caller-supplied RAGConfig untouched.
+            self.config = copy.copy(self.config)
+        if chunking_overrides:
+            self.config.chunking = replace(self.config.chunking, **chunking_overrides)  # type: ignore[arg-type]
+        if search_overrides:
+            self.config.search = replace(self.config.search, **search_overrides)  # type: ignore[arg-type]
+        if document_overrides:
+            self.config.documents = replace(self.config.documents, **document_overrides)
+        # Prefer the key that belongs to the selected provider. The old
+        # "first key that is set" chain is kept only as a fallback, so a
+        # client given both an OpenAI key (for embeddings) and e.g. an
+        # Anthropic key no longer sends the OpenAI key to Anthropic.
+        llm_keys = {
+            "openai": openai_api_key,
+            "azure_openai": azure_api_key,
+            "anthropic": anthropic_api_key,
+            "gemini": gemini_api_key,
+        }
+        embedding_keys = {
+            "openai": openai_api_key,
+            "azure_openai": azure_api_key,
+        }
+        self.llm = make_llm_provider(
+            llm_provider,  # type: ignore[arg-type]
+            api_key=(
+                llm_keys.get(llm_provider)
+                or openai_api_key
+                or anthropic_api_key
+                or gemini_api_key
+                or azure_api_key
+            ),
+            azure_endpoint=azure_endpoint,
+            api_version=azure_api_version,
+            base_url=ollama_base_url,
+        )
+        self.embedder = make_embedding_provider(
+            embedding_provider,  # type: ignore[arg-type]
+            api_key=embedding_keys.get(embedding_provider) or openai_api_key or azure_api_key,
+            azure_endpoint=azure_endpoint,
+            api_version=azure_api_version,
+            base_url=ollama_base_url,
+        )
+    @staticmethod
+    def _given(**values: object) -> dict:
+        """Return only the keyword arguments whose value is not ``None``."""
+        return {name: value for name, value in values.items() if value is not None}
+    def _stage_in_data_dir(self, path_list: list[Path]) -> None:
+        """Copy the given files (and files under the given directories) into ``self.data_dir``."""
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        for p in path_list:
+            if p.is_file():
+                target = self.data_dir / p.name
+                # Skip files that already live at the target location.
+                if str(p.resolve()) != str(target.resolve()):
+                    target.write_bytes(p.read_bytes())
+            elif p.is_dir():
+                for f in p.rglob("*"):
+                    if not f.is_file():
+                        continue
+                    # The layout below the directory is mirrored in data_dir.
+                    target = self.data_dir / f.relative_to(p)
+                    target.parent.mkdir(parents=True, exist_ok=True)
+                    if str(f.resolve()) != str(target.resolve()):
+                        target.write_bytes(f.read_bytes())
+    def ingest(self, paths: Iterable[str | Path], *, reindex: bool = False) -> int:
+        """Ingest one or more files/directories into the vector store.
+        When ``config.documents.copy_to_data_dir`` is set, the inputs are first
+        copied into ``data_dir`` and that directory is ingested. Otherwise the
+        given paths are ingested directly.
+        Args:
+            paths: Files or directories; a single ``str``/``Path`` is also accepted.
+            reindex: Forwarded to the pipeline's ``ingest``.
+        Returns:
+            The value returned by the pipeline's ``ingest`` (the CLI reports it
+            as the number of ingested chunks).
+        """
+        if isinstance(paths, (str, Path)):
+            # A bare string would otherwise be iterated character by character.
+            paths = [paths]
+        path_list = [Path(p) for p in paths]
+        doc_cfg: DocumentConfig = self.config.documents
+        chunk_cfg: ChunkingConfig = self.config.chunking
+        shared = dict(
+            clean=doc_cfg.clean,
+            chunk_strategy=chunk_cfg.strategy,
+            chunk_size=chunk_cfg.chunk_size,
+            chunk_overlap=chunk_cfg.chunk_overlap,
+            extensions=doc_cfg.extensions,
+            reindex=reindex,
+            embedding_model=self.embedding_model,
+            embedder=self.embedder,
+        )
+        if doc_cfg.copy_to_data_dir:
+            self._stage_in_data_dir(path_list)
+            return _ingest(data_path=self.data_dir, **shared)
+        return _ingest(paths=path_list, **shared)
+    def query(
+        self,
+        question: str,
+        *,
+        search: SearchConfig | None = None,
+        query_config: QueryConfig | None = None,
+    ) -> RAGAnswer:
+        """Run a full RAG query and return a friendly answer object.
+        Args:
+            question: The user question.
+            search: Per-call search settings; defaults to ``config.search``.
+            query_config: Per-call query settings; defaults to ``config.query``.
+        Returns:
+            A ``RAGAnswer`` built from the pipeline's ``RAGResponse``.
+        """
+        resp: RAGResponse = _query(
+            question,
+            search=search or self.config.search,
+            query_config=query_config or self.config.query,
+            llm_model=self.llm_model,
+            embedding_model=self.embedding_model,
+            llm=self.llm,
+            embedder=self.embedder,
+        )
+        return RAGAnswer(
+            text=resp.answer,
+            sources=resp.sources,
+            evaluation=resp.evaluation,
+            retried=resp.retried,
+        )

rag_python/config.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Configuration loaded from environment variables."""
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
+# API keys (provider-specific)
+# None when OPENAI_API_KEY is not set in the environment or a loaded .env file.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+# Models
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
+LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")
+# Paths — default to current working directory (works when installed from PyPI)
+# NOTE: evaluated once at import time, so the working directory at first import decides the defaults.
+PROJECT_ROOT = Path.cwd()
+DATA_DIR = Path(os.getenv("RAG_PYTHON_DATA_DIR", PROJECT_ROOT / "data"))
+CHROMA_PERSIST_DIR = Path(os.getenv("RAG_PYTHON_CHROMA_DIR", PROJECT_ROOT / "chroma_db"))
+# Chunking
+# The int(...) conversions raise ValueError at import time if a variable holds a non-integer value.
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "512"))
+CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "64"))
+CHUNK_STRATEGY = os.getenv("CHUNK_STRATEGY", "recursive")  # recursive | structure_aware | semantic
+# Retrieval
+TOP_K_RETRIEVE = int(os.getenv("TOP_K_RETRIEVE", "20"))
+TOP_K_RERANK = int(os.getenv("TOP_K_RERANK", "5"))
+MULTI_QUERY_N = int(os.getenv("MULTI_QUERY_N", "3"))
+def _env_flag(name: str, default: bool) -> bool:
+    """Read a boolean environment variable.
+    Returns ``default`` when the variable is unset. Otherwise the value is true
+    for "1", "true", "yes" or "on" (case-insensitive, surrounding whitespace
+    ignored) and false for anything else.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in {"1", "true", "yes", "on"}
+# Guardrails
+GUARDRAILS_ENABLED = _env_flag("GUARDRAILS_ENABLED", True)
+MAX_RETRIES = int(os.getenv("MAX_RETRIES", "2"))
+# Reranker (optional extra: pip install rag-python[rerank])
+RERANKER_MODEL = os.getenv("RERANKER_MODEL", "BAAI/bge-reranker-base")
+RERANK_ENABLED = _env_flag("RERANK_ENABLED", True)