PyPI - rag-python - Versions diffs - 0.1.0__tar.gz - Mend

rag-python 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

rag_python-0.1.0/LICENSE +22 -0
rag_python-0.1.0/PKG-INFO +158 -0
rag_python-0.1.0/README.md +111 -0
rag_python-0.1.0/pyproject.toml +67 -0
rag_python-0.1.0/setup.cfg +4 -0
rag_python-0.1.0/src/rag_python/__init__.py +39 -0
rag_python-0.1.0/src/rag_python/chunking.py +181 -0
rag_python-0.1.0/src/rag_python/cleaning.py +102 -0
rag_python-0.1.0/src/rag_python/cli.py +77 -0
rag_python-0.1.0/src/rag_python/client.py +190 -0
rag_python-0.1.0/src/rag_python/config.py +37 -0
rag_python-0.1.0/src/rag_python/document_loaders.py +74 -0
rag_python-0.1.0/src/rag_python/evaluation.py +105 -0
rag_python-0.1.0/src/rag_python/generation.py +35 -0
rag_python-0.1.0/src/rag_python/guardrails.py +66 -0
rag_python-0.1.0/src/rag_python/options.py +68 -0
rag_python-0.1.0/src/rag_python/providers/__init__.py +5 -0
rag_python-0.1.0/src/rag_python/providers/anthropic_provider.py +41 -0
rag_python-0.1.0/src/rag_python/providers/azure_openai_provider.py +62 -0
rag_python-0.1.0/src/rag_python/providers/base.py +24 -0
rag_python-0.1.0/src/rag_python/providers/factory.py +53 -0
rag_python-0.1.0/src/rag_python/providers/gemini_provider.py +45 -0
rag_python-0.1.0/src/rag_python/providers/ollama_provider.py +56 -0
rag_python-0.1.0/src/rag_python/providers/openai_provider.py +46 -0
rag_python-0.1.0/src/rag_python/py.typed +0 -0
rag_python-0.1.0/src/rag_python/query_rewriting.py +65 -0
rag_python-0.1.0/src/rag_python/rag_pipeline.py +241 -0
rag_python-0.1.0/src/rag_python/reranker.py +64 -0
rag_python-0.1.0/src/rag_python/retrieval.py +61 -0
rag_python-0.1.0/src/rag_python/vector_store.py +91 -0
rag_python-0.1.0/src/rag_python.egg-info/PKG-INFO +158 -0
rag_python-0.1.0/src/rag_python.egg-info/SOURCES.txt +37 -0
rag_python-0.1.0/src/rag_python.egg-info/dependency_links.txt +1 -0
rag_python-0.1.0/src/rag_python.egg-info/entry_points.txt +2 -0
rag_python-0.1.0/src/rag_python.egg-info/requires.txt +28 -0
rag_python-0.1.0/src/rag_python.egg-info/top_level.txt +1 -0
rag_python-0.1.0/tests/test_config.py +19 -0
rag_python-0.1.0/tests/test_import.py +9 -0
rag_python-0.1.0/tests/test_package.py +15 -0

rag_python-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+MIT License
+Copyright (c) 2026
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

rag_python-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,158 @@
+Metadata-Version: 2.2
+Name: rag-python
+Version: 0.1.0
+Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
+Author-email: Raghav Singla <04raghavsingla28@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/RaghavOG/rag-python
+Project-URL: Repository, https://github.com/RaghavOG/rag-python
+Project-URL: Documentation, https://github.com/RaghavOG/rag-python#readme
+Project-URL: Issues, https://github.com/RaghavOG/rag-python/issues
+Keywords: rag,llm,embeddings,chromadb,openai,rag-python,retrieval-augmented-generation
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: openai>=1.12.0
+Requires-Dist: tiktoken>=0.5.0
+Requires-Dist: chromadb>=0.4.22
+Requires-Dist: pypdf>=3.17.0
+Requires-Dist: python-docx>=1.1.0
+Requires-Dist: langdetect>=1.0.9
+Requires-Dist: regex>=2023.0.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: requests>=2.31.0
+Provides-Extra: rerank
+Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
+Requires-Dist: torch>=2.0.0; extra == "rerank"
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
+Provides-Extra: gemini
+Requires-Dist: google-genai>=0.3.0; extra == "gemini"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Provides-Extra: all
+Requires-Dist: rag-python[anthropic,gemini,rerank]; extra == "all"
+# rag-python
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![GitHub](https://img.shields.io/badge/GitHub-RaghavOG%2Frag--python-blue)](https://github.com/RaghavOG/rag-python)
+**rag-python** is a production-oriented Python library for **Retrieval-Augmented Generation (RAG)**.
+Ingest your documents, ask questions, get grounded answers — with query rewriting, multi-query retrieval, reranking, guardrails, and multi-LLM support.
+**Author:** [Raghav Singla](https://github.com/RaghavOG)
+**Repository:** [github.com/RaghavOG/rag-python](https://github.com/RaghavOG/rag-python)
+---
+## Features
+- Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
+- Query pipeline: rewriting → multi-query retrieval → reranking
+- Generation with guardrails (prompt injection + hallucination checks)
+- Evaluation scores + self-correction retry loop
+- **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
+---
+## Install
+```bash
+pip install rag-python
+# or from source
+pip install -e .
+# with reranking + extra providers
+pip install -e ".[rerank,anthropic,gemini,all]"
+```
+---
+## Quickstart
+```python
+from rag_python import RAG
+rag = RAG(
+    llm_provider="openai",
+    llm_model="gpt-4o-mini",
+    embedding_provider="openai",
+    embedding_model="text-embedding-3-small",
+)
+rag.ingest(["./data"], reindex=True)
+answer = rag.query("How many days of annual leave?")
+print(answer.text)
+```
+### CLI
+```bash
+export OPENAI_API_KEY=sk-...
+rag-python ingest ./data --reindex
+rag-python query "How many days of annual leave?" -v
+```
+---
+## Environment variables
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `OPENAI_API_KEY` | For OpenAI | Default LLM + embeddings |
+| `ANTHROPIC_API_KEY` | For Claude | LLM only |
+| `GEMINI_API_KEY` | For Gemini | LLM only |
+| `AZURE_OPENAI_ENDPOINT` | For Azure | Azure OpenAI |
+| `AZURE_OPENAI_API_KEY` | For Azure | Azure OpenAI |
+| `OPENAI_API_VERSION` | Azure | Default `2023-09-01-preview` |
+| `OLLAMA_BASE_URL` | Ollama | Default `http://localhost:11434` |
+| `RAG_PYTHON_DATA_DIR` | Optional | Default `./data` |
+| `RAG_PYTHON_CHROMA_DIR` | Optional | Default `./chroma_db` |
+See [`.env.example`](.env.example) for all tuning options.
+---
+## Project structure
+```text
+.
+├── src/rag_python/      # Installable package (PyPI: rag-python)
+│   ├── client.py        # High-level RAG API
+│   ├── rag_pipeline.py  # Full pipeline
+│   └── providers/       # OpenAI, Azure, Anthropic, Gemini, Ollama
+├── tests/
+├── examples/
+├── docs/
+├── data/                # Sample documents
+├── pyproject.toml
+└── main.py              # Local dev CLI wrapper
+```
+---
+## Docs
+- [Usage](docs/USAGE.md)
+- [Providers](docs/PROVIDERS.md)
+- [Changelog](CHANGELOG.md)
+---
+## License
+MIT © [Raghav Singla](https://github.com/RaghavOG)

rag_python-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,111 @@
+# rag-python
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![GitHub](https://img.shields.io/badge/GitHub-RaghavOG%2Frag--python-blue)](https://github.com/RaghavOG/rag-python)
+**rag-python** is a production-oriented Python library for **Retrieval-Augmented Generation (RAG)**.
+Ingest your documents, ask questions, get grounded answers — with query rewriting, multi-query retrieval, reranking, guardrails, and multi-LLM support.
+**Author:** [Raghav Singla](https://github.com/RaghavOG)
+**Repository:** [github.com/RaghavOG/rag-python](https://github.com/RaghavOG/rag-python)
+---
+## Features
+- Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
+- Query pipeline: rewriting → multi-query retrieval → reranking
+- Generation with guardrails (prompt injection + hallucination checks)
+- Evaluation scores + self-correction retry loop
+- **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
+---
+## Install
+```bash
+pip install rag-python
+# or from source
+pip install -e .
+# with reranking + extra providers
+pip install -e ".[rerank,anthropic,gemini,all]"
+```
+---
+## Quickstart
+```python
+from rag_python import RAG
+rag = RAG(
+    llm_provider="openai",
+    llm_model="gpt-4o-mini",
+    embedding_provider="openai",
+    embedding_model="text-embedding-3-small",
+)
+rag.ingest(["./data"], reindex=True)
+answer = rag.query("How many days of annual leave?")
+print(answer.text)
+```
+### CLI
+```bash
+export OPENAI_API_KEY=sk-...
+rag-python ingest ./data --reindex
+rag-python query "How many days of annual leave?" -v
+```
+---
+## Environment variables
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `OPENAI_API_KEY` | For OpenAI | Default LLM + embeddings |
+| `ANTHROPIC_API_KEY` | For Claude | LLM only |
+| `GEMINI_API_KEY` | For Gemini | LLM only |
+| `AZURE_OPENAI_ENDPOINT` | For Azure | Azure OpenAI |
+| `AZURE_OPENAI_API_KEY` | For Azure | Azure OpenAI |
+| `OPENAI_API_VERSION` | Azure | Default `2023-09-01-preview` |
+| `OLLAMA_BASE_URL` | Ollama | Default `http://localhost:11434` |
+| `RAG_PYTHON_DATA_DIR` | Optional | Default `./data` |
+| `RAG_PYTHON_CHROMA_DIR` | Optional | Default `./chroma_db` |
+See [`.env.example`](.env.example) for all tuning options.
+---
+## Project structure
+```text
+.
+├── src/rag_python/      # Installable package (PyPI: rag-python)
+│   ├── client.py        # High-level RAG API
+│   ├── rag_pipeline.py  # Full pipeline
+│   └── providers/       # OpenAI, Azure, Anthropic, Gemini, Ollama
+├── tests/
+├── examples/
+├── docs/
+├── data/                # Sample documents
+├── pyproject.toml
+└── main.py              # Local dev CLI wrapper
+```
+---
+## Docs
+- [Usage](docs/USAGE.md)
+- [Providers](docs/PROVIDERS.md)
+- [Changelog](CHANGELOG.md)
+---
+## License
+MIT © [Raghav Singla](https://github.com/RaghavOG)

rag_python-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,67 @@
+[build-system]
+requires = ["setuptools>=61.0,<77.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "rag-python"
+version = "0.1.0"
+description = "Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation."
+readme = "README.md"
+license = { text = "MIT" }
+requires-python = ">=3.10"
+authors = [
+  { name = "Raghav Singla", email = "04raghavsingla28@gmail.com" },
+]
+keywords = ["rag", "llm", "embeddings", "chromadb", "openai", "rag-python", "retrieval-augmented-generation"]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: MIT License",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+  "openai>=1.12.0",
+  "tiktoken>=0.5.0",
+  "chromadb>=0.4.22",
+  "pypdf>=3.17.0",
+  "python-docx>=1.1.0",
+  "langdetect>=1.0.9",
+  "regex>=2023.0.0",
+  "python-dotenv>=1.0.0",
+  "requests>=2.31.0",
+]
+[project.optional-dependencies]
+rerank = ["sentence-transformers>=2.2.0", "torch>=2.0.0"]
+anthropic = ["anthropic>=0.20.0"]
+gemini = ["google-genai>=0.3.0"]
+dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine"]
+all = ["rag-python[rerank,anthropic,gemini]"]
+[project.scripts]
+rag-python = "rag_python.cli:main"
+[project.urls]
+Homepage = "https://github.com/RaghavOG/rag-python"
+Repository = "https://github.com/RaghavOG/rag-python"
+Documentation = "https://github.com/RaghavOG/rag-python#readme"
+Issues = "https://github.com/RaghavOG/rag-python/issues"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.setuptools.package-data]
+rag_python = ["py.typed"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["src"]
+[tool.ruff]
+line-length = 100
+target-version = "py310"

rag_python-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

rag_python-0.1.0/src/rag_python/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""rag-python — production-grade RAG for Python.
+Quick start::
+    from rag_python import RAG
+    rag = RAG(llm_model="gpt-4o-mini")
+    rag.ingest(["./docs"], reindex=True)
+    print(rag.query("What is our leave policy?").text)
+"""
+__version__ = "0.1.0"
+from .client import RAG, RAGAnswer
+from .rag_pipeline import ingest, query, RAGResponse
+from .providers import make_llm_provider, make_embedding_provider
+from .options import (
+    ChunkingConfig,
+    DocumentConfig,
+    QueryConfig,
+    RAGConfig,
+    SearchConfig,
+)
+# Public API of the package: the version string plus every name re-exported by
+# the imports above, i.e. what ``from rag_python import *`` provides.
+__all__ = [
+    "__version__",
+    "RAG",
+    "RAGAnswer",
+    "RAGConfig",
+    "ChunkingConfig",
+    "SearchConfig",
+    "DocumentConfig",
+    "QueryConfig",
+    "ingest",
+    "query",
+    "RAGResponse",
+    "make_llm_provider",
+    "make_embedding_provider",
+]

rag_python-0.1.0/src/rag_python/chunking.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Chunking: recursive, structure-aware (headings/sections), and semantic (embedding-based)."""
+import re
+from dataclasses import dataclass
+from typing import Callable
+try:
+    import tiktoken
+except ImportError:
+    tiktoken = None
+@dataclass
+class Chunk:
+    """Single chunk with text and metadata.
+
+    This is the value type returned by every ``chunk_*`` function in this module.
+    """
+    # The chunk's text content.
+    text: str
+    # Free-form metadata. The chunkers in this module copy the caller's metadata
+    # and add a "chunk_strategy" key (plus "section" / "section_part" for
+    # structure-aware chunks).
+    metadata: dict
+# --- Recursive: split by section → paragraph → sentence → tokens ---
+# Separators are tried in order, coarsest first. The trailing "" is a sentinel:
+# when _recursive_split reaches it, it stops splitting on text and hands the
+# remainder to _split_by_tokens.
+RECURSIVE_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", " ", ""]
+def _split_by_tokens(text: str, chunk_size: int, overlap: int, encoding_name: str = "cl100k_base") -> list[str]:
+    if not tiktoken:
+        size = chunk_size * 4
+        overlap_chars = overlap * 4
+        out = []
+        start = 0
+        while start < len(text):
+            end = min(start + size, len(text))
+            out.append(text[start:end])
+            start = end - overlap_chars if end < len(text) else len(text)
+        return out
+    enc = tiktoken.get_encoding(encoding_name)
+    tokens = enc.encode(text)
+    out = []
+    start = 0
+    while start < len(tokens):
+        end = min(start + chunk_size, len(tokens))
+        out.append(enc.decode(tokens[start:end]))
+        start = end - overlap if end < len(tokens) else len(tokens)
+    return out
+def _recursive_split(text: str, separators: list[str], chunk_size: int, overlap: int) -> list[str]:
+    if not text.strip():
+        return []
+    sep = separators[0] if separators else ""
+    if sep == "":
+        return _split_by_tokens(text, chunk_size, overlap)
+    parts = text.split(sep)
+    if len(parts) == 1:
+        return _recursive_split(text, separators[1:], chunk_size, overlap)
+    chunks = []
+    current = ""
+    for p in parts:
+        bit = p if sep in "\n" else p + sep
+        if len(current) + len(bit) <= chunk_size * 4:
+            current += bit
+        else:
+            if current.strip():
+                chunks.append(current.strip())
+            current = bit[-overlap * 4 :] + bit if overlap else bit
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
+def chunk_recursive(
+    text: str,
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+) -> list[Chunk]:
+    """Recursive chunking: section → paragraph → sentence → tokens."""
+    raw = _recursive_split(text, RECURSIVE_SEPARATORS, chunk_size, overlap)
+    meta = dict(metadata or {})
+    meta["chunk_strategy"] = "recursive"
+    return [Chunk(text=t, metadata={**meta}) for t in raw if t.strip()]
+HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
+def _structure_sections(text: str) -> list[tuple[str, str]]:
+    """Split by markdown-style headings; preserve content under each heading."""
+    sections = []
+    current_title = "Document"
+    current_content = []
+    for line in text.splitlines():
+        m = HEADING_PATTERN.match(line)
+        if m:
+            if current_content:
+                sections.append((current_title, "\n".join(current_content)))
+            current_title = m.group(2).strip()
+            current_content = []
+        else:
+            current_content.append(line)
+    if current_content:
+        sections.append((current_title, "\n".join(current_content)))
+    return sections
+def chunk_structure_aware(
+    text: str,
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+) -> list[Chunk]:
+    """Structure-aware: chunk by sections (headings); keep tables/code blocks intact."""
+    sections = _structure_sections(text)
+    meta = dict(metadata or {})
+    meta["chunk_strategy"] = "structure_aware"
+    chunks = []
+    for title, content in sections:
+        content = content.strip()
+        if not content:
+            continue
+        if len(content) <= chunk_size * 4:
+            chunks.append(Chunk(text=f"## {title}\n\n{content}", metadata={**meta, "section": title}))
+        else:
+            sub = _recursive_split(content, RECURSIVE_SEPARATORS[1:], chunk_size, overlap)
+            for i, t in enumerate(sub):
+                if t.strip():
+                    chunks.append(Chunk(
+                        text=f"## {title}\n\n{t.strip()}",
+                        metadata={**meta, "section": title, "section_part": i},
+                    ))
+    return chunks
+def chunk_semantic(
+    text: str,
+    embed_fn: Callable[[list[str]], list[list[float]]],
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+    similarity_threshold: float = 0.7,
+) -> list[Chunk]:
+    """Semantic chunking: approximate topic shifts and split."""
+    segments = re.split(r"(?<=[.!?])\s+", text)
+    if len(segments) <= 1:
+        return chunk_recursive(text, chunk_size, overlap, metadata)
+    meta = dict(metadata or {})
+    meta["chunk_strategy"] = "semantic"
+    chunks = []
+    current = []
+    current_len = 0
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+        current.append(seg)
+        current_len += len(seg)
+        if current_len >= chunk_size * 3:
+            chunk_text = " ".join(current)
+            chunks.append(Chunk(text=chunk_text, metadata={**meta}))
+            overlap_segs = max(1, len(current) // 4)
+            current = current[-overlap_segs:]
+            current_len = sum(len(s) for s in current)
+    if current:
+        chunks.append(Chunk(text=" ".join(current), metadata={**meta}))
+    return chunks
+def chunk_text(
+    text: str,
+    strategy: str = "recursive",
+    chunk_size: int = 512,
+    overlap: int = 64,
+    metadata: dict | None = None,
+    embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
+) -> list[Chunk]:
+    """Unified entry: recursive | structure_aware | semantic."""
+    if strategy == "structure_aware":
+        return chunk_structure_aware(text, chunk_size, overlap, metadata)
+    if strategy == "semantic" and embed_fn:
+        return chunk_semantic(text, embed_fn, chunk_size, overlap, metadata, similarity_threshold=0.7)
+    return chunk_recursive(text, chunk_size, overlap, metadata)

rag_python-0.1.0/src/rag_python/cleaning.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""Text cleaning & normalization. Garbage in → hallucination out."""
+import re
+try:
+    from langdetect import detect, LangDetectException
+except ImportError:
+    detect = None
+    LangDetectException = Exception
+def normalize_whitespace(text: str) -> str:
+    """Collapse runs of whitespace and strip."""
+    return re.sub(r"\s+", " ", text).strip()
+def remove_header_footer_candidates(text: str, min_line_len: int = 10) -> str:
+    """Remove lines that look like headers/footers (very short, repeated at top/bottom)."""
+    lines = text.splitlines()
+    if len(lines) < 5:
+        return text
+    def is_likely_header_footer(line: str) -> bool:
+        s = line.strip()
+        if len(s) < min_line_len:
+            return True
+        if re.match(r"^[\d\s\-\.\/]+$", s):  # page numbers, dates
+            return True
+        return False
+    start = 0
+    while start < len(lines) and is_likely_header_footer(lines[start]):
+        start += 1
+    end = len(lines)
+    while end > start and is_likely_header_footer(lines[end - 1]):
+        end -= 1
+    return "\n".join(lines[start:end])
+def deduplicate_sentences(text: str) -> str:
+    """Remove consecutive duplicate sentences (and near-duplicates by line)."""
+    lines = [normalize_whitespace(line) for line in text.splitlines() if line.strip()]
+    seen = set()
+    out = []
+    for line in lines:
+        key = line.lower()[:200]
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(line)
+    return "\n".join(out)
+def preserve_blocks(text: str) -> str:
+    """Normalize whitespace but preserve code blocks and tables (markdown-style)."""
+    out = []
+    in_code = False
+    for part in re.split(r"(```[\w]*\n?|```)", text):
+        if part.startswith("```"):
+            in_code = not in_code
+            out.append(part)
+            continue
+        if in_code:
+            out.append(part)
+            continue
+        out.append(normalize_whitespace(part))
+    return "".join(out) if out else text
+def detect_language(text: str) -> str | None:
+    """Return ISO language code or None if detection fails."""
+    if not detect:
+        return None
+    try:
+        sample = text[:2000] if len(text) > 2000 else text
+        return detect(sample)
+    except LangDetectException:
+        return None
+def clean_document(
+    text: str,
+    *,
+    normalize_ws: bool = True,
+    remove_headers_footers: bool = True,
+    dedupe: bool = True,
+    preserve_code_tables: bool = True,
+    min_lang_length: int = 50,
+) -> str:
+    """Full cleaning pipeline. Preserve code/tables; optionally skip non-English if desired."""
+    if normalize_ws and not preserve_code_tables:
+        text = normalize_whitespace(text)
+    elif preserve_code_tables:
+        text = preserve_blocks(text)
+    if remove_headers_footers:
+        text = remove_header_footer_candidates(text)
+    if dedupe:
+        text = deduplicate_sentences(text)
+    if normalize_ws and preserve_code_tables:
+        text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
+        text = re.sub(r" +", " ", text)
+    return text.strip()