rag-python 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. rag_python-0.1.0/LICENSE +22 -0
  2. rag_python-0.1.0/PKG-INFO +158 -0
  3. rag_python-0.1.0/README.md +111 -0
  4. rag_python-0.1.0/pyproject.toml +67 -0
  5. rag_python-0.1.0/setup.cfg +4 -0
  6. rag_python-0.1.0/src/rag_python/__init__.py +39 -0
  7. rag_python-0.1.0/src/rag_python/chunking.py +181 -0
  8. rag_python-0.1.0/src/rag_python/cleaning.py +102 -0
  9. rag_python-0.1.0/src/rag_python/cli.py +77 -0
  10. rag_python-0.1.0/src/rag_python/client.py +190 -0
  11. rag_python-0.1.0/src/rag_python/config.py +37 -0
  12. rag_python-0.1.0/src/rag_python/document_loaders.py +74 -0
  13. rag_python-0.1.0/src/rag_python/evaluation.py +105 -0
  14. rag_python-0.1.0/src/rag_python/generation.py +35 -0
  15. rag_python-0.1.0/src/rag_python/guardrails.py +66 -0
  16. rag_python-0.1.0/src/rag_python/options.py +68 -0
  17. rag_python-0.1.0/src/rag_python/providers/__init__.py +5 -0
  18. rag_python-0.1.0/src/rag_python/providers/anthropic_provider.py +41 -0
  19. rag_python-0.1.0/src/rag_python/providers/azure_openai_provider.py +62 -0
  20. rag_python-0.1.0/src/rag_python/providers/base.py +24 -0
  21. rag_python-0.1.0/src/rag_python/providers/factory.py +53 -0
  22. rag_python-0.1.0/src/rag_python/providers/gemini_provider.py +45 -0
  23. rag_python-0.1.0/src/rag_python/providers/ollama_provider.py +56 -0
  24. rag_python-0.1.0/src/rag_python/providers/openai_provider.py +46 -0
  25. rag_python-0.1.0/src/rag_python/py.typed +0 -0
  26. rag_python-0.1.0/src/rag_python/query_rewriting.py +65 -0
  27. rag_python-0.1.0/src/rag_python/rag_pipeline.py +241 -0
  28. rag_python-0.1.0/src/rag_python/reranker.py +64 -0
  29. rag_python-0.1.0/src/rag_python/retrieval.py +61 -0
  30. rag_python-0.1.0/src/rag_python/vector_store.py +91 -0
  31. rag_python-0.1.0/src/rag_python.egg-info/PKG-INFO +158 -0
  32. rag_python-0.1.0/src/rag_python.egg-info/SOURCES.txt +37 -0
  33. rag_python-0.1.0/src/rag_python.egg-info/dependency_links.txt +1 -0
  34. rag_python-0.1.0/src/rag_python.egg-info/entry_points.txt +2 -0
  35. rag_python-0.1.0/src/rag_python.egg-info/requires.txt +28 -0
  36. rag_python-0.1.0/src/rag_python.egg-info/top_level.txt +1 -0
  37. rag_python-0.1.0/tests/test_config.py +19 -0
  38. rag_python-0.1.0/tests/test_import.py +9 -0
  39. rag_python-0.1.0/tests/test_package.py +15 -0
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Raghav Singla
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
@@ -0,0 +1,158 @@
1
+ Metadata-Version: 2.2
2
+ Name: rag-python
3
+ Version: 0.1.0
4
+ Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
5
+ Author-email: Raghav Singla <04raghavsingla28@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/RaghavOG/rag-python
8
+ Project-URL: Repository, https://github.com/RaghavOG/rag-python
9
+ Project-URL: Documentation, https://github.com/RaghavOG/rag-python#readme
10
+ Project-URL: Issues, https://github.com/RaghavOG/rag-python/issues
11
+ Keywords: rag,llm,embeddings,chromadb,openai,rag-python,retrieval-augmented-generation
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: openai>=1.12.0
25
+ Requires-Dist: tiktoken>=0.5.0
26
+ Requires-Dist: chromadb>=0.4.22
27
+ Requires-Dist: pypdf>=3.17.0
28
+ Requires-Dist: python-docx>=1.1.0
29
+ Requires-Dist: langdetect>=1.0.9
30
+ Requires-Dist: regex>=2023.0.0
31
+ Requires-Dist: python-dotenv>=1.0.0
32
+ Requires-Dist: requests>=2.31.0
33
+ Provides-Extra: rerank
34
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
35
+ Requires-Dist: torch>=2.0.0; extra == "rerank"
36
+ Provides-Extra: anthropic
37
+ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
38
+ Provides-Extra: gemini
39
+ Requires-Dist: google-genai>=0.3.0; extra == "gemini"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=7.0; extra == "dev"
42
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
43
+ Requires-Dist: build; extra == "dev"
44
+ Requires-Dist: twine; extra == "dev"
45
+ Provides-Extra: all
46
+ Requires-Dist: rag-python[anthropic,gemini,rerank]; extra == "all"
47
+
48
+ # rag-python
49
+
50
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
52
+ [![GitHub](https://img.shields.io/badge/GitHub-RaghavOG%2Frag--python-blue)](https://github.com/RaghavOG/rag-python)
53
+
54
+ **rag-python** is a production-oriented Python library for **Retrieval-Augmented Generation (RAG)**.
55
+
56
+ Ingest your documents, ask questions, get grounded answers — with query rewriting, multi-query retrieval, reranking, guardrails, and multi-LLM support.
57
+
58
+ **Author:** [Raghav Singla](https://github.com/RaghavOG)
59
+ **Repository:** [github.com/RaghavOG/rag-python](https://github.com/RaghavOG/rag-python)
60
+
61
+ ---
62
+
63
+ ## Features
64
+
65
+ - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
66
+ - Query pipeline: rewriting → multi-query retrieval → reranking
67
+ - Generation with guardrails (prompt injection + hallucination checks)
68
+ - Evaluation scores + self-correction retry loop
69
+ - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
70
+
71
+ ---
72
+
73
+ ## Install
74
+
75
+ ```bash
76
+ pip install rag-python
77
+ # or from source
78
+ pip install -e .
79
+ # with reranking + extra providers
80
+ pip install -e ".[all]"  # same as ".[rerank,anthropic,gemini]"
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Quickstart
86
+
87
+ ```python
88
+ from rag_python import RAG
89
+
90
+ rag = RAG(
91
+ llm_provider="openai",
92
+ llm_model="gpt-4o-mini",
93
+ embedding_provider="openai",
94
+ embedding_model="text-embedding-3-small",
95
+ )
96
+
97
+ rag.ingest(["./data"], reindex=True)
98
+ answer = rag.query("How many days of annual leave?")
99
+ print(answer.text)
100
+ ```
101
+
102
+ ### CLI
103
+
104
+ ```bash
105
+ export OPENAI_API_KEY=sk-...
106
+ rag-python ingest ./data --reindex
107
+ rag-python query "How many days of annual leave?" -v
108
+ ```
109
+
110
+ ---
111
+
112
+ ## Environment variables
113
+
114
+ | Variable | Required | Description |
115
+ |----------|----------|-------------|
116
+ | `OPENAI_API_KEY` | For OpenAI | Default LLM + embeddings |
117
+ | `ANTHROPIC_API_KEY` | For Claude | LLM only |
118
+ | `GEMINI_API_KEY` | For Gemini | LLM only |
119
+ | `AZURE_OPENAI_ENDPOINT` | For Azure | Azure OpenAI |
120
+ | `AZURE_OPENAI_API_KEY` | For Azure | Azure OpenAI |
121
+ | `OPENAI_API_VERSION` | Azure | Default `2023-09-01-preview` |
122
+ | `OLLAMA_BASE_URL` | Ollama | Default `http://localhost:11434` |
123
+ | `RAG_PYTHON_DATA_DIR` | Optional | Default `./data` |
124
+ | `RAG_PYTHON_CHROMA_DIR` | Optional | Default `./chroma_db` |
125
+
126
+ See [`.env.example`](.env.example) for all tuning options.
127
+
128
+ ---
129
+
130
+ ## Project structure
131
+
132
+ ```text
133
+ .
134
+ ├── src/rag_python/ # Installable package (PyPI: rag-python)
135
+ │ ├── client.py # High-level RAG API
136
+ │ ├── rag_pipeline.py # Full pipeline
137
+ │ └── providers/ # OpenAI, Azure, Anthropic, Gemini, Ollama
138
+ ├── tests/
139
+ ├── examples/
140
+ ├── docs/
141
+ ├── data/ # Sample documents
142
+ ├── pyproject.toml
143
+ └── main.py # Local dev CLI wrapper
144
+ ```
145
+
146
+ ---
147
+
148
+ ## Docs
149
+
150
+ - [Usage](docs/USAGE.md)
151
+ - [Providers](docs/PROVIDERS.md)
152
+ - [Changelog](CHANGELOG.md)
153
+
154
+ ---
155
+
156
+ ## License
157
+
158
+ MIT © [Raghav Singla](https://github.com/RaghavOG)
@@ -0,0 +1,111 @@
1
+ # rag-python
2
+
3
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
5
+ [![GitHub](https://img.shields.io/badge/GitHub-RaghavOG%2Frag--python-blue)](https://github.com/RaghavOG/rag-python)
6
+
7
+ **rag-python** is a production-oriented Python library for **Retrieval-Augmented Generation (RAG)**.
8
+
9
+ Ingest your documents, ask questions, get grounded answers — with query rewriting, multi-query retrieval, reranking, guardrails, and multi-LLM support.
10
+
11
+ **Author:** [Raghav Singla](https://github.com/RaghavOG)
12
+ **Repository:** [github.com/RaghavOG/rag-python](https://github.com/RaghavOG/rag-python)
13
+
14
+ ---
15
+
16
+ ## Features
17
+
18
+ - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
19
+ - Query pipeline: rewriting → multi-query retrieval → reranking
20
+ - Generation with guardrails (prompt injection + hallucination checks)
21
+ - Evaluation scores + self-correction retry loop
22
+ - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
23
+
24
+ ---
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install rag-python
30
+ # or from source
31
+ pip install -e .
32
+ # with reranking + extra providers
33
+ pip install -e ".[all]"  # same as ".[rerank,anthropic,gemini]"
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Quickstart
39
+
40
+ ```python
41
+ from rag_python import RAG
42
+
43
+ rag = RAG(
44
+ llm_provider="openai",
45
+ llm_model="gpt-4o-mini",
46
+ embedding_provider="openai",
47
+ embedding_model="text-embedding-3-small",
48
+ )
49
+
50
+ rag.ingest(["./data"], reindex=True)
51
+ answer = rag.query("How many days of annual leave?")
52
+ print(answer.text)
53
+ ```
54
+
55
+ ### CLI
56
+
57
+ ```bash
58
+ export OPENAI_API_KEY=sk-...
59
+ rag-python ingest ./data --reindex
60
+ rag-python query "How many days of annual leave?" -v
61
+ ```
62
+
63
+ ---
64
+
65
+ ## Environment variables
66
+
67
+ | Variable | Required | Description |
68
+ |----------|----------|-------------|
69
+ | `OPENAI_API_KEY` | For OpenAI | Default LLM + embeddings |
70
+ | `ANTHROPIC_API_KEY` | For Claude | LLM only |
71
+ | `GEMINI_API_KEY` | For Gemini | LLM only |
72
+ | `AZURE_OPENAI_ENDPOINT` | For Azure | Azure OpenAI |
73
+ | `AZURE_OPENAI_API_KEY` | For Azure | Azure OpenAI |
74
+ | `OPENAI_API_VERSION` | Azure | Default `2023-09-01-preview` |
75
+ | `OLLAMA_BASE_URL` | Ollama | Default `http://localhost:11434` |
76
+ | `RAG_PYTHON_DATA_DIR` | Optional | Default `./data` |
77
+ | `RAG_PYTHON_CHROMA_DIR` | Optional | Default `./chroma_db` |
78
+
79
+ See [`.env.example`](.env.example) for all tuning options.
80
+
81
+ ---
82
+
83
+ ## Project structure
84
+
85
+ ```text
86
+ .
87
+ ├── src/rag_python/ # Installable package (PyPI: rag-python)
88
+ │ ├── client.py # High-level RAG API
89
+ │ ├── rag_pipeline.py # Full pipeline
90
+ │ └── providers/ # OpenAI, Azure, Anthropic, Gemini, Ollama
91
+ ├── tests/
92
+ ├── examples/
93
+ ├── docs/
94
+ ├── data/ # Sample documents
95
+ ├── pyproject.toml
96
+ └── main.py # Local dev CLI wrapper
97
+ ```
98
+
99
+ ---
100
+
101
+ ## Docs
102
+
103
+ - [Usage](docs/USAGE.md)
104
+ - [Providers](docs/PROVIDERS.md)
105
+ - [Changelog](CHANGELOG.md)
106
+
107
+ ---
108
+
109
+ ## License
110
+
111
+ MIT © [Raghav Singla](https://github.com/RaghavOG)
@@ -0,0 +1,67 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0,<77.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rag-python"
7
+ version = "0.1.0"
8
+ description = "Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Raghav Singla", email = "04raghavsingla28@gmail.com" },
14
+ ]
15
+ keywords = ["rag", "llm", "embeddings", "chromadb", "openai", "rag-python", "retrieval-augmented-generation"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Topic :: Software Development :: Libraries :: Python Modules",
26
+ ]
27
+ dependencies = [
28
+ "openai>=1.12.0",
29
+ "tiktoken>=0.5.0",
30
+ "chromadb>=0.4.22",
31
+ "pypdf>=3.17.0",
32
+ "python-docx>=1.1.0",
33
+ "langdetect>=1.0.9",
34
+ "regex>=2023.0.0",
35
+ "python-dotenv>=1.0.0",
36
+ "requests>=2.31.0",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ rerank = ["sentence-transformers>=2.2.0", "torch>=2.0.0"]
41
+ anthropic = ["anthropic>=0.20.0"]
42
+ gemini = ["google-genai>=0.3.0"]
43
+ dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine"]
44
+ all = ["rag-python[rerank,anthropic,gemini]"]
45
+
46
+ [project.scripts]
47
+ rag-python = "rag_python.cli:main"
48
+
49
+ [project.urls]
50
+ Homepage = "https://github.com/RaghavOG/rag-python"
51
+ Repository = "https://github.com/RaghavOG/rag-python"
52
+ Documentation = "https://github.com/RaghavOG/rag-python#readme"
53
+ Issues = "https://github.com/RaghavOG/rag-python/issues"
54
+
55
+ [tool.setuptools.packages.find]
56
+ where = ["src"]
57
+
58
+ [tool.setuptools.package-data]
59
+ rag_python = ["py.typed"]
60
+
61
+ [tool.pytest.ini_options]
62
+ testpaths = ["tests"]
63
+ pythonpath = ["src"]
64
+
65
+ [tool.ruff]
66
+ line-length = 100
67
+ target-version = "py310"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,39 @@
1
+ """rag-python — production-grade RAG for Python.
2
+
3
+ Quick start::
4
+
5
+ from rag_python import RAG
6
+
7
+ rag = RAG(llm_model="gpt-4o-mini")
8
+ rag.ingest(["./docs"], reindex=True)
9
+ print(rag.query("What is our leave policy?").text)
10
+ """
11
+
12
+ __version__ = "0.1.0"
13
+
14
+ from .client import RAG, RAGAnswer
15
+ from .rag_pipeline import ingest, query, RAGResponse
16
+ from .providers import make_llm_provider, make_embedding_provider
17
+ from .options import (
18
+ ChunkingConfig,
19
+ DocumentConfig,
20
+ QueryConfig,
21
+ RAGConfig,
22
+ SearchConfig,
23
+ )
24
+
25
+ __all__ = [
26
+ "__version__",
27
+ "RAG",
28
+ "RAGAnswer",
29
+ "RAGConfig",
30
+ "ChunkingConfig",
31
+ "SearchConfig",
32
+ "DocumentConfig",
33
+ "QueryConfig",
34
+ "ingest",
35
+ "query",
36
+ "RAGResponse",
37
+ "make_llm_provider",
38
+ "make_embedding_provider",
39
+ ]
@@ -0,0 +1,181 @@
1
+ """Chunking: recursive, structure-aware (headings/sections), and semantic (embedding-based)."""
2
+ import re
3
+ from dataclasses import dataclass
4
+ from typing import Callable
5
+
6
+ try:
7
+ import tiktoken
8
+ except ImportError:
9
+ tiktoken = None
10
+
11
+
12
@dataclass
class Chunk:
    """A piece of document text together with the metadata describing it."""

    # The chunk's content.
    text: str
    # Free-form annotations; the chunkers in this module always set
    # "chunk_strategy" and may add section information.
    metadata: dict
17
+
18
+
19
# Separators used by the recursive splitter, ordered coarsest first: runs of
# blank lines (sections, paragraphs), single newlines, sentence ends, spaces.
# The trailing empty string is a sentinel meaning "no separator left": the
# splitter then falls back to fixed-size token windows.
RECURSIVE_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", " ", ""]
21
+
22
+
23
def _split_by_tokens(text: str, chunk_size: int, overlap: int, encoding_name: str = "cl100k_base") -> list[str]:
    """Split *text* into consecutive windows of at most ``chunk_size`` tokens.

    Neighbouring windows share ``overlap`` tokens. When ``tiktoken`` is not
    available the split is done on characters instead, using 4 characters per
    token for both the window size and the overlap.

    Args:
        text: Text to split.
        chunk_size: Maximum window size in tokens; must be positive.
        overlap: Tokens shared between neighbouring windows. Values outside
            ``0 .. chunk_size - 1`` are clamped into that range.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The windows in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap >= chunk_size would keep the window start in place (or move
    # it backwards) and loop forever; clamp so every window makes progress.
    overlap = max(0, min(overlap, chunk_size - 1))

    def spans(length: int, size: int, shared: int) -> list[tuple[int, int]]:
        """Return (start, end) index pairs of overlapping windows over ``length`` items."""
        pairs = []
        start = 0
        while start < length:
            end = min(start + size, length)
            pairs.append((start, end))
            if end == length:
                break
            start = end - shared
        return pairs

    if not tiktoken:
        # Character fallback: roughly 4 characters per token.
        return [text[a:b] for a, b in spans(len(text), chunk_size * 4, overlap * 4)]
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    return [enc.decode(tokens[a:b]) for a, b in spans(len(tokens), chunk_size, overlap)]
43
+
44
+
45
def _recursive_split(text: str, separators: list[str], chunk_size: int, overlap: int) -> list[str]:
    """Split *text* on the coarsest separator that occurs, packing pieces into chunks.

    Pieces are accumulated until a chunk would exceed ``chunk_size * 4``
    characters (about 4 characters per token). A piece that is too large on
    its own is split again with the remaining, finer separators; an empty
    separator (or an exhausted list) delegates to ``_split_by_tokens``.

    Args:
        text: Text to split.
        separators: Separators to try, coarsest first.
        chunk_size: Target chunk size in tokens; must be positive.
        overlap: Tokens of trailing context repeated at the start of the next
            chunk. Clamped to ``0 .. chunk_size - 1``.

    Returns:
        Chunk strings in document order; an empty list for blank text.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if not text.strip():
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Keep the overlap strictly below the chunk size so token-window splitting
    # always advances.
    overlap = max(0, min(overlap, chunk_size - 1))

    sep = separators[0] if separators else ""
    if sep == "":
        return _split_by_tokens(text, chunk_size, overlap)
    parts = text.split(sep)
    if len(parts) == 1:
        # Separator does not occur: try the next finer one.
        return _recursive_split(text, separators[1:], chunk_size, overlap)

    max_chars = chunk_size * 4
    overlap_chars = overlap * 4
    chunks: list[str] = []
    current = ""
    last = len(parts) - 1
    for index, part in enumerate(parts):
        # Re-attach the separator that split() removed so adjacent pieces are
        # not glued together (e.g. the last word of one line and the first of
        # the next).
        piece = part if index == last else part + sep
        if len(current) + len(piece) <= max_chars:
            current += piece
            continue
        if current.strip():
            chunks.append(current.strip())
        if len(piece) > max_chars:
            # Too large even on its own: break it up with finer separators.
            chunks.extend(_recursive_split(piece, separators[1:], chunk_size, overlap))
            current = ""
            continue
        # Start the next chunk with the tail of the previous one as overlap,
        # trimmed so the new chunk still fits the size budget.
        keep = min(overlap_chars, max_chars - len(piece))
        current = (current[-keep:] if keep > 0 else "") + piece
    if current.strip():
        chunks.append(current.strip())
    return chunks
67
+
68
+
69
def chunk_recursive(
    text: str,
    chunk_size: int = 512,
    overlap: int = 64,
    metadata: dict | None = None,
) -> list[Chunk]:
    """Chunk *text* with the recursive separator strategy.

    Args:
        text: Text to chunk.
        chunk_size: Target chunk size in tokens.
        overlap: Overlap between neighbouring chunks in tokens.
        metadata: Optional metadata copied onto every chunk.

    Returns:
        One ``Chunk`` per non-blank piece; each carries a copy of *metadata*
        with ``"chunk_strategy"`` set to ``"recursive"``.
    """
    pieces = _recursive_split(text, RECURSIVE_SEPARATORS, chunk_size, overlap)
    base = {**(metadata or {}), "chunk_strategy": "recursive"}
    result = []
    for piece in pieces:
        if not piece.strip():
            continue
        # Every chunk gets its own dict so later mutation does not leak.
        result.append(Chunk(text=piece, metadata=dict(base)))
    return result
80
+
81
+
82
# Markdown heading written with 1-6 leading '#' characters followed by
# whitespace. Group 1 is the '#' run, group 2 is the heading text.
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
83
+
84
+
85
def _structure_sections(text: str) -> list[tuple[str, str]]:
    """Split *text* into ``(title, content)`` sections at markdown headings.

    Content before the first heading is filed under the title ``"Document"``.
    A heading with no content before the next heading produces no section.
    Lines inside a fenced code block (delimited by lines starting with three
    backticks) are always content, so a ``# comment`` in code is not mistaken
    for a heading.

    Args:
        text: Document text.

    Returns:
        Sections in document order; content lines are joined with newlines.
    """
    sections: list[tuple[str, str]] = []
    title = "Document"
    body: list[str] = []
    in_fence = False
    for line in text.splitlines():
        if line.lstrip().startswith("```"):
            # Fence delimiter: toggle code mode and keep the line as content.
            in_fence = not in_fence
            body.append(line)
            continue
        match = None if in_fence else HEADING_PATTERN.match(line)
        if match is None:
            body.append(line)
            continue
        # A new heading closes the section collected so far.
        if body:
            sections.append((title, "\n".join(body)))
        title = match.group(2).strip()
        body = []
    if body:
        sections.append((title, "\n".join(body)))
    return sections
102
+
103
+
104
def chunk_structure_aware(
    text: str,
    chunk_size: int = 512,
    overlap: int = 64,
    metadata: dict | None = None,
) -> list[Chunk]:
    """Chunk *text* section by section, using markdown headings as boundaries.

    A section whose stripped content fits in ``chunk_size * 4`` characters
    becomes one chunk. A longer section is split recursively (starting at the
    paragraph separator) and each part becomes its own chunk. Every chunk text
    is prefixed with ``## <title>`` and a blank line.

    Args:
        text: Document text.
        chunk_size: Target chunk size in tokens.
        overlap: Overlap in tokens used when a section has to be split.
        metadata: Optional metadata copied onto every chunk.

    Returns:
        Chunks in document order. Metadata has ``"chunk_strategy"`` set to
        ``"structure_aware"`` and ``"section"`` set to the section title;
        parts of a split section also carry ``"section_part"``.
    """
    sections = _structure_sections(text)
    base = dict(metadata or {})
    base["chunk_strategy"] = "structure_aware"
    budget = chunk_size * 4
    result = []
    for title, content in sections:
        body = content.strip()
        if not body:
            continue
        if len(body) > budget:
            # Too long for one chunk: split below section level.
            pieces = _recursive_split(body, RECURSIVE_SEPARATORS[1:], chunk_size, overlap)
            result.extend(
                Chunk(
                    text=f"## {title}\n\n{piece.strip()}",
                    metadata={**base, "section": title, "section_part": part_no},
                )
                for part_no, piece in enumerate(pieces)
                if piece.strip()
            )
        else:
            result.append(Chunk(text=f"## {title}\n\n{body}", metadata={**base, "section": title}))
    return result
130
+
131
+
132
def chunk_semantic(
    text: str,
    embed_fn: Callable[[list[str]], list[list[float]]],
    chunk_size: int = 512,
    overlap: int = 64,
    metadata: dict | None = None,
    similarity_threshold: float = 0.7,
) -> list[Chunk]:
    """Chunk *text* by grouping whole sentences up to a size budget.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    Sentences are accumulated until the group reaches ``chunk_size * 3``
    characters; the last quarter of the group (at least one sentence) is then
    carried into the next group as overlap. Text with at most one sentence
    falls back to ``chunk_recursive``.

    Args:
        text: Text to chunk.
        embed_fn: Accepted for interface compatibility; not used by the
            current implementation.
        chunk_size: Target chunk size in tokens.
        overlap: Only used by the ``chunk_recursive`` fallback.
        metadata: Optional metadata copied onto every chunk.
        similarity_threshold: Accepted for interface compatibility; not used
            by the current implementation.

    Returns:
        Chunks in document order with ``"chunk_strategy"`` set to
        ``"semantic"`` (or ``"recursive"`` when the fallback is taken).
    """
    segments = re.split(r"(?<=[.!?])\s+", text)
    if len(segments) <= 1:
        return chunk_recursive(text, chunk_size, overlap, metadata)

    meta = dict(metadata or {})
    meta["chunk_strategy"] = "semantic"
    limit = chunk_size * 3
    chunks = []
    current: list[str] = []
    current_len = 0
    fresh = 0  # sentences in `current` that no emitted chunk contains yet
    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue
        current.append(seg)
        current_len += len(seg)
        fresh += 1
        if current_len >= limit:
            chunks.append(Chunk(text=" ".join(current), metadata={**meta}))
            carried = current[-max(1, len(current) // 4):]
            # Drop leading carried sentences until the overlap is below the
            # budget; otherwise an oversized sentence would be repeated in
            # full in the next chunk.
            while carried and sum(len(s) for s in carried) >= limit:
                carried = carried[1:]
            current = carried
            current_len = sum(len(s) for s in current)
            fresh = 0
    # Only emit a final chunk if it holds something new; the carried overlap
    # on its own would just repeat the tail of the previous chunk.
    if fresh:
        chunks.append(Chunk(text=" ".join(current), metadata={**meta}))
    return chunks
165
+
166
+
167
def chunk_text(
    text: str,
    strategy: str = "recursive",
    chunk_size: int = 512,
    overlap: int = 64,
    metadata: dict | None = None,
    embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
) -> list[Chunk]:
    """Chunk *text* with the strategy named by *strategy*.

    ``"structure_aware"`` and ``"semantic"`` select the matching chunker;
    ``"semantic"`` is only used when *embed_fn* is truthy. Every other case,
    including an unrecognised strategy name, uses recursive chunking.

    Args:
        text: Text to chunk.
        strategy: ``"recursive"``, ``"structure_aware"`` or ``"semantic"``.
        chunk_size: Target chunk size in tokens.
        overlap: Overlap between chunks in tokens.
        metadata: Optional metadata copied onto every chunk.
        embed_fn: Embedding callable required for the semantic strategy.

    Returns:
        The chunks produced by the selected chunker.
    """
    if strategy == "structure_aware":
        chunks = chunk_structure_aware(text, chunk_size, overlap, metadata)
    elif strategy == "semantic" and embed_fn:
        chunks = chunk_semantic(
            text,
            embed_fn,
            chunk_size,
            overlap,
            metadata,
            similarity_threshold=0.7,
        )
    else:
        # Default, and fallback for unknown names or a missing embed_fn.
        chunks = chunk_recursive(text, chunk_size, overlap, metadata)
    return chunks
181
+
@@ -0,0 +1,102 @@
1
+ """Text cleaning & normalization. Garbage in → hallucination out."""
2
+ import re
3
+
4
+ try:
5
+ from langdetect import detect, LangDetectException
6
+ except ImportError:
7
+ detect = None
8
+ LangDetectException = Exception
9
+
10
+
11
def normalize_whitespace(text: str) -> str:
    """Return *text* with each whitespace run replaced by one space and both ends trimmed."""
    # split() with no argument splits on whitespace runs and drops empty
    # leading/trailing fields, so re-joining collapses and trims in one go.
    words = text.split()
    return " ".join(words)
14
+
15
+
16
def remove_header_footer_candidates(text: str, min_line_len: int = 10) -> str:
    """Trim leading and trailing lines that look like page headers or footers.

    A line is a candidate when, after stripping, it is shorter than
    ``min_line_len`` or consists only of digits, whitespace and ``- . /``
    (page numbers, dates). Candidates are removed from the top and from the
    bottom until a non-candidate line is reached; lines in between are kept.

    Args:
        text: Document text.
        min_line_len: Stripped lines shorter than this count as candidates.

    Returns:
        The remaining lines joined with newlines. *text* is returned
        unchanged when it has fewer than five lines, or when every line is a
        candidate (so a document of short lines is not erased).
    """
    lines = text.splitlines()
    if len(lines) < 5:
        return text

    def is_likely_header_footer(line: str) -> bool:
        stripped = line.strip()
        if len(stripped) < min_line_len:
            return True
        # Page numbers and dates: digits, whitespace, '-', '.', '/' only.
        return re.fullmatch(r"[\d\s\-./]+", stripped) is not None

    start = 0
    while start < len(lines) and is_likely_header_footer(lines[start]):
        start += 1
    end = len(lines)
    while end > start and is_likely_header_footer(lines[end - 1]):
        end -= 1
    if start == end:
        # Everything looked like a header/footer; that is far more likely a
        # document of short lines than one with no body, so keep it.
        return text
    return "\n".join(lines[start:end])
37
+
38
+
39
def deduplicate_sentences(text: str) -> str:
    """Drop repeated lines, keeping the first occurrence of each.

    Blank lines are discarded and every kept line is whitespace-normalized.
    Two lines count as the same when the first 200 characters of their
    lower-cased, normalized form are equal, wherever they occur in the text.

    Args:
        text: Text to de-duplicate, processed line by line.

    Returns:
        The surviving lines joined with newlines.
    """
    first_seen: dict[str, str] = {}
    for raw in text.splitlines():
        if not raw.strip():
            continue
        line = normalize_whitespace(raw)
        # dicts keep insertion order, and setdefault keeps the first line
        # stored under each key.
        first_seen.setdefault(line.lower()[:200], line)
    return "\n".join(first_seen.values())
51
+
52
+
53
def preserve_blocks(text: str) -> str:
    """Normalize whitespace in prose while leaving fenced code blocks untouched.

    The text is split at triple-backtick fences. Everything between an
    opening and a closing fence is copied verbatim; prose outside fences is
    collapsed with ``normalize_whitespace``. A newline is inserted between
    prose and a fence where the collapse removed it, so an opening fence
    still starts its own line and prose does not run into a closing fence.
    Tables are not detected separately and are treated as prose.

    Args:
        text: Markdown-style text.

    Returns:
        The text with prose collapsed and fenced blocks unchanged.
    """
    pieces: list[str] = []
    in_code = False
    for part in re.split(r"(```[\w]*\n?|```)", text):
        if part.startswith("```"):
            if not in_code and pieces and not pieces[-1].endswith("\n"):
                # Opening fence: keep it off the end of the preceding prose.
                pieces.append("\n")
            in_code = not in_code
            pieces.append(part)
            continue
        if in_code:
            pieces.append(part)
            continue
        prose = normalize_whitespace(part)
        if not prose:
            continue
        if pieces and not pieces[-1].endswith("\n"):
            # Prose right after a closing fence that had no newline of its own.
            pieces.append("\n")
        pieces.append(prose)
    return "".join(pieces)
67
+
68
+
69
def detect_language(text: str) -> str | None:
    """Detect the language of *text* from its first 2000 characters.

    Returns:
        Whatever ``detect`` returns for the sample, or ``None`` when language
        detection is unavailable or raises ``LangDetectException``.
    """
    if detect is None:
        return None
    # Slicing already copes with text shorter than the sample size.
    sample = text[:2000]
    try:
        language = detect(sample)
    except LangDetectException:
        return None
    return language
78
+
79
+
80
def clean_document(
    text: str,
    *,
    normalize_ws: bool = True,
    remove_headers_footers: bool = True,
    dedupe: bool = True,
    preserve_code_tables: bool = True,
    min_lang_length: int = 50,
) -> str:
    """Run the cleaning pipeline over *text*.

    Steps, in order: whitespace normalization (fence-aware when
    ``preserve_code_tables`` is set), header/footer trimming, line
    de-duplication, and finally joining of single line breaks.

    Args:
        text: Raw document text.
        normalize_ws: Collapse whitespace. When False no whitespace
            normalization step runs, whatever ``preserve_code_tables`` says.
        remove_headers_footers: Trim header/footer-like lines at both ends.
        dedupe: Remove repeated lines.
        preserve_code_tables: With ``normalize_ws``, use ``preserve_blocks``
            instead of ``normalize_whitespace``.
        min_lang_length: Accepted for interface compatibility; not used by
            the current implementation.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    if normalize_ws:
        # Previously preserve_blocks ran whenever preserve_code_tables was
        # set, collapsing whitespace even when normalize_ws was False.
        text = preserve_blocks(text) if preserve_code_tables else normalize_whitespace(text)
    # NOTE(review): normalize_whitespace replaces newlines with spaces, so
    # after the step above the two line-based steps below only see several
    # lines where fenced code kept them (or when normalize_ws is False).
    # Consider running them before whitespace normalization.
    if remove_headers_footers:
        text = remove_header_footer_candidates(text)
    if dedupe:
        text = deduplicate_sentences(text)
    if normalize_ws and preserve_code_tables:
        # Join single line breaks, keep blank-line breaks, squeeze spaces.
        text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
        text = re.sub(r" +", " ", text)
    return text.strip()
102
+