ragcheck-cli 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragcheck_cli-0.2.0/.github/workflows/ragcheck.yml +25 -0
- ragcheck_cli-0.2.0/.github/workflows/tests.yml +22 -0
- ragcheck_cli-0.2.0/.gitignore +12 -0
- ragcheck_cli-0.2.0/CHANGELOG.md +36 -0
- ragcheck_cli-0.2.0/CONTRIBUTING.md +31 -0
- ragcheck_cli-0.2.0/LICENSE +21 -0
- ragcheck_cli-0.2.0/MANIFEST.in +5 -0
- ragcheck_cli-0.2.0/PKG-INFO +193 -0
- ragcheck_cli-0.2.0/README.md +154 -0
- ragcheck_cli-0.2.0/docs/ARCHITECTURE.md +40 -0
- ragcheck_cli-0.2.0/examples/chunk_demo.py +47 -0
- ragcheck_cli-0.2.0/examples/classifier_demo.py +94 -0
- ragcheck_cli-0.2.0/examples/demo.py +91 -0
- ragcheck_cli-0.2.0/examples/embed_demo.py +50 -0
- ragcheck_cli-0.2.0/examples/full_pipeline_demo.py +36 -0
- ragcheck_cli-0.2.0/examples/qa_demo.py +73 -0
- ragcheck_cli-0.2.0/examples/report_demo.py +65 -0
- ragcheck_cli-0.2.0/legal_data/BNSS 2023.pdf +0 -0
- ragcheck_cli-0.2.0/legal_data/BNS_2023.pdf +0 -0
- ragcheck_cli-0.2.0/legal_data/BSA_2023.pdf +0 -0
- ragcheck_cli-0.2.0/pyproject.toml +83 -0
- ragcheck_cli-0.2.0/ragcheck/__init__.py +3 -0
- ragcheck_cli-0.2.0/ragcheck/__main__.py +6 -0
- ragcheck_cli-0.2.0/ragcheck/analyzers/__init__.py +1 -0
- ragcheck_cli-0.2.0/ragcheck/analyzers/chunkers.py +289 -0
- ragcheck_cli-0.2.0/ragcheck/analyzers/failure_classifier.py +174 -0
- ragcheck_cli-0.2.0/ragcheck/analyzers/recommender.py +176 -0
- ragcheck_cli-0.2.0/ragcheck/cli.py +211 -0
- ragcheck_cli-0.2.0/ragcheck/core/__init__.py +1 -0
- ragcheck_cli-0.2.0/ragcheck/core/config.py +75 -0
- ragcheck_cli-0.2.0/ragcheck/core/config_loader.py +55 -0
- ragcheck_cli-0.2.0/ragcheck/core/document_loader.py +99 -0
- ragcheck_cli-0.2.0/ragcheck/core/embeddings.py +38 -0
- ragcheck_cli-0.2.0/ragcheck/core/progress.py +41 -0
- ragcheck_cli-0.2.0/ragcheck/core/vector_store.py +81 -0
- ragcheck_cli-0.2.0/ragcheck/reports/__init__.py +1 -0
- ragcheck_cli-0.2.0/ragcheck/reports/chunk_visualizer.py +132 -0
- ragcheck_cli-0.2.0/ragcheck/reports/export.py +52 -0
- ragcheck_cli-0.2.0/ragcheck/reports/generator.py +235 -0
- ragcheck_cli-0.2.0/ragcheck/reports/html_report.py +460 -0
- ragcheck_cli-0.2.0/ragcheck/testers/__init__.py +1 -0
- ragcheck_cli-0.2.0/ragcheck/testers/auto_qa.py +221 -0
- ragcheck_cli-0.2.0/ragcheck/testers/retrieval_tester.py +185 -0
- ragcheck_cli-0.2.0/ragcheck.yaml +27 -0
- ragcheck_cli-0.2.0/sample_data/components.txt +1 -0
- ragcheck_cli-0.2.0/sample_data/intro.txt +1 -0
- ragcheck_cli-0.2.0/tests/__init__.py +1 -0
- ragcheck_cli-0.2.0/tests/integration/test_end_to_end.py +74 -0
- ragcheck_cli-0.2.0/tests/integration/test_pip_install.py +26 -0
- ragcheck_cli-0.2.0/tests/unit/__init__.py +1 -0
- ragcheck_cli-0.2.0/tests/unit/test_auto_qa.py +36 -0
- ragcheck_cli-0.2.0/tests/unit/test_chunkers.py +150 -0
- ragcheck_cli-0.2.0/tests/unit/test_cli.py +44 -0
- ragcheck_cli-0.2.0/tests/unit/test_config.py +33 -0
- ragcheck_cli-0.2.0/tests/unit/test_embeddings.py +30 -0
- ragcheck_cli-0.2.0/tests/unit/test_failure_classifier.py +58 -0
- ragcheck_cli-0.2.0/tests/unit/test_html_report.py +73 -0
- ragcheck_cli-0.2.0/tests/unit/test_recommender.py +70 -0
- ragcheck_cli-0.2.0/tests/unit/test_retrieval_tester.py +97 -0
- ragcheck_cli-0.2.0/tests/unit/test_vector_store.py +61 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: RAG Quality Check
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
ragcheck:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.11"
|
|
17
|
+
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
18
|
+
- run: uv sync
|
|
19
|
+
- run: uv run ragcheck run --docs ./data --ci --min-score 0.80
|
|
20
|
+
env:
|
|
21
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: ragcheck-report
|
|
25
|
+
path: ragcheck_report.html
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
21
|
+
- run: uv sync
|
|
22
|
+
- run: uv run pytest tests/ -v --cov=ragcheck --cov-report=term-missing
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.2.0] - 2026-06-04
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **Offline HTML reports** — Replaced Plotly CDN with pure CSS/HTML charts. Reports work without internet.
|
|
9
|
+
- **Real faithfulness scoring** — NLI model support (`--nli-model`) for verifying generated answers against retrieved chunks. Falls back to heuristic overlap check.
|
|
10
|
+
- **Answer generation** — `--generate-answers` flag wires LiteLLM to populate `RetrievalResult.generated_answer` for faithfulness evaluation.
|
|
11
|
+
- **Scaled auto-QA** — Increased from 3 to 50 synthetic test questions with perplexity-based filtering to remove trivial questions.
|
|
12
|
+
- **Chunk visualizer integration** — Merged standalone `chunk_visualizer.py` into main report as dedicated "Chunk Analysis" section with histogram and expandable previews.
|
|
13
|
+
- **RAGAS re-added** — Optional extra `pip install ragcheck-cli[ragas]` with proper version pin (`>=0.4.0,<0.5.0`).
|
|
14
|
+
- **Windows compatibility** — UTF-8 encoding fixes in `config_loader.py`, removed Unicode checkmark causing `cp1252` encoding errors.
|
|
15
|
+
- **Local model support** — Zero-cost operation via Ollama (`--answer-model ollama/phi3:mini`).
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
- Histogram bin calculation for tiny datasets (no more backwards ranges like `276–275`)
|
|
19
|
+
- Faithfulness showing `0%` instead of `N/A` when `--generate-answers` is not used
|
|
20
|
+
- `FutureWarning` from `sentence-transformers` embedding dimension method
|
|
21
|
+
|
|
22
|
+
## [0.1.0] - 2026-06-02
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
- Typer CLI with `init`, `run`, `report` commands
|
|
26
|
+
- 6 chunking strategies: fixed, semantic, recursive, markdown, agentic, late
|
|
27
|
+
- Chunk visualization with Plotly histograms
|
|
28
|
+
- SentenceTransformer embeddings (all-MiniLM-L6-v2)
|
|
29
|
+
- ChromaDB vector store
|
|
30
|
+
- Auto-QA generation via LiteLLM
|
|
31
|
+
- Dense retriever with latency/cost tracking
|
|
32
|
+
- Failure classification: 4 modes (retrieval miss, hallucination, overload, boundary error)
|
|
33
|
+
- Recommendation engine with decision tree
|
|
34
|
+
- Beautiful HTML reports (single file, no server)
|
|
35
|
+
- CI/CD mode with GitHub Actions
|
|
36
|
+
- PDF/PNG export via Playwright
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Contributing to ragcheck
|
|
2
|
+
|
|
3
|
+
## Development Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/pranay7863/ragcheck.git
|
|
7
|
+
cd ragcheck
|
|
8
|
+
uv sync
|
|
9
|
+
uv run pytest
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Code Style
|
|
13
|
+
|
|
14
|
+
- `ruff` for linting and formatting
|
|
15
|
+
- `mypy` for type checking
|
|
16
|
+
- All code must pass `ruff check .` and `mypy ragcheck/`
|
|
17
|
+
|
|
18
|
+
## Testing
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv run pytest
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Pull Request Process
|
|
25
|
+
|
|
26
|
+
1. Fork the repository
|
|
27
|
+
2. Create a feature branch
|
|
28
|
+
3. Make your changes
|
|
29
|
+
4. Run tests and linting
|
|
30
|
+
5. Commit with clear messages
|
|
31
|
+
6. Open a Pull Request
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pranay Mane
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragcheck-cli
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Lighthouse for RAG systems — diagnose and fix your retrieval pipeline
|
|
5
|
+
Project-URL: Homepage, https://github.com/pranay7863/ragcheck
|
|
6
|
+
Project-URL: Documentation, https://github.com/pranay7863/ragcheck/blob/main/README.md
|
|
7
|
+
Project-URL: Repository, https://github.com/pranay7863/ragcheck
|
|
8
|
+
Project-URL: Issues, https://github.com/pranay7863/ragcheck/issues
|
|
9
|
+
Author-email: Pranay Mane <pranaymane78@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai,chunking,diagnostics,evaluation,llm,rag,retrieval
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: chromadb>=0.4.0
|
|
23
|
+
Requires-Dist: jinja2>=3.1.0
|
|
24
|
+
Requires-Dist: litellm>=1.0.0
|
|
25
|
+
Requires-Dist: nltk>=3.9.0
|
|
26
|
+
Requires-Dist: pydantic>=2.5.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
30
|
+
Requires-Dist: transformers>=4.30.0
|
|
31
|
+
Requires-Dist: typer>=0.12.0
|
|
32
|
+
Provides-Extra: export
|
|
33
|
+
Requires-Dist: playwright>=1.40.0; extra == 'export'
|
|
34
|
+
Provides-Extra: pdf
|
|
35
|
+
Requires-Dist: pypdf2>=3.0.0; extra == 'pdf'
|
|
36
|
+
Provides-Extra: ragas
|
|
37
|
+
Requires-Dist: ragas<0.5.0,>=0.4.0; extra == 'ragas'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# ragcheck - Lighthouse for RAG Systems
|
|
41
|
+
|
|
42
|
+
[PyPI version](https://badge.fury.io/py/ragcheck-cli)
|
|
43
|
+
[Python](https://www.python.org/)
|
|
44
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
45
|
+
|
|
46
|
+
> One command to diagnose your RAG pipeline and get actionable fixes.
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install ragcheck-cli
|
|
50
|
+
ragcheck init
|
|
51
|
+
ragcheck run --docs ./data --query "What is Article 370?"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## What is ragcheck?
|
|
55
|
+
|
|
56
|
+
**ragcheck** is a lightweight, one-command diagnostic CLI that generates a beautiful, shareable HTML report analyzing why your RAG system fails and how to fix it.
|
|
57
|
+
|
|
58
|
+
Think of it as **Lighthouse for RAG systems** — just like Lighthouse audits web pages, ragcheck audits your retrieval pipeline.
|
|
59
|
+
|
|
60
|
+
## Features
|
|
61
|
+
|
|
62
|
+
- **Auto-Generated Test Suite** - 50 synthetic questions from your documents
|
|
63
|
+
- **Chunk Visualizer** - See exactly where your chunking breaks
|
|
64
|
+
- **Retrieval Heatmap** - Identify dead chunks and dominant chunks
|
|
65
|
+
- **Failure Classification** - Know WHY your RAG fails, not just THAT it fails
|
|
66
|
+
- **Actionable Recommendations** - Specific fixes with predicted impact
|
|
67
|
+
- **CI/CD Integration** - Fail builds when RAG quality regresses
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### Installation
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install ragcheck-cli
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Or with [uv](https://github.com/astral-sh/uv):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
uv tool install ragcheck-cli
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Initialize
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
ragcheck init
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Creates a `ragcheck.yaml` config file in your project.
|
|
90
|
+
|
|
91
|
+
### Run Analysis
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
ragcheck run --docs ./data --query "Your test query"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Generates `ragcheck_report.html` with:
|
|
98
|
+
- Scorecards (retrieval accuracy, faithfulness)
|
|
99
|
+
- Chunk boundary visualization
|
|
100
|
+
- Retrieval heatmap
|
|
101
|
+
- Failure mode classification
|
|
102
|
+
- Before/after score predictions
|
|
103
|
+
|
|
104
|
+
### CI Mode
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
ragcheck run --docs ./data --ci --min-score 0.80
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Returns exit code 0/1. Use in GitHub Actions to fail builds on quality regression.
|
|
111
|
+
|
|
112
|
+
## Example Report
|
|
113
|
+
|
|
114
|
+

|
|
115
|
+
|
|
116
|
+
## Architecture
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
ragcheck CLI
|
|
120
|
+
├── Chunk Analyzer (6 strategies + benchmark)
|
|
121
|
+
├── Retriever Tester (auto-QA + dense retrieval)
|
|
122
|
+
├── Failure Classifier (4 failure modes)
|
|
123
|
+
├── Recommendation Engine (decision tree)
|
|
124
|
+
└── Report Engine (Jinja2 + CSS/HTML)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Tech Stack
|
|
128
|
+
|
|
129
|
+
| Component | Tool |
|
|
130
|
+
|-----------|------|
|
|
131
|
+
| CLI | Typer + Rich |
|
|
132
|
+
| Config | Pydantic |
|
|
133
|
+
| Embeddings | sentence-transformers |
|
|
134
|
+
| Vector DB | ChromaDB |
|
|
135
|
+
| LLM Interface | LiteLLM |
|
|
136
|
+
| Reports | Jinja2 + CSS/HTML |
|
|
137
|
+
|
|
138
|
+
## Configuration
|
|
139
|
+
|
|
140
|
+
`ragcheck.yaml`:
|
|
141
|
+
|
|
142
|
+
```yaml
|
|
143
|
+
project_name: ragcheck
|
|
144
|
+
docs_path: ./data
|
|
145
|
+
chunking:
|
|
146
|
+
strategy: recursive
|
|
147
|
+
chunk_size: 512
|
|
148
|
+
chunk_overlap: 128
|
|
149
|
+
llm:
|
|
150
|
+
provider: openai
|
|
151
|
+
model: gpt-3.5-turbo
|
|
152
|
+
retrieval:
|
|
153
|
+
top_k: 5
|
|
154
|
+
similarity_threshold: 0.7
|
|
155
|
+
report:
|
|
156
|
+
format: html
|
|
157
|
+
include_heatmap: true
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Development
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
git clone https://github.com/pranay7863/ragcheck.git
|
|
164
|
+
cd ragcheck
|
|
165
|
+
uv sync
|
|
166
|
+
uv run pytest
|
|
167
|
+
uv run ruff check .
|
|
168
|
+
uv run mypy ragcheck/
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Contributing
|
|
172
|
+
|
|
173
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md)
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT — see [LICENSE](LICENSE)
|
|
178
|
+
|
|
179
|
+
## Roadmap
|
|
180
|
+
|
|
181
|
+
- [x] v0.2.0 — Offline reports, NLI faithfulness, scaled auto-QA, chunk viz
|
|
182
|
+
- [ ] v0.3.0 — More vector DBs (Pinecone, Weaviate)
|
|
183
|
+
- [ ] v0.3.0 — SaaS API for teams
|
|
184
|
+
- [ ] v0.4.0 — Enterprise features (SSO, audit logs)
|
|
185
|
+
|
|
186
|
+
## Support
|
|
187
|
+
|
|
188
|
+
- [GitHub Issues](https://github.com/pranay7863/ragcheck/issues)
|
|
189
|
+
- Twitter: [@ypranay53](https://twitter.com/pranay53)
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
**Built with discipline.** Read the [blueprint](docs/ARCHITECTURE.md) that started it all.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# ragcheck - Lighthouse for RAG Systems
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://badge.fury.io/py/ragcheck-cli)
|
|
4
|
+
[Python](https://www.python.org/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
> One command to diagnose your RAG pipeline and get actionable fixes.
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install ragcheck-cli
|
|
11
|
+
ragcheck init
|
|
12
|
+
ragcheck run --docs ./data --query "What is Article 370?"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## What is ragcheck?
|
|
16
|
+
|
|
17
|
+
**ragcheck** is a lightweight, one-command diagnostic CLI that generates a beautiful, shareable HTML report analyzing why your RAG system fails and how to fix it.
|
|
18
|
+
|
|
19
|
+
Think of it as **Lighthouse for RAG systems** — just like Lighthouse audits web pages, ragcheck audits your retrieval pipeline.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Auto-Generated Test Suite** - 50 synthetic questions from your documents
|
|
24
|
+
- **Chunk Visualizer** - See exactly where your chunking breaks
|
|
25
|
+
- **Retrieval Heatmap** - Identify dead chunks and dominant chunks
|
|
26
|
+
- **Failure Classification** - Know WHY your RAG fails, not just THAT it fails
|
|
27
|
+
- **Actionable Recommendations** - Specific fixes with predicted impact
|
|
28
|
+
- **CI/CD Integration** - Fail builds when RAG quality regresses
|
|
29
|
+
|
|
30
|
+
## Quick Start
|
|
31
|
+
|
|
32
|
+
### Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install ragcheck-cli
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Or with [uv](https://github.com/astral-sh/uv):
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv tool install ragcheck-cli
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Initialize
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
ragcheck init
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Creates a `ragcheck.yaml` config file in your project.
|
|
51
|
+
|
|
52
|
+
### Run Analysis
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
ragcheck run --docs ./data --query "Your test query"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Generates `ragcheck_report.html` with:
|
|
59
|
+
- Scorecards (retrieval accuracy, faithfulness)
|
|
60
|
+
- Chunk boundary visualization
|
|
61
|
+
- Retrieval heatmap
|
|
62
|
+
- Failure mode classification
|
|
63
|
+
- Before/after score predictions
|
|
64
|
+
|
|
65
|
+
### CI Mode
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
ragcheck run --docs ./data --ci --min-score 0.80
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Returns exit code 0/1. Use in GitHub Actions to fail builds on quality regression.
|
|
72
|
+
|
|
73
|
+
## Example Report
|
|
74
|
+
|
|
75
|
+

|
|
76
|
+
|
|
77
|
+
## Architecture
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
ragcheck CLI
|
|
81
|
+
├── Chunk Analyzer (6 strategies + benchmark)
|
|
82
|
+
├── Retriever Tester (auto-QA + dense retrieval)
|
|
83
|
+
├── Failure Classifier (4 failure modes)
|
|
84
|
+
├── Recommendation Engine (decision tree)
|
|
85
|
+
└── Report Engine (Jinja2 + CSS/HTML)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Tech Stack
|
|
89
|
+
|
|
90
|
+
| Component | Tool |
|
|
91
|
+
|-----------|------|
|
|
92
|
+
| CLI | Typer + Rich |
|
|
93
|
+
| Config | Pydantic |
|
|
94
|
+
| Embeddings | sentence-transformers |
|
|
95
|
+
| Vector DB | ChromaDB |
|
|
96
|
+
| LLM Interface | LiteLLM |
|
|
97
|
+
| Reports | Jinja2 + CSS/HTML |
|
|
98
|
+
|
|
99
|
+
## Configuration
|
|
100
|
+
|
|
101
|
+
`ragcheck.yaml`:
|
|
102
|
+
|
|
103
|
+
```yaml
|
|
104
|
+
project_name: ragcheck
|
|
105
|
+
docs_path: ./data
|
|
106
|
+
chunking:
|
|
107
|
+
strategy: recursive
|
|
108
|
+
chunk_size: 512
|
|
109
|
+
chunk_overlap: 128
|
|
110
|
+
llm:
|
|
111
|
+
provider: openai
|
|
112
|
+
model: gpt-3.5-turbo
|
|
113
|
+
retrieval:
|
|
114
|
+
top_k: 5
|
|
115
|
+
similarity_threshold: 0.7
|
|
116
|
+
report:
|
|
117
|
+
format: html
|
|
118
|
+
include_heatmap: true
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Development
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
git clone https://github.com/pranay7863/ragcheck.git
|
|
125
|
+
cd ragcheck
|
|
126
|
+
uv sync
|
|
127
|
+
uv run pytest
|
|
128
|
+
uv run ruff check .
|
|
129
|
+
uv run mypy ragcheck/
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Contributing
|
|
133
|
+
|
|
134
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md)
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
MIT — see [LICENSE](LICENSE)
|
|
139
|
+
|
|
140
|
+
## Roadmap
|
|
141
|
+
|
|
142
|
+
- [x] v0.2.0 — Offline reports, NLI faithfulness, scaled auto-QA, chunk viz
|
|
143
|
+
- [ ] v0.3.0 — More vector DBs (Pinecone, Weaviate)
|
|
144
|
+
- [ ] v0.3.0 — SaaS API for teams
|
|
145
|
+
- [ ] v0.4.0 — Enterprise features (SSO, audit logs)
|
|
146
|
+
|
|
147
|
+
## Support
|
|
148
|
+
|
|
149
|
+
- [GitHub Issues](https://github.com/pranay7863/ragcheck/issues)
|
|
150
|
+
- Twitter: [@ypranay53](https://twitter.com/pranay53)
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
**Built with discipline.** Read the [blueprint](docs/ARCHITECTURE.md) that started it all.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# ragcheck Architecture
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
ragcheck CLI (Typer + Rich)
|
|
7
|
+
|
|
|
8
|
+
+-- Document Loader (Text, Markdown)
|
|
9
|
+
|
|
|
10
|
+
+-- Chunk Analyzer (6 strategies)
|
|
11
|
+
| +-- Fixed-size, Semantic, Recursive
|
|
12
|
+
| +-- Markdown-aware, Agentic, Late
|
|
13
|
+
|
|
|
14
|
+
+-- Embedding Manager (sentence-transformers)
|
|
15
|
+
|
|
|
16
|
+
+-- Vector Store (ChromaDB)
|
|
17
|
+
|
|
|
18
|
+
+-- Retriever Tester (DenseRetriever)
|
|
19
|
+
| +-- Auto-QA Generation (LiteLLM)
|
|
20
|
+
| +-- Latency/Cost Tracking
|
|
21
|
+
|
|
|
22
|
+
+-- Failure Classifier (4 modes)
|
|
23
|
+
| +-- Retrieval Miss
|
|
24
|
+
| +-- Context Overload
|
|
25
|
+
| +-- Hallucination
|
|
26
|
+
| +-- Chunk Boundary Error
|
|
27
|
+
|
|
|
28
|
+
+-- Recommendation Engine (Decision Tree)
|
|
29
|
+
|
|
|
30
|
+
+-- Report Engine (Jinja2 + CSS/HTML)
|
|
31
|
+
+-- HTML Report (single file)
|
|
32
|
+
+-- PDF/PNG Export (Playwright)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Design Principles
|
|
36
|
+
|
|
37
|
+
1. **Zero-infrastructure**: `pip install ragcheck-cli` works out of the box
|
|
38
|
+
2. **Single-file output**: HTML report is one file, no server needed
|
|
39
|
+
3. **Framework agnostic**: No LangChain or LlamaIndex dependency in core
|
|
40
|
+
4. **Offline-first**: Core metrics use local models; LLM calls are optional
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Demo script for all 6 chunking strategies."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ragcheck.analyzers.chunkers import ChunkerFactory, benchmark_chunking
|
|
6
|
+
from ragcheck.reports.chunk_visualizer import generate_chunk_viz
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main():
|
|
10
|
+
sample_text = """# Introduction to RAG
|
|
11
|
+
|
|
12
|
+
RAG (Retrieval-Augmented Generation) is a technique that combines
|
|
13
|
+
retrieval systems with generative AI. It works by retrieving relevant documents
|
|
14
|
+
from a knowledge base and then using a large language model to generate answers.
|
|
15
|
+
|
|
16
|
+
## Key Components
|
|
17
|
+
|
|
18
|
+
The key components are: a document store, an embedding model,
|
|
19
|
+
a vector database, and a language model. Chunking strategy is critical because
|
|
20
|
+
poor chunking can split important context across boundaries.
|
|
21
|
+
|
|
22
|
+
## Chunking Strategies
|
|
23
|
+
|
|
24
|
+
Common strategies include fixed-size, semantic, recursive, markdown-aware,
|
|
25
|
+
agentic (LLM-based), and late chunking (contextual embeddings)."""
|
|
26
|
+
|
|
27
|
+
# Benchmark all 6 strategies
|
|
28
|
+
strategies = ["fixed", "semantic", "recursive", "markdown", "agentic", "late"]
|
|
29
|
+
results = benchmark_chunking(sample_text, "sample.md", strategies)
|
|
30
|
+
|
|
31
|
+
print("Chunking Benchmark Results — All 6 Strategies")
|
|
32
|
+
print("=" * 60)
|
|
33
|
+
for strategy, metrics in results.items():
|
|
34
|
+
print(f"\n{strategy.upper():12} | Chunks: {metrics['num_chunks']:3} | "
|
|
35
|
+
f"Avg: {metrics['avg_length']:6.1f} | Loss: {metrics['context_loss_score']:.2%}")
|
|
36
|
+
|
|
37
|
+
# Generate HTML visualization for markdown chunker (most interesting for this doc)
|
|
38
|
+
md_chunks = results["markdown"]["chunks"]
|
|
39
|
+
html = generate_chunk_viz(md_chunks, "sample.md", "markdown", sample_text)
|
|
40
|
+
|
|
41
|
+
output_path = Path("chunk_visualization.html")
|
|
42
|
+
output_path.write_text(html, encoding="utf-8")
|
|
43
|
+
print(f"\nVisualization saved to: {output_path.absolute()}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
|
|
47
|
+
main()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Demo: Failure classification + recommendations."""
|
|
2
|
+
|
|
3
|
+
from ragcheck.analyzers.chunkers import Chunk
|
|
4
|
+
from ragcheck.analyzers.failure_classifier import FailureClassifier
|
|
5
|
+
from ragcheck.analyzers.recommender import RecommendationEngine, predict_scores
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main():
|
|
9
|
+
classifier = FailureClassifier()
|
|
10
|
+
engine = RecommendationEngine()
|
|
11
|
+
|
|
12
|
+
# Simulate 4 different failure scenarios
|
|
13
|
+
scenarios = [
|
|
14
|
+
{
|
|
15
|
+
"name": "Retrieval Miss",
|
|
16
|
+
"question": "What is quantum computing?",
|
|
17
|
+
"expected": "Quantum computing uses qubits.",
|
|
18
|
+
"generated": "",
|
|
19
|
+
"retrieved": [],
|
|
20
|
+
"source": ["Quantum computing uses qubits for computation."],
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"name": "Hallucination",
|
|
24
|
+
"question": "What is RAG?",
|
|
25
|
+
"expected": "RAG is Retrieval-Augmented Generation.",
|
|
26
|
+
"generated": "RAG is a type of database invented in 2015 by Google.",
|
|
27
|
+
"retrieved": [Chunk("RAG is Retrieval-Augmented Generation.", 0, 40, "doc.txt", "fixed")],
|
|
28
|
+
"source": ["RAG is Retrieval-Augmented Generation."],
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"name": "Context Overload",
|
|
32
|
+
"question": "How does RAG work?",
|
|
33
|
+
"expected": "RAG retrieves documents then generates answers.",
|
|
34
|
+
"generated": "RAG retrieves documents then generates answers.",
|
|
35
|
+
"retrieved": [Chunk(f"chunk{i}", i*10, i*10+10, "doc.txt", "fixed") for i in range(6)],
|
|
36
|
+
"source": ["RAG retrieves documents then generates answers."],
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"name": "Chunk Boundary Error",
|
|
40
|
+
"question": "Explain the full RAG pipeline.",
|
|
41
|
+
"expected": "RAG has retrieval and generation components working together.",
|
|
42
|
+
"generated": "RAG has retrieval and generation components.",
|
|
43
|
+
"retrieved": [
|
|
44
|
+
Chunk("RAG has retrieval components", 0, 28, "doc.txt", "fixed"),
|
|
45
|
+
Chunk("and generation components working", 29, 60, "doc.txt", "fixed"),
|
|
46
|
+
],
|
|
47
|
+
"source": ["RAG has retrieval and generation components working together."],
|
|
48
|
+
},
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
print("Failure Classification Demo")
|
|
52
|
+
print("=" * 60)
|
|
53
|
+
|
|
54
|
+
all_failures = []
|
|
55
|
+
for s in scenarios:
|
|
56
|
+
analysis = classifier.classify(
|
|
57
|
+
question=s["question"],
|
|
58
|
+
expected_answer=s["expected"],
|
|
59
|
+
generated_answer=s["generated"],
|
|
60
|
+
retrieved_chunks=s["retrieved"],
|
|
61
|
+
source_chunks=s["source"],
|
|
62
|
+
)
|
|
63
|
+
all_failures.append(analysis)
|
|
64
|
+
|
|
65
|
+
print(f"\n{s['name']}:")
|
|
66
|
+
print(f" Mode: {analysis.failure_mode.value}")
|
|
67
|
+
print(f" Confidence: {analysis.confidence}")
|
|
68
|
+
print(f" Explanation: {analysis.explanation}")
|
|
69
|
+
print(f" Fix: {analysis.recommendation}")
|
|
70
|
+
|
|
71
|
+
# Generate recommendations from all failures
|
|
72
|
+
print("\n" + "=" * 60)
|
|
73
|
+
print("Prioritized Recommendations")
|
|
74
|
+
print("=" * 60)
|
|
75
|
+
|
|
76
|
+
recommendations = engine.generate_recommendations(all_failures)
|
|
77
|
+
for i, rec in enumerate(recommendations[:5], 1):
|
|
78
|
+
print(f"\n{i}. {rec.title} [{rec.implementation_difficulty}]")
|
|
79
|
+
print(f" {rec.description}")
|
|
80
|
+
print(f" Expected improvement: +{rec.expected_improvement:.1%}")
|
|
81
|
+
print(f" Tradeoffs: {rec.tradeoffs}")
|
|
82
|
+
if rec.code_example:
|
|
83
|
+
print(f" Code: {rec.code_example}")
|
|
84
|
+
|
|
85
|
+
# Score prediction
|
|
86
|
+
print("\n" + "=" * 60)
|
|
87
|
+
current = 0.55
|
|
88
|
+
prediction = predict_scores(current, recommendations)
|
|
89
|
+
print(f"Score Prediction: {prediction['current_score']:.0%} -> {prediction['predicted_score']:.0%}")
|
|
90
|
+
print(f" (+{prediction['improvement']:.1%} from top {prediction['recommendations_applied']} recommendations)")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__":
|
|
94
|
+
main()
|