hitgate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hitgate-0.1.0/LICENSE +21 -0
- hitgate-0.1.0/PKG-INFO +256 -0
- hitgate-0.1.0/README.md +216 -0
- hitgate-0.1.0/hitgate/__init__.py +5 -0
- hitgate-0.1.0/hitgate/audit_contamination.py +146 -0
- hitgate-0.1.0/hitgate/compare.py +144 -0
- hitgate-0.1.0/hitgate/diff.py +109 -0
- hitgate-0.1.0/hitgate/example_external_retriever.py +48 -0
- hitgate-0.1.0/hitgate/generate.py +438 -0
- hitgate-0.1.0/hitgate/plot_history.py +147 -0
- hitgate-0.1.0/hitgate/run.py +166 -0
- hitgate-0.1.0/hitgate/test_determinism.py +68 -0
- hitgate-0.1.0/hitgate.egg-info/PKG-INFO +256 -0
- hitgate-0.1.0/hitgate.egg-info/SOURCES.txt +36 -0
- hitgate-0.1.0/hitgate.egg-info/dependency_links.txt +1 -0
- hitgate-0.1.0/hitgate.egg-info/entry_points.txt +7 -0
- hitgate-0.1.0/hitgate.egg-info/requires.txt +5 -0
- hitgate-0.1.0/hitgate.egg-info/top_level.txt +2 -0
- hitgate-0.1.0/pyproject.toml +37 -0
- hitgate-0.1.0/ragcore/__init__.py +1 -0
- hitgate-0.1.0/ragcore/build.py +353 -0
- hitgate-0.1.0/ragcore/chunkers.py +180 -0
- hitgate-0.1.0/ragcore/config.py +92 -0
- hitgate-0.1.0/ragcore/mcp_server.py +142 -0
- hitgate-0.1.0/ragcore/pack.py +121 -0
- hitgate-0.1.0/ragcore/query.py +92 -0
- hitgate-0.1.0/ragcore/retrieval.py +359 -0
- hitgate-0.1.0/setup.cfg +4 -0
- hitgate-0.1.0/tests/test_chunkers.py +326 -0
- hitgate-0.1.0/tests/test_compare.py +118 -0
- hitgate-0.1.0/tests/test_generate.py +425 -0
- hitgate-0.1.0/tests/test_harness.py +57 -0
- hitgate-0.1.0/tests/test_langchain_adapter.py +50 -0
- hitgate-0.1.0/tests/test_langfuse_adapter.py +192 -0
- hitgate-0.1.0/tests/test_llamaindex_adapter.py +80 -0
- hitgate-0.1.0/tests/test_mcp_server.py +179 -0
- hitgate-0.1.0/tests/test_rank_modes.py +59 -0
- hitgate-0.1.0/tests/test_tokenizer.py +33 -0
hitgate-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lucas Santana
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
hitgate-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hitgate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A label-free retrieval-quality regression gate: measure any retriever's ranking (Hit@K/MRR) on your own corpus, no labels, gated in CI like a test.
|
|
5
|
+
Author: Lucas Santana
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Lucas Santana
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/LucasSantana-Dev/evidence-first-rag
|
|
29
|
+
Project-URL: Repository, https://github.com/LucasSantana-Dev/evidence-first-rag
|
|
30
|
+
Project-URL: Methodology, https://github.com/LucasSantana-Dev/evidence-first-rag/blob/main/docs/METHODOLOGY.md
|
|
31
|
+
Keywords: rag,retrieval,evaluation,hit@k,mrr,regression-gate,ci,bm25,reranking,search
|
|
32
|
+
Requires-Python: >=3.10
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
License-File: LICENSE
|
|
35
|
+
Provides-Extra: hybrid
|
|
36
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "hybrid"
|
|
37
|
+
Requires-Dist: rank-bm25>=0.2; extra == "hybrid"
|
|
38
|
+
Requires-Dist: numpy>=1.24; extra == "hybrid"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# hitgate
|
|
42
|
+
|
|
43
|
+
[![eval](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml/badge.svg)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml)
|
|
44
|
+
|
|
45
|
+
**`pip install hitgate`** installs the harness (dependency-free; measures *any* retriever via `--retriever`).
|
|
46
|
+
**`pip install "hitgate[hybrid]"`** adds the bundled hybrid retriever used in the demo below.
|
|
47
|
+
|
|
48
|
+
> **A pytest-style regression gate for retrieval quality** — plus the small hybrid
|
|
49
|
+
> retriever it was built to measure. Point it at *your* retriever and find out whether a
|
|
50
|
+
> change helped or hurt, when you have **no labeled data and no users to A/B against**.
|
|
51
|
+
|
|
52
|
+
**Status:** working · stable · single-author personal tooling, published for the
|
|
53
|
+
*methodology*. The adoptable part is the **harness**: a label-free, regression-gated quality
|
|
54
|
+
check for any retriever (`--retriever module:callable`). The bundled hybrid engine is just
|
|
55
|
+
the thing it measures.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Why this exists
|
|
60
|
+
|
|
61
|
+
Building RAG is easy; *knowing whether a change made it better or worse* is the hard
|
|
62
|
+
part. With a small corpus and a single user you have none of the production crutches —
|
|
63
|
+
no click logs, no A/B traffic, no annotation budget. This repo is one answer: treat
|
|
64
|
+
retrieval quality as a **measurable, regression-gated property**, like a test suite,
|
|
65
|
+
and be ruthlessly honest about what the numbers do and don't prove.
|
|
66
|
+
|
|
67
|
+
## Quickstart (reproducible in ~10 seconds)
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install -e ".[hybrid]" # harness core is dependency-free; [hybrid] adds the bundled retriever
|
|
71
|
+
|
|
72
|
+
# Index this repo into a local ./.rag-index/ (the tool indexes itself)
|
|
73
|
+
RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build
|
|
74
|
+
|
|
75
|
+
# Ask it something
|
|
76
|
+
RAG_SOURCE_ROOTS="$PWD" python -m ragcore.query --scope code "how does the reranker fall back"
|
|
77
|
+
|
|
78
|
+
# Run the eval gate (bundled retriever)
|
|
79
|
+
RAG_RERANK_AUTO=off python -m hitgate.run --label demo
|
|
80
|
+
|
|
81
|
+
# ...or point the SAME gate at YOUR retriever — any callable (query, top, scope) -> [{"path": ...}]
|
|
82
|
+
python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve --label mine
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
That eval indexes the repo's own source and scores 101 golden cases against it — so
|
|
86
|
+
**you can reproduce the number below yourself**, no private data required.
|
|
87
|
+
|
|
88
|
+
## Results
|
|
89
|
+
|
|
90
|
+
### Self-indexed demo (reproducible)
|
|
91
|
+
|
|
92
|
+
| Metric | Value |
|
|
93
|
+
|---|---|
|
|
94
|
+
| **Hit@5** (code scope, pure hybrid) — *the regression-gated headline* | **1.0** |
|
|
95
|
+
| Hit@1 | 0.663 |
|
|
96
|
+
| MRR | 0.800 |
|
|
97
|
+
| Corpus | this repo, self-indexed · 101 cases |
|
|
98
|
+
|
|
99
|
+
67 of 101 cases hit at rank 1; the misses are left in on purpose. Inflating a benchmark by
|
|
100
|
+
quietly dropping the cases it fails is the first thing this project refuses to do — see
|
|
101
|
+
[DECISIONS.md](./DECISIONS.md); measured before/after deltas are in
|
|
102
|
+
[CHANGELOG.md](./CHANGELOG.md). An honest ablation — where **BM25-only wins Hit@1**
|
|
103
|
+
(0.522) while **hybrid wins Hit@5** (1.0) — is walked through in
|
|
104
|
+
[docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
|
|
105
|
+
|
|
106
|
+
Because the demo indexes **this repo itself**, the corpus grows as the repo does, so
|
|
107
|
+
Hit@1 and MRR drift over time — adding a file can demote a borderline case. That's why
|
|
108
|
+
**Hit@5 is the number under regression gate** (`hitgate/check.sh`, ±5pp). The drift is the
|
|
109
|
+
honest behavior of a self-indexing benchmark, not noise swept under a frozen number.
|
|
110
|
+
|
|
111
|
+
### External corpus benchmarks
|
|
112
|
+
|
|
113
|
+
The same retriever — zero tuning, same `hitgate/run.py` pipeline — measured against 7 other
|
|
114
|
+
codebases with no corpus-specific configuration:
|
|
115
|
+
|
|
116
|
+
| Corpus | Language | n | Hit@5 | Hit@1 | MRR |
|
|
117
|
+
|---|---|---|---|---|---|
|
|
118
|
+
| FastAPI v0.115 | Python | 25 | **1.0** | 0.64 | 0.79 |
|
|
119
|
+
| forge-space / mcp-gateway | TypeScript | 20 | **1.0** | 0.70 | 0.821 |
|
|
120
|
+
| portfolio / src | React/TS | 15 | **1.0** | 0.60 | 0.778 |
|
|
121
|
+
| ai-dev-toolkit / packages/core | Python + TS | 20 | **1.0** | 0.85 | 0.925 |
|
|
122
|
+
| homelab / homelab\_manager | Python | 20 | 0.950 | 0.85 | 0.900 |
|
|
123
|
+
| Lucky / packages/backend | TypeScript | 21 | 0.905 | 0.71 | 0.810 |
|
|
124
|
+
| Criativaria / web-app | Next.js/TS | 27 | 0.741 | 0.59 | 0.660 |
|
|
125
|
+
|
|
126
|
+
Hit@5=1.0 on four of seven corpora. The two lowest-performing corpora have structural
|
|
127
|
+
causes: Lucky has one Category B drift miss (Prometheus registry vs middleware, identical
|
|
128
|
+
vocabulary); Criativaria is a homogeneous Next.js component library where sibling components
|
|
129
|
+
are lexically indistinguishable — a genuine retrieval ceiling, not a tuning problem.
|
|
130
|
+
|
|
131
|
+
The finding that matters: **corpus module clarity predicts Hit@1 better than language or
|
|
132
|
+
size.** Clean functional boundaries (homelab, ai-dev-toolkit) → 0.85. Same-layer UI components
|
|
133
|
+
(portfolio, Criativaria) → 0.59–0.60. Python vs TypeScript is not the variable.
|
|
134
|
+
|
|
135
|
+
Full methodology, miss taxonomy, and reproduce commands: [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
|
|
136
|
+
|
|
137
|
+
## How it works
|
|
138
|
+
|
|
139
|
+
- **Hybrid retrieval** — dense embeddings (`intfloat/multilingual-e5-small`) + lexical
|
|
140
|
+
BM25, fused with Reciprocal Rank Fusion. A code-aware tokenizer splits identifiers
|
|
141
|
+
into `camelCase`/`snake_case` subtokens so "get user profile" matches `getUserProfile`.
|
|
142
|
+
- **Selective reranking** (optional) — a cross-encoder reranker that, when enabled, is
|
|
143
|
+
scoped to code-scope queries only (it was measured to *help* code and *regress*
|
|
144
|
+
prose), with graceful fallback to the fused ranking if the model isn't present.
|
|
145
|
+
- **Language-aware chunking** — Python by AST symbol, TS/JS/Shell by regex, with a
|
|
146
|
+
word-count fallback.
|
|
147
|
+
- **Config by env var** — zero-setup defaults (`RAG_*`); see [`ragcore/config.py`](./ragcore/config.py).
|
|
148
|
+
- **Eval (the point)** — `hitgate/run.py` reports Hit@K/MRR for *any* retriever via
|
|
149
|
+
`--retriever`; `hitgate/check.sh` gates a run against a frozen baseline (±5pp).
|
|
150
|
+
- **Golden set generator** — `hitgate/generate.py` bootstraps candidate cases from your corpus
|
|
151
|
+
structure (docstrings, symbol names) with zero dependencies. LLM paraphrase generation is
|
|
152
|
+
opt-in via `--llm`. Output feeds directly into `hitgate/run.py --dataset`.
|
|
153
|
+
|
|
154
|
+
## Use it on your own retriever
|
|
155
|
+
|
|
156
|
+
The harness doesn't care whose retriever it's measuring. A retriever is any callable:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
retrieve(query: str, top: int, scope: str | None) -> Sequence[Mapping]
|
|
160
|
+
# results ranked best-first; each a mapping with at least "path" (optionally "start_line")
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Point the gate at yours with `--retriever module.path:callable`:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
python -m hitgate.run --retriever mypkg.myretriever:retrieve --label mine
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
A runnable, dependency-free example — a deliberately dumb keyword matcher — is in
|
|
170
|
+
[`hitgate/example_external_retriever.py`](./hitgate/example_external_retriever.py). Ecosystem
|
|
171
|
+
wrappers (LangChain / LlamaIndex) live under [`adapters/`](./adapters/README.md). Bring your
|
|
172
|
+
own retriever and corpus; keep the measurement discipline.
|
|
173
|
+
|
|
174
|
+
### Bring your own corpus — 4-step quickstart
|
|
175
|
+
|
|
176
|
+
**0. Bootstrap candidate cases from your corpus (optional):**
|
|
177
|
+
```bash
|
|
178
|
+
RAG_SOURCE_ROOTS="/path/to/your/corpus" python -m hitgate.generate \
|
|
179
|
+
--output hitgate/candidates.jsonl \
|
|
180
|
+
--min-confidence medium
|
|
181
|
+
|
|
182
|
+
# LLM-enhanced (identifier + paraphrase per chunk, no extra package needed):
|
|
183
|
+
OPENAI_API_KEY=sk-... RAG_SOURCE_ROOTS="/path/to/your/corpus" \
|
|
184
|
+
python -m hitgate.generate --llm --output hitgate/candidates.jsonl
|
|
185
|
+
```
|
|
186
|
+
Review and curate `hitgate/candidates.jsonl` — delete cases where the query is too vague
|
|
187
|
+
or the expected file is wrong — then use it as your golden set below.
|
|
188
|
+
|
|
189
|
+
**1. Write golden cases** — each is a JSON object with three fields:
|
|
190
|
+
```jsonl
|
|
191
|
+
{"query": "what handles pagination in the API", "expect_path_contains": "api/pagination.py", "expect_scope": "code"}
|
|
192
|
+
{"query": "where are rate limits configured", "expect_path_contains": "config/limits.yaml", "expect_scope": "code"}
|
|
193
|
+
```
|
|
194
|
+
`expect_path_contains` is a substring of the expected result's path (file name is usually enough).
|
|
195
|
+
Aim for 20–50 cases across a mix of identifier lookups and paraphrase queries. Save as any `.jsonl`.
|
|
196
|
+
|
|
197
|
+
**2. Run your retriever against the cases:**
|
|
198
|
+
```bash
|
|
199
|
+
python -m hitgate.run \
|
|
200
|
+
--retriever mypkg.myretriever:retrieve \
|
|
201
|
+
--dataset my_golden.jsonl \
|
|
202
|
+
--label baseline-v1
|
|
203
|
+
# writes hitgate/baseline-v1.json with hit@1/hit@3/hit@5/mrr + per_case breakdown
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**3. Freeze the baseline:**
|
|
207
|
+
```bash
|
|
208
|
+
cp hitgate/baseline-v1.json hitgate/baseline.my-project.json
|
|
209
|
+
# edit _note to record conditions: corpus, model, date
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**4. Gate future runs with check.sh:**
|
|
213
|
+
```bash
|
|
214
|
+
# hitgate/check.sh already reads BASELINE_FILE env var
|
|
215
|
+
BASELINE_FILE=hitgate/baseline.my-project.json \
|
|
216
|
+
RAG_SOURCE_ROOTS="/path/to/your/corpus" \
|
|
217
|
+
python -m hitgate.run --retriever mypkg.myretriever:retrieve --dataset my_golden.jsonl --label ci
|
|
218
|
+
bash hitgate/check.sh hitgate/ci.json hitgate/baseline.my-project.json
|
|
219
|
+
# exits 1 if any metric regresses by more than 5pp
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
To diff two runs case-by-case: `python -m hitgate.diff hitgate/baseline-v1.json hitgate/ci.json`.
|
|
223
|
+
|
|
224
|
+
## What to adopt (and what to skip)
|
|
225
|
+
|
|
226
|
+
**Adopt the harness.** The reusable thing here is `hitgate/` — the label-free, regression-gated
|
|
227
|
+
quality check and the `--retriever` interface. The bundled hybrid engine is a reference
|
|
228
|
+
implementation, not the product. What this is **not**:
|
|
229
|
+
|
|
230
|
+
- **Not a framework or a hosted service** — no plugin marketplace, no SaaS. Fork the harness;
|
|
231
|
+
the retriever is swappable by design.
|
|
232
|
+
- **Not state-of-the-art retrieval research** — a pragmatic single-user system that
|
|
233
|
+
knows its own ceiling and stops there.
|
|
234
|
+
- **Not a maintained project** — a solo operator's personal tool, shared for the
|
|
235
|
+
methodology. Issues and PRs are welcome but may not be triaged; expect best-effort,
|
|
236
|
+
no SLA. The eval workflow is an *advisory* gate (it proves the numbers reproduce), not
|
|
237
|
+
a support promise.
|
|
238
|
+
|
|
239
|
+
Other conventional repo furniture — `CONTRIBUTING`, issue templates, a badge wall — is
|
|
240
|
+
**deliberately** omitted, not unfinished. [DECISIONS.md](./DECISIONS.md) records what's
|
|
241
|
+
left out on purpose and the trigger that would reopen each.
|
|
242
|
+
|
|
243
|
+
## Extending
|
|
244
|
+
|
|
245
|
+
The core indexes code + docs + commits and nothing else, on purpose. Tool-specific
|
|
246
|
+
sources (assistant transcripts, code-graphs, other memory stores) plug in as opt-in
|
|
247
|
+
adapters — see [`adapters/README.md`](./adapters/README.md).
|
|
248
|
+
|
|
249
|
+
## Where this could go
|
|
250
|
+
|
|
251
|
+
Candidate experiments — each gated on a measured win, none promised — are written up in
|
|
252
|
+
[ROADMAP.md](./ROADMAP.md). They're directions, not commitments.
|
|
253
|
+
|
|
254
|
+
## License
|
|
255
|
+
|
|
256
|
+
MIT — see [LICENSE](./LICENSE). Use the methodology freely.
|
hitgate-0.1.0/README.md
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
# hitgate
|
|
2
|
+
|
|
3
|
+
[![eval](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml/badge.svg)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml)
|
|
4
|
+
|
|
5
|
+
**`pip install hitgate`** installs the harness (dependency-free; measures *any* retriever via `--retriever`).
|
|
6
|
+
**`pip install "hitgate[hybrid]"`** adds the bundled hybrid retriever used in the demo below.
|
|
7
|
+
|
|
8
|
+
> **A pytest-style regression gate for retrieval quality** — plus the small hybrid
|
|
9
|
+
> retriever it was built to measure. Point it at *your* retriever and find out whether a
|
|
10
|
+
> change helped or hurt, when you have **no labeled data and no users to A/B against**.
|
|
11
|
+
|
|
12
|
+
**Status:** working · stable · single-author personal tooling, published for the
|
|
13
|
+
*methodology*. The adoptable part is the **harness**: a label-free, regression-gated quality
|
|
14
|
+
check for any retriever (`--retriever module:callable`). The bundled hybrid engine is just
|
|
15
|
+
the thing it measures.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Why this exists
|
|
20
|
+
|
|
21
|
+
Building RAG is easy; *knowing whether a change made it better or worse* is the hard
|
|
22
|
+
part. With a small corpus and a single user you have none of the production crutches —
|
|
23
|
+
no click logs, no A/B traffic, no annotation budget. This repo is one answer: treat
|
|
24
|
+
retrieval quality as a **measurable, regression-gated property**, like a test suite,
|
|
25
|
+
and be ruthlessly honest about what the numbers do and don't prove.
|
|
26
|
+
|
|
27
|
+
## Quickstart (reproducible in ~10 seconds)
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install -e ".[hybrid]" # harness core is dependency-free; [hybrid] adds the bundled retriever
|
|
31
|
+
|
|
32
|
+
# Index this repo into a local ./.rag-index/ (the tool indexes itself)
|
|
33
|
+
RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build
|
|
34
|
+
|
|
35
|
+
# Ask it something
|
|
36
|
+
RAG_SOURCE_ROOTS="$PWD" python -m ragcore.query --scope code "how does the reranker fall back"
|
|
37
|
+
|
|
38
|
+
# Run the eval gate (bundled retriever)
|
|
39
|
+
RAG_RERANK_AUTO=off python -m hitgate.run --label demo
|
|
40
|
+
|
|
41
|
+
# ...or point the SAME gate at YOUR retriever — any callable (query, top, scope) -> [{"path": ...}]
|
|
42
|
+
python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve --label mine
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
That eval indexes the repo's own source and scores 101 golden cases against it — so
|
|
46
|
+
**you can reproduce the number below yourself**, no private data required.
|
|
47
|
+
|
|
48
|
+
## Results
|
|
49
|
+
|
|
50
|
+
### Self-indexed demo (reproducible)
|
|
51
|
+
|
|
52
|
+
| Metric | Value |
|
|
53
|
+
|---|---|
|
|
54
|
+
| **Hit@5** (code scope, pure hybrid) — *the regression-gated headline* | **1.0** |
|
|
55
|
+
| Hit@1 | 0.663 |
|
|
56
|
+
| MRR | 0.800 |
|
|
57
|
+
| Corpus | this repo, self-indexed · 101 cases |
|
|
58
|
+
|
|
59
|
+
67 of 101 cases hit at rank 1; the misses are left in on purpose. Inflating a benchmark by
|
|
60
|
+
quietly dropping the cases it fails is the first thing this project refuses to do — see
|
|
61
|
+
[DECISIONS.md](./DECISIONS.md); measured before/after deltas are in
|
|
62
|
+
[CHANGELOG.md](./CHANGELOG.md). An honest ablation — where **BM25-only wins Hit@1**
|
|
63
|
+
(0.522) while **hybrid wins Hit@5** (1.0) — is walked through in
|
|
64
|
+
[docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
|
|
65
|
+
|
|
66
|
+
Because the demo indexes **this repo itself**, the corpus grows as the repo does, so
|
|
67
|
+
Hit@1 and MRR drift over time — adding a file can demote a borderline case. That's why
|
|
68
|
+
**Hit@5 is the number under regression gate** (`hitgate/check.sh`, ±5pp). The drift is the
|
|
69
|
+
honest behavior of a self-indexing benchmark, not noise swept under a frozen number.
|
|
70
|
+
|
|
71
|
+
### External corpus benchmarks
|
|
72
|
+
|
|
73
|
+
The same retriever — zero tuning, same `hitgate/run.py` pipeline — measured against 7 other
|
|
74
|
+
codebases with no corpus-specific configuration:
|
|
75
|
+
|
|
76
|
+
| Corpus | Language | n | Hit@5 | Hit@1 | MRR |
|
|
77
|
+
|---|---|---|---|---|---|
|
|
78
|
+
| FastAPI v0.115 | Python | 25 | **1.0** | 0.64 | 0.79 |
|
|
79
|
+
| forge-space / mcp-gateway | TypeScript | 20 | **1.0** | 0.70 | 0.821 |
|
|
80
|
+
| portfolio / src | React/TS | 15 | **1.0** | 0.60 | 0.778 |
|
|
81
|
+
| ai-dev-toolkit / packages/core | Python + TS | 20 | **1.0** | 0.85 | 0.925 |
|
|
82
|
+
| homelab / homelab\_manager | Python | 20 | 0.950 | 0.85 | 0.900 |
|
|
83
|
+
| Lucky / packages/backend | TypeScript | 21 | 0.905 | 0.71 | 0.810 |
|
|
84
|
+
| Criativaria / web-app | Next.js/TS | 27 | 0.741 | 0.59 | 0.660 |
|
|
85
|
+
|
|
86
|
+
Hit@5=1.0 on four of seven corpora. The two lowest-performing corpora have structural
|
|
87
|
+
causes: Lucky has one Category B drift miss (Prometheus registry vs middleware, identical
|
|
88
|
+
vocabulary); Criativaria is a homogeneous Next.js component library where sibling components
|
|
89
|
+
are lexically indistinguishable — a genuine retrieval ceiling, not a tuning problem.
|
|
90
|
+
|
|
91
|
+
The finding that matters: **corpus module clarity predicts Hit@1 better than language or
|
|
92
|
+
size.** Clean functional boundaries (homelab, ai-dev-toolkit) → 0.85. Same-layer UI components
|
|
93
|
+
(portfolio, Criativaria) → 0.59–0.60. Python vs TypeScript is not the variable.
|
|
94
|
+
|
|
95
|
+
Full methodology, miss taxonomy, and reproduce commands: [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
|
|
96
|
+
|
|
97
|
+
## How it works
|
|
98
|
+
|
|
99
|
+
- **Hybrid retrieval** — dense embeddings (`intfloat/multilingual-e5-small`) + lexical
|
|
100
|
+
BM25, fused with Reciprocal Rank Fusion. A code-aware tokenizer splits identifiers
|
|
101
|
+
into `camelCase`/`snake_case` subtokens so "get user profile" matches `getUserProfile`.
|
|
102
|
+
- **Selective reranking** (optional) — a cross-encoder reranker that, when enabled, is
|
|
103
|
+
scoped to code-scope queries only (it was measured to *help* code and *regress*
|
|
104
|
+
prose), with graceful fallback to the fused ranking if the model isn't present.
|
|
105
|
+
- **Language-aware chunking** — Python by AST symbol, TS/JS/Shell by regex, with a
|
|
106
|
+
word-count fallback.
|
|
107
|
+
- **Config by env var** — zero-setup defaults (`RAG_*`); see [`ragcore/config.py`](./ragcore/config.py).
|
|
108
|
+
- **Eval (the point)** — `hitgate/run.py` reports Hit@K/MRR for *any* retriever via
|
|
109
|
+
`--retriever`; `hitgate/check.sh` gates a run against a frozen baseline (±5pp).
|
|
110
|
+
- **Golden set generator** — `hitgate/generate.py` bootstraps candidate cases from your corpus
|
|
111
|
+
structure (docstrings, symbol names) with zero dependencies. LLM paraphrase generation is
|
|
112
|
+
opt-in via `--llm`. Output feeds directly into `hitgate/run.py --dataset`.
|
|
113
|
+
|
|
114
|
+
## Use it on your own retriever
|
|
115
|
+
|
|
116
|
+
The harness doesn't care whose retriever it's measuring. A retriever is any callable:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
retrieve(query: str, top: int, scope: str | None) -> Sequence[Mapping]
|
|
120
|
+
# results ranked best-first; each a mapping with at least "path" (optionally "start_line")
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Point the gate at yours with `--retriever module.path:callable`:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
python -m hitgate.run --retriever mypkg.myretriever:retrieve --label mine
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
A runnable, dependency-free example — a deliberately dumb keyword matcher — is in
|
|
130
|
+
[`hitgate/example_external_retriever.py`](./hitgate/example_external_retriever.py). Ecosystem
|
|
131
|
+
wrappers (LangChain / LlamaIndex) live under [`adapters/`](./adapters/README.md). Bring your
|
|
132
|
+
own retriever and corpus; keep the measurement discipline.
|
|
133
|
+
|
|
134
|
+
### Bring your own corpus — 4-step quickstart
|
|
135
|
+
|
|
136
|
+
**0. Bootstrap candidate cases from your corpus (optional):**
|
|
137
|
+
```bash
|
|
138
|
+
RAG_SOURCE_ROOTS="/path/to/your/corpus" python -m hitgate.generate \
|
|
139
|
+
--output hitgate/candidates.jsonl \
|
|
140
|
+
--min-confidence medium
|
|
141
|
+
|
|
142
|
+
# LLM-enhanced (identifier + paraphrase per chunk, no extra package needed):
|
|
143
|
+
OPENAI_API_KEY=sk-... RAG_SOURCE_ROOTS="/path/to/your/corpus" \
|
|
144
|
+
python -m hitgate.generate --llm --output hitgate/candidates.jsonl
|
|
145
|
+
```
|
|
146
|
+
Review and curate `hitgate/candidates.jsonl` — delete cases where the query is too vague
|
|
147
|
+
or the expected file is wrong — then use it as your golden set below.
|
|
148
|
+
|
|
149
|
+
**1. Write golden cases** — each is a JSON object with three fields:
|
|
150
|
+
```jsonl
|
|
151
|
+
{"query": "what handles pagination in the API", "expect_path_contains": "api/pagination.py", "expect_scope": "code"}
|
|
152
|
+
{"query": "where are rate limits configured", "expect_path_contains": "config/limits.yaml", "expect_scope": "code"}
|
|
153
|
+
```
|
|
154
|
+
`expect_path_contains` is a substring of the expected result's path (file name is usually enough).
|
|
155
|
+
Aim for 20–50 cases across a mix of identifier lookups and paraphrase queries. Save as any `.jsonl`.
|
|
156
|
+
|
|
157
|
+
**2. Run your retriever against the cases:**
|
|
158
|
+
```bash
|
|
159
|
+
python -m hitgate.run \
|
|
160
|
+
--retriever mypkg.myretriever:retrieve \
|
|
161
|
+
--dataset my_golden.jsonl \
|
|
162
|
+
--label baseline-v1
|
|
163
|
+
# writes hitgate/baseline-v1.json with hit@1/hit@3/hit@5/mrr + per_case breakdown
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**3. Freeze the baseline:**
|
|
167
|
+
```bash
|
|
168
|
+
cp hitgate/baseline-v1.json hitgate/baseline.my-project.json
|
|
169
|
+
# edit _note to record conditions: corpus, model, date
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
**4. Gate future runs with check.sh:**
|
|
173
|
+
```bash
|
|
174
|
+
# hitgate/check.sh already reads BASELINE_FILE env var
|
|
175
|
+
BASELINE_FILE=hitgate/baseline.my-project.json \
|
|
176
|
+
RAG_SOURCE_ROOTS="/path/to/your/corpus" \
|
|
177
|
+
python -m hitgate.run --retriever mypkg.myretriever:retrieve --dataset my_golden.jsonl --label ci
|
|
178
|
+
bash hitgate/check.sh hitgate/ci.json hitgate/baseline.my-project.json
|
|
179
|
+
# exits 1 if any metric regresses by more than 5pp
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
To diff two runs case-by-case: `python -m hitgate.diff hitgate/baseline-v1.json hitgate/ci.json`.
|
|
183
|
+
|
|
184
|
+
## What to adopt (and what to skip)
|
|
185
|
+
|
|
186
|
+
**Adopt the harness.** The reusable thing here is `hitgate/` — the label-free, regression-gated
|
|
187
|
+
quality check and the `--retriever` interface. The bundled hybrid engine is a reference
|
|
188
|
+
implementation, not the product. What this is **not**:
|
|
189
|
+
|
|
190
|
+
- **Not a framework or a hosted service** — no plugin marketplace, no SaaS. Fork the harness;
|
|
191
|
+
the retriever is swappable by design.
|
|
192
|
+
- **Not state-of-the-art retrieval research** — a pragmatic single-user system that
|
|
193
|
+
knows its own ceiling and stops there.
|
|
194
|
+
- **Not a maintained project** — a solo operator's personal tool, shared for the
|
|
195
|
+
methodology. Issues and PRs are welcome but may not be triaged; expect best-effort,
|
|
196
|
+
no SLA. The eval workflow is an *advisory* gate (it proves the numbers reproduce), not
|
|
197
|
+
a support promise.
|
|
198
|
+
|
|
199
|
+
Other conventional repo furniture — `CONTRIBUTING`, issue templates, a badge wall — is
|
|
200
|
+
**deliberately** omitted, not unfinished. [DECISIONS.md](./DECISIONS.md) records what's
|
|
201
|
+
left out on purpose and the trigger that would reopen each.
|
|
202
|
+
|
|
203
|
+
## Extending
|
|
204
|
+
|
|
205
|
+
The core indexes code + docs + commits and nothing else, on purpose. Tool-specific
|
|
206
|
+
sources (assistant transcripts, code-graphs, other memory stores) plug in as opt-in
|
|
207
|
+
adapters — see [`adapters/README.md`](./adapters/README.md).
|
|
208
|
+
|
|
209
|
+
## Where this could go
|
|
210
|
+
|
|
211
|
+
Candidate experiments — each gated on a measured win, none promised — are written up in
|
|
212
|
+
[ROADMAP.md](./ROADMAP.md). They're directions, not commitments.
|
|
213
|
+
|
|
214
|
+
## License
|
|
215
|
+
|
|
216
|
+
MIT — see [LICENSE](./LICENSE). Use the methodology freely.
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""hitgate/audit_contamination.py — find un-winnable cases in an eval set.
|
|
3
|
+
|
|
4
|
+
The most insidious way a retrieval benchmark lies is *contamination*: a "golden"
|
|
5
|
+
case whose expected answer isn't in the indexed corpus at all. Such a case can
|
|
6
|
+
only ever miss, so it caps the score with a constant penalty that looks like a
|
|
7
|
+
quality floor — and every decision made on that number inherits the lie. (This is
|
|
8
|
+
the audit that moved this project's own baseline ~8pp; see DECISIONS.md.)
|
|
9
|
+
|
|
10
|
+
This script makes that audit reusable. Point it at any eval set (same schema as
|
|
11
|
+
hitgate/golden.demo.jsonl) and an index, and it classifies every case:
|
|
12
|
+
|
|
13
|
+
ok — the expected path is indexed within the case's declared scope
|
|
14
|
+
scope-mismatch — the path is indexed, but only OUTSIDE the declared scope
|
|
15
|
+
CONTAMINATED — the expected path is not in the corpus at all → un-winnable
|
|
16
|
+
|
|
17
|
+
Exit 0 if no contamination, 1 if any case is un-winnable (so it can gate a build).
|
|
18
|
+
scope-mismatch is reported as a warning, not a failure.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build # build the index first
|
|
22
|
+
RAG_SOURCE_ROOTS="$PWD" python -m hitgate.audit_contamination
|
|
23
|
+
python -m hitgate.audit_contamination --dataset path/to/your.jsonl
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import json
|
|
29
|
+
import sqlite3
|
|
30
|
+
import sys
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
ROOT = Path(__file__).resolve().parent.parent
|
|
34
|
+
from ragcore.config import DB # honors RAG_INDEX_DIR
|
|
35
|
+
|
|
36
|
+
DEFAULT_DATASET = ROOT / "hitgate" / "golden.demo.jsonl"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _expected_substrings(case: dict) -> list[str]:
|
|
40
|
+
"""Non-empty expected path substrings for a case (empties dropped — '' matches every path)."""
|
|
41
|
+
raw = case["expect_path_contains"]
|
|
42
|
+
raw = raw if isinstance(raw, list) else [raw]
|
|
43
|
+
return [e for e in raw if isinstance(e, str) and e.strip()]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def load_cases(path: Path) -> list[dict]:
    """Load the auditable cases from a JSONL eval set.

    Blank lines are ignored. Lines whose JSON value is not an object, or whose
    object has no ``expect_path_contains`` key, are skipped so that files using
    other schemas are tolerated.

    Args:
        path: Location of the JSONL file (read as UTF-8).

    Returns:
        The parsed case dicts, in file order.

    Raises:
        SystemExit: If the file does not exist, a line is not valid JSON, or a
            case declares no non-empty expected substring.
    """
    if not path.exists():
        sys.exit(f"dataset not found: {path}")
    cases = []
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding. Iterating the file splits on newlines only, whereas
    # str.splitlines() also breaks on U+2028/U+2029/U+0085, which may appear
    # unescaped inside a JSON string and would cut a valid record in half.
    with path.open(encoding="utf-8") as fh:
        for i, line in enumerate(fh, 1):
            if not line.strip():
                continue
            try:
                case = json.loads(line)
            except json.JSONDecodeError as e:
                sys.exit(f"{path}:{i}: invalid JSON — {e}")
            # A bare scalar or array is not a case. Without the isinstance
            # check an int crashes the membership test and a string can pass it.
            if not isinstance(case, dict) or "expect_path_contains" not in case:
                continue  # tolerate other schemas by skipping
            if not _expected_substrings(case):
                sys.exit(
                    f"{path}:{i}: case {case.get('query', '?')!r} has empty expect_path_contains "
                    f"— a malformed eval case (an empty substring matches every path). Fix the eval set."
                )
            cases.append(case)
    return cases
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def load_corpus(db: Path) -> list[tuple[str, str]]:
    """Fetch a ``(source_type, path)`` pair for every row of the index's ``chunks`` table.

    Exits with a message if no file exists at ``db``.
    """
    if not db.exists():
        sys.exit(f"no index at {db} — run ragcore/build.py first")

    connection = sqlite3.connect(db)
    try:
        cursor = connection.execute("SELECT source_type, path FROM chunks")
        rows = cursor.fetchall()
    finally:
        connection.close()  # release the handle even if the query fails
    return rows
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def classify(case: dict, corpus: list[tuple[str, str]]) -> str:
    """Return the audit verdict for one eval case against the indexed corpus.

    ``corpus`` is a list of ``(source_type, path)`` pairs. The verdict is:

    * ``"CONTAMINATED"`` when no corpus path contains any expected substring;
    * ``"scope-mismatch"`` when the case declares ``expect_scope`` and none of
      the matching entries has a source type in that scope;
    * ``"ok"`` otherwise.
    """
    needles = _expected_substrings(case)  # already validated non-empty in load_cases

    # expect_scope may be absent/empty, a single value, or a list of values.
    declared = case.get("expect_scope")
    if isinstance(declared, list):
        allowed = declared
    elif declared:
        allowed = [declared]
    else:
        allowed = []

    # Source types of every corpus entry whose path contains an expected substring.
    hit_types = [
        source_type
        for source_type, path in corpus
        if any(needle in path for needle in needles)
    ]

    if not hit_types:
        return "CONTAMINATED"
    if not allowed:
        return "ok"  # no scope declared: a match anywhere is enough
    if any(source_type in allowed for source_type in hit_types):
        return "ok"
    return "scope-mismatch"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def resolve_dataset(arg: str) -> Path:
    """Turn the ``--dataset`` argument into a path.

    An absolute path is returned untouched, without checking that it exists.
    A relative one is looked up under the current working directory first and
    under ``ROOT`` second; the first location that exists is returned. If
    neither exists, the process exits with a message listing both locations.
    """
    given = Path(arg)
    if given.is_absolute():
        return given

    tried = [Path.cwd() / arg, ROOT / arg]
    found = next((location for location in tried if location.exists()), None)
    if found is None:
        sys.exit("dataset not found — tried " + " and ".join(map(str, tried)))
    return found
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def main() -> int:
    """Run the contamination audit from the command line.

    Parses ``--dataset``, loads the eval cases and the indexed corpus,
    classifies every case, then prints a summary followed by one entry per
    scope-mismatch and per CONTAMINATED case.

    Returns:
        1 if at least one case is CONTAMINATED, otherwise 0. scope-mismatch
        cases are reported as warnings and do not affect the result.

    Raises:
        SystemExit: If the dataset contains no usable cases (the loaders it
            calls also exit on a missing dataset or index).
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--dataset", default=str(DEFAULT_DATASET),
                    help="eval jsonl to audit (absolute, or relative to cwd or repo root)")
    args = ap.parse_args()

    dataset = resolve_dataset(args.dataset)
    cases = load_cases(dataset)
    if not cases:
        sys.exit(f"no usable cases in {dataset}")
    corpus = load_corpus(DB)

    def query_label(case: dict) -> str:
        # load_cases accepts cases without a "query" (it reports them as '?'),
        # so the report must not index case['query'] directly. str() keeps the
        # slice valid if the query is not a string.
        return repr(str(case.get("query", "?"))[:70])

    verdicts: dict[str, list[dict]] = {"ok": [], "scope-mismatch": [], "CONTAMINATED": []}
    for case in cases:
        verdicts[classify(case, corpus)].append(case)

    n = len(cases)
    print(f"contamination audit: {n} cases vs {len(corpus)} indexed chunks ({DB})")
    print(f" ok: {len(verdicts['ok'])}")
    print(f" scope-mismatch: {len(verdicts['scope-mismatch'])}")
    print(f" CONTAMINATED: {len(verdicts['CONTAMINATED'])}")

    for case in verdicts["scope-mismatch"]:
        print(f"\n ⚠ scope-mismatch: {query_label(case)}")
        print(f" expects {case['expect_path_contains']} in scope={case.get('expect_scope')}, found only out of scope")
    for case in verdicts["CONTAMINATED"]:
        print(f"\n ✗ CONTAMINATED: {query_label(case)}")
        print(f" expects {case['expect_path_contains']} — not in the corpus; this case is un-winnable")

    # Only contamination fails the run; scope mismatches are warnings.
    if verdicts["CONTAMINATED"]:
        print(f"\n{len(verdicts['CONTAMINATED'])} un-winnable case(s) — remove them or fix the corpus before trusting the score.")
        return 1
    print("\n✓ no contamination — every case's answer is in the corpus.")
    return 0
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
if __name__ == "__main__":
|
|
146
|
+
sys.exit(main())
|