hitgate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. hitgate-0.1.0/LICENSE +21 -0
  2. hitgate-0.1.0/PKG-INFO +256 -0
  3. hitgate-0.1.0/README.md +216 -0
  4. hitgate-0.1.0/hitgate/__init__.py +5 -0
  5. hitgate-0.1.0/hitgate/audit_contamination.py +146 -0
  6. hitgate-0.1.0/hitgate/compare.py +144 -0
  7. hitgate-0.1.0/hitgate/diff.py +109 -0
  8. hitgate-0.1.0/hitgate/example_external_retriever.py +48 -0
  9. hitgate-0.1.0/hitgate/generate.py +438 -0
  10. hitgate-0.1.0/hitgate/plot_history.py +147 -0
  11. hitgate-0.1.0/hitgate/run.py +166 -0
  12. hitgate-0.1.0/hitgate/test_determinism.py +68 -0
  13. hitgate-0.1.0/hitgate.egg-info/PKG-INFO +256 -0
  14. hitgate-0.1.0/hitgate.egg-info/SOURCES.txt +36 -0
  15. hitgate-0.1.0/hitgate.egg-info/dependency_links.txt +1 -0
  16. hitgate-0.1.0/hitgate.egg-info/entry_points.txt +7 -0
  17. hitgate-0.1.0/hitgate.egg-info/requires.txt +5 -0
  18. hitgate-0.1.0/hitgate.egg-info/top_level.txt +2 -0
  19. hitgate-0.1.0/pyproject.toml +37 -0
  20. hitgate-0.1.0/ragcore/__init__.py +1 -0
  21. hitgate-0.1.0/ragcore/build.py +353 -0
  22. hitgate-0.1.0/ragcore/chunkers.py +180 -0
  23. hitgate-0.1.0/ragcore/config.py +92 -0
  24. hitgate-0.1.0/ragcore/mcp_server.py +142 -0
  25. hitgate-0.1.0/ragcore/pack.py +121 -0
  26. hitgate-0.1.0/ragcore/query.py +92 -0
  27. hitgate-0.1.0/ragcore/retrieval.py +359 -0
  28. hitgate-0.1.0/setup.cfg +4 -0
  29. hitgate-0.1.0/tests/test_chunkers.py +326 -0
  30. hitgate-0.1.0/tests/test_compare.py +118 -0
  31. hitgate-0.1.0/tests/test_generate.py +425 -0
  32. hitgate-0.1.0/tests/test_harness.py +57 -0
  33. hitgate-0.1.0/tests/test_langchain_adapter.py +50 -0
  34. hitgate-0.1.0/tests/test_langfuse_adapter.py +192 -0
  35. hitgate-0.1.0/tests/test_llamaindex_adapter.py +80 -0
  36. hitgate-0.1.0/tests/test_mcp_server.py +179 -0
  37. hitgate-0.1.0/tests/test_rank_modes.py +59 -0
  38. hitgate-0.1.0/tests/test_tokenizer.py +33 -0
hitgate-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lucas Santana
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hitgate-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: hitgate
3
+ Version: 0.1.0
4
+ Summary: A label-free retrieval-quality regression gate: measure any retriever's ranking (Hit@K/MRR) on your own corpus, no labels, gated in CI like a test.
5
+ Author: Lucas Santana
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Lucas Santana
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/LucasSantana-Dev/evidence-first-rag
29
+ Project-URL: Repository, https://github.com/LucasSantana-Dev/evidence-first-rag
30
+ Project-URL: Methodology, https://github.com/LucasSantana-Dev/evidence-first-rag/blob/main/docs/METHODOLOGY.md
31
+ Keywords: rag,retrieval,evaluation,hit@k,mrr,regression-gate,ci,bm25,reranking,search
32
+ Requires-Python: >=3.10
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: hybrid
36
+ Requires-Dist: sentence-transformers>=2.2; extra == "hybrid"
37
+ Requires-Dist: rank-bm25>=0.2; extra == "hybrid"
38
+ Requires-Dist: numpy>=1.24; extra == "hybrid"
39
+ Dynamic: license-file
40
+
41
+ # hitgate
42
+
43
+ [![eval-gate (advisory)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml/badge.svg)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml)
44
+
45
+ **`pip install hitgate`** installs the harness (dependency-free; measures *any* retriever via `--retriever`).
46
+ **`pip install "hitgate[hybrid]"`** adds the bundled hybrid retriever used in the demo below.
47
+
48
+ > **A pytest-style regression gate for retrieval quality** — plus the small hybrid
49
+ > retriever it was built to measure. Point it at *your* retriever and find out whether a
50
+ > change helped or hurt, when you have **no labeled data and no users to A/B against**.
51
+
52
+ **Status:** working · stable · single-author personal tooling, published for the
53
+ *methodology*. The adoptable part is the **harness**: a label-free, regression-gated quality
54
+ check for any retriever (`--retriever module:callable`). The bundled hybrid engine is just
55
+ the thing it measures.
56
+
57
+ ---
58
+
59
+ ## Why this exists
60
+
61
+ Building RAG is easy; *knowing whether a change made it better or worse* is the hard
62
+ part. With a small corpus and a single user you have none of the production crutches —
63
+ no click logs, no A/B traffic, no annotation budget. This repo is one answer: treat
64
+ retrieval quality as a **measurable, regression-gated property**, like a test suite,
65
+ and be ruthlessly honest about what the numbers do and don't prove.
66
+
67
+ ## Quickstart (reproducible in ~10 seconds)
68
+
69
+ ```bash
70
+ pip install -e ".[hybrid]" # harness core is dependency-free; [hybrid] adds the bundled retriever
71
+
72
+ # Index this repo into a local ./.rag-index/ (the tool indexes itself)
73
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build
74
+
75
+ # Ask it something
76
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.query --scope code "how does the reranker fall back"
77
+
78
+ # Run the eval gate (bundled retriever)
79
+ RAG_RERANK_AUTO=off python -m hitgate.run --label demo
80
+
81
+ # ...or point the SAME gate at YOUR retriever — any callable (query, top, scope) -> [{"path": ...}]
82
+ python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve --label mine
83
+ ```
84
+
85
+ That eval indexes the repo's own source and scores 101 golden cases against it — so
86
+ **you can reproduce the number below yourself**, no private data required.
87
+
88
+ ## Results
89
+
90
+ ### Self-indexed demo (reproducible)
91
+
92
+ | Metric | Value |
93
+ |---|---|
94
+ | **Hit@5** (code scope, pure hybrid) — *the regression-gated headline* | **1.0** |
95
+ | Hit@1 | 0.663 |
96
+ | MRR | 0.800 |
97
+ | Corpus | this repo, self-indexed · 101 cases |
98
+
99
+ 67 of 101 cases hit at rank 1; the misses are left in on purpose. Inflating a benchmark by
100
+ quietly dropping the cases it fails is the first thing this project refuses to do — see
101
+ [DECISIONS.md](./DECISIONS.md); measured before/after deltas are in
102
+ [CHANGELOG.md](./CHANGELOG.md). An honest ablation — where **BM25-only wins Hit@1**
103
+ (0.522) while **hybrid wins Hit@5** (1.0) — is walked through in
104
+ [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
105
+
106
+ Because the demo indexes **this repo itself**, the corpus grows as the repo does, so
107
+ Hit@1 and MRR drift over time — adding a file can demote a borderline case. That's why
108
+ **Hit@5 is the number under regression gate** (`hitgate/check.sh`, ±5pp). The drift is the
109
+ honest behavior of a self-indexing benchmark, not noise swept under a frozen number.
110
+
111
+ ### External corpus benchmarks
112
+
113
+ The same retriever — zero tuning, same `hitgate/run.py` pipeline — measured against 7 other
114
+ codebases with no corpus-specific configuration:
115
+
116
+ | Corpus | Language | n | Hit@5 | Hit@1 | MRR |
117
+ |---|---|---|---|---|---|
118
+ | FastAPI v0.115 | Python | 25 | **1.0** | 0.64 | 0.79 |
119
+ | forge-space / mcp-gateway | TypeScript | 20 | **1.0** | 0.70 | 0.821 |
120
+ | portfolio / src | React/TS | 15 | **1.0** | 0.60 | 0.778 |
121
+ | ai-dev-toolkit / packages/core | Python + TS | 20 | **1.0** | 0.85 | 0.925 |
122
+ | homelab / homelab\_manager | Python | 20 | 0.950 | 0.85 | 0.900 |
123
+ | Lucky / packages/backend | TypeScript | 21 | 0.905 | 0.71 | 0.810 |
124
+ | Criativaria / web-app | Next.js/TS | 27 | 0.741 | 0.59 | 0.660 |
125
+
126
+ Hit@5=1.0 on four of seven corpora. The two lowest-performing corpora have structural
127
+ causes: Lucky has one Category B drift miss (Prometheus registry vs middleware, identical
128
+ vocabulary); Criativaria is a homogeneous Next.js component library where sibling components
129
+ are lexically indistinguishable — a genuine retrieval ceiling, not a tuning problem.
130
+
131
+ The finding that matters: **corpus module clarity predicts Hit@1 better than language or
132
+ size.** Clean functional boundaries (homelab, ADT) → 0.85. Same-layer UI components
133
+ (portfolio, Criativaria) → 0.59–0.60. Python vs TypeScript is not the variable.
134
+
135
+ Full methodology, miss taxonomy, and reproduce commands: [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
136
+
137
+ ## How it works
138
+
139
+ - **Hybrid retrieval** — dense embeddings (`intfloat/multilingual-e5-small`) + lexical
140
+ BM25, fused with Reciprocal Rank Fusion. A code-aware tokenizer splits identifiers
141
+ into `camelCase`/`snake_case` subtokens so "get user profile" matches `getUserProfile`.
142
+ - **Selective reranking** (optional) — a cross-encoder reranker that, when enabled, is
143
+ scoped to code-scope queries only (it was measured to *help* code and *regress*
144
+ prose), with graceful fallback to the fused ranking if the model isn't present.
145
+ - **Language-aware chunking** — Python by AST symbol, TS/JS/Shell by regex, with a
146
+ word-count fallback.
147
+ - **Config by env var** — zero-setup defaults (`RAG_*`); see [`ragcore/config.py`](./ragcore/config.py).
148
+ - **Eval (the point)** — `hitgate/run.py` reports Hit@K/MRR for *any* retriever via
149
+ `--retriever`; `hitgate/check.sh` gates a run against a frozen baseline (±5pp).
150
+ - **Golden set generator** — `hitgate/generate.py` bootstraps candidate cases from your corpus
151
+ structure (docstrings, symbol names) with zero dependencies. LLM paraphrase generation is
152
+ opt-in via `--llm`. Output feeds directly into `hitgate/run.py --dataset`.
153
+
154
+ ## Use it on your own retriever
155
+
156
+ The harness doesn't care whose retriever it's measuring. A retriever is any callable:
157
+
158
+ ```python
159
+ retrieve(query: str, top: int, scope: str | None) -> Sequence[Mapping]
160
+ # results ranked best-first; each a mapping with at least "path" (optionally "start_line")
161
+ ```
162
+
163
+ Point the gate at yours with `--retriever module.path:callable`:
164
+
165
+ ```bash
166
+ python -m hitgate.run --retriever mypkg.myretriever:retrieve --label mine
167
+ ```
168
+
169
+ A runnable, dependency-free example — a deliberately dumb keyword matcher — is in
170
+ [`hitgate/example_external_retriever.py`](./hitgate/example_external_retriever.py). Ecosystem
171
+ wrappers (LangChain / LlamaIndex) live under [`adapters/`](./adapters/README.md). Bring your
172
+ own retriever and corpus; keep the measurement discipline.
173
+
174
+ ### Bring your own corpus — 4-step quickstart
175
+
176
+ **0. Bootstrap candidate cases from your corpus (optional):**
177
+ ```bash
178
+ RAG_SOURCE_ROOTS="/path/to/your/corpus" python -m hitgate.generate \
179
+ --output hitgate/candidates.jsonl \
180
+ --min-confidence medium
181
+
182
+ # LLM-enhanced (identifier + paraphrase per chunk, no extra package needed):
183
+ OPENAI_API_KEY=sk-... RAG_SOURCE_ROOTS="/path/to/your/corpus" \
184
+ python -m hitgate.generate --llm --output hitgate/candidates.jsonl
185
+ ```
186
+ Review and curate `hitgate/candidates.jsonl` — delete cases where the query is too vague
187
+ or the expected file is wrong — then use it as your golden set below.
188
+
189
+ **1. Write golden cases** — each is a JSON object with three fields:
190
+ ```jsonl
191
+ {"query": "what handles pagination in the API", "expect_path_contains": "api/pagination.py", "expect_scope": "code"}
192
+ {"query": "where are rate limits configured", "expect_path_contains": "config/limits.yaml", "expect_scope": "code"}
193
+ ```
194
+ `expect_path_contains` is a substring of the expected result's path (file name is usually enough).
195
+ Aim for 20–50 cases across a mix of identifier lookups and paraphrase queries. Save as any `.jsonl`.
196
+
197
+ **2. Run your retriever against the cases:**
198
+ ```bash
199
+ python -m hitgate.run \
200
+ --retriever mypkg.myretriever:retrieve \
201
+ --dataset my_golden.jsonl \
202
+ --label baseline-v1
203
+ # writes hitgate/baseline-v1.json with hit@1/hit@3/hit@5/mrr + per_case breakdown
204
+ ```
205
+
206
+ **3. Freeze the baseline:**
207
+ ```bash
208
+ cp hitgate/baseline-v1.json hitgate/baseline.my-project.json
209
+ # edit _note to record conditions: corpus, model, date
210
+ ```
211
+
212
+ **4. Gate future runs with check.sh:**
213
+ ```bash
214
+ # hitgate/check.sh already reads BASELINE_FILE env var
215
+ BASELINE_FILE=hitgate/baseline.my-project.json \
216
+ RAG_SOURCE_ROOTS="/path/to/your/corpus" \
217
+ python -m hitgate.run --retriever mypkg.myretriever:retrieve --dataset my_golden.jsonl --label ci
218
+ bash hitgate/check.sh hitgate/ci.json hitgate/baseline.my-project.json
219
+ # exits 1 if any metric regresses by more than 5pp
220
+ ```
221
+
222
+ To diff two runs case-by-case: `python -m hitgate.diff hitgate/baseline-v1.json hitgate/ci.json`.
223
+
224
+ ## What to adopt (and what to skip)
225
+
226
+ **Adopt the harness.** The reusable thing here is `hitgate/` — the label-free, regression-gated
227
+ quality check and the `--retriever` interface. The bundled hybrid engine is a reference
228
+ implementation, not the product. What this is **not**:
229
+
230
+ - **Not a framework or a hosted service** — no plugin marketplace, no SaaS. Fork the harness;
231
+ the retriever is swappable by design.
232
+ - **Not state-of-the-art retrieval research** — a pragmatic single-user system that
233
+ knows its own ceiling and stops there.
234
+ - **Not a maintained project** — a solo operator's personal tool, shared for the
235
+ methodology. Issues and PRs are welcome but may not be triaged; expect best-effort,
236
+ no SLA. The eval workflow is an *advisory* gate (it proves the numbers reproduce), not
237
+ a support promise.
238
+
239
+ Other conventional repo furniture — `CONTRIBUTING`, issue templates, a badge wall — is
240
+ **deliberately** omitted, not unfinished. [DECISIONS.md](./DECISIONS.md) records what's
241
+ left out on purpose and the trigger that would reopen each.
242
+
243
+ ## Extending
244
+
245
+ The core indexes code + docs + commits and nothing else, on purpose. Tool-specific
246
+ sources (assistant transcripts, code-graphs, other memory stores) plug in as opt-in
247
+ adapters — see [`adapters/README.md`](./adapters/README.md).
248
+
249
+ ## Where this could go
250
+
251
+ Candidate experiments — each gated on a measured win, none promised — are written up in
252
+ [ROADMAP.md](./ROADMAP.md). They're directions, not commitments.
253
+
254
+ ## License
255
+
256
+ MIT — see [LICENSE](./LICENSE). Use the methodology freely.
@@ -0,0 +1,216 @@
1
+ # hitgate
2
+
3
+ [![eval-gate (advisory)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml/badge.svg)](https://github.com/LucasSantana-Dev/hitgate/actions/workflows/eval.yml)
4
+
5
+ **`pip install hitgate`** installs the harness (dependency-free; measures *any* retriever via `--retriever`).
6
+ **`pip install "hitgate[hybrid]"`** adds the bundled hybrid retriever used in the demo below.
7
+
8
+ > **A pytest-style regression gate for retrieval quality** — plus the small hybrid
9
+ > retriever it was built to measure. Point it at *your* retriever and find out whether a
10
+ > change helped or hurt, when you have **no labeled data and no users to A/B against**.
11
+
12
+ **Status:** working · stable · single-author personal tooling, published for the
13
+ *methodology*. The adoptable part is the **harness**: a label-free, regression-gated quality
14
+ check for any retriever (`--retriever module:callable`). The bundled hybrid engine is just
15
+ the thing it measures.
16
+
17
+ ---
18
+
19
+ ## Why this exists
20
+
21
+ Building RAG is easy; *knowing whether a change made it better or worse* is the hard
22
+ part. With a small corpus and a single user you have none of the production crutches —
23
+ no click logs, no A/B traffic, no annotation budget. This repo is one answer: treat
24
+ retrieval quality as a **measurable, regression-gated property**, like a test suite,
25
+ and be ruthlessly honest about what the numbers do and don't prove.
26
+
27
+ ## Quickstart (reproducible in ~10 seconds)
28
+
29
+ ```bash
30
+ pip install -e ".[hybrid]" # harness core is dependency-free; [hybrid] adds the bundled retriever
31
+
32
+ # Index this repo into a local ./.rag-index/ (the tool indexes itself)
33
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build
34
+
35
+ # Ask it something
36
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.query --scope code "how does the reranker fall back"
37
+
38
+ # Run the eval gate (bundled retriever)
39
+ RAG_RERANK_AUTO=off python -m hitgate.run --label demo
40
+
41
+ # ...or point the SAME gate at YOUR retriever — any callable (query, top, scope) -> [{"path": ...}]
42
+ python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve --label mine
43
+ ```
44
+
45
+ That eval indexes the repo's own source and scores 101 golden cases against it — so
46
+ **you can reproduce the number below yourself**, no private data required.
47
+
48
+ ## Results
49
+
50
+ ### Self-indexed demo (reproducible)
51
+
52
+ | Metric | Value |
53
+ |---|---|
54
+ | **Hit@5** (code scope, pure hybrid) — *the regression-gated headline* | **1.0** |
55
+ | Hit@1 | 0.663 |
56
+ | MRR | 0.800 |
57
+ | Corpus | this repo, self-indexed · 101 cases |
58
+
59
+ 67 of 101 cases hit at rank 1; the misses are left in on purpose. Inflating a benchmark by
60
+ quietly dropping the cases it fails is the first thing this project refuses to do — see
61
+ [DECISIONS.md](./DECISIONS.md); measured before/after deltas are in
62
+ [CHANGELOG.md](./CHANGELOG.md). An honest ablation — where **BM25-only wins Hit@1**
63
+ (0.522) while **hybrid wins Hit@5** (1.0) — is walked through in
64
+ [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
65
+
66
+ Because the demo indexes **this repo itself**, the corpus grows as the repo does, so
67
+ Hit@1 and MRR drift over time — adding a file can demote a borderline case. That's why
68
+ **Hit@5 is the number under regression gate** (`hitgate/check.sh`, ±5pp). The drift is the
69
+ honest behavior of a self-indexing benchmark, not noise swept under a frozen number.
70
+
71
+ ### External corpus benchmarks
72
+
73
+ The same retriever — zero tuning, same `hitgate/run.py` pipeline — measured against 7 other
74
+ codebases with no corpus-specific configuration:
75
+
76
+ | Corpus | Language | n | Hit@5 | Hit@1 | MRR |
77
+ |---|---|---|---|---|---|
78
+ | FastAPI v0.115 | Python | 25 | **1.0** | 0.64 | 0.79 |
79
+ | forge-space / mcp-gateway | TypeScript | 20 | **1.0** | 0.70 | 0.821 |
80
+ | portfolio / src | React/TS | 15 | **1.0** | 0.60 | 0.778 |
81
+ | ai-dev-toolkit / packages/core | Python + TS | 20 | **1.0** | 0.85 | 0.925 |
82
+ | homelab / homelab\_manager | Python | 20 | 0.950 | 0.85 | 0.900 |
83
+ | Lucky / packages/backend | TypeScript | 21 | 0.905 | 0.71 | 0.810 |
84
+ | Criativaria / web-app | Next.js/TS | 27 | 0.741 | 0.59 | 0.660 |
85
+
86
+ Hit@5=1.0 on four of seven corpora. The two lowest-performing corpora have structural
87
+ causes: Lucky has one Category B drift miss (Prometheus registry vs middleware, identical
88
+ vocabulary); Criativaria is a homogeneous Next.js component library where sibling components
89
+ are lexically indistinguishable — a genuine retrieval ceiling, not a tuning problem.
90
+
91
+ The finding that matters: **corpus module clarity predicts Hit@1 better than language or
92
+ size.** Clean functional boundaries (homelab, ADT) → 0.85. Same-layer UI components
93
+ (portfolio, Criativaria) → 0.59–0.60. Python vs TypeScript is not the variable.
94
+
95
+ Full methodology, miss taxonomy, and reproduce commands: [docs/METHODOLOGY.md](./docs/METHODOLOGY.md).
96
+
97
+ ## How it works
98
+
99
+ - **Hybrid retrieval** — dense embeddings (`intfloat/multilingual-e5-small`) + lexical
100
+ BM25, fused with Reciprocal Rank Fusion. A code-aware tokenizer splits identifiers
101
+ into `camelCase`/`snake_case` subtokens so "get user profile" matches `getUserProfile`.
102
+ - **Selective reranking** (optional) — a cross-encoder reranker that, when enabled, is
103
+ scoped to code-scope queries only (it was measured to *help* code and *regress*
104
+ prose), with graceful fallback to the fused ranking if the model isn't present.
105
+ - **Language-aware chunking** — Python by AST symbol, TS/JS/Shell by regex, with a
106
+ word-count fallback.
107
+ - **Config by env var** — zero-setup defaults (`RAG_*`); see [`ragcore/config.py`](./ragcore/config.py).
108
+ - **Eval (the point)** — `hitgate/run.py` reports Hit@K/MRR for *any* retriever via
109
+ `--retriever`; `hitgate/check.sh` gates a run against a frozen baseline (±5pp).
110
+ - **Golden set generator** — `hitgate/generate.py` bootstraps candidate cases from your corpus
111
+ structure (docstrings, symbol names) with zero dependencies. LLM paraphrase generation is
112
+ opt-in via `--llm`. Output feeds directly into `hitgate/run.py --dataset`.
113
+
114
+ ## Use it on your own retriever
115
+
116
+ The harness doesn't care whose retriever it's measuring. A retriever is any callable:
117
+
118
+ ```python
119
+ retrieve(query: str, top: int, scope: str | None) -> Sequence[Mapping]
120
+ # results ranked best-first; each a mapping with at least "path" (optionally "start_line")
121
+ ```
122
+
123
+ Point the gate at yours with `--retriever module.path:callable`:
124
+
125
+ ```bash
126
+ python -m hitgate.run --retriever mypkg.myretriever:retrieve --label mine
127
+ ```
128
+
129
+ A runnable, dependency-free example — a deliberately dumb keyword matcher — is in
130
+ [`hitgate/example_external_retriever.py`](./hitgate/example_external_retriever.py). Ecosystem
131
+ wrappers (LangChain / LlamaIndex) live under [`adapters/`](./adapters/README.md). Bring your
132
+ own retriever and corpus; keep the measurement discipline.
133
+
134
+ ### Bring your own corpus — 4-step quickstart
135
+
136
+ **0. Bootstrap candidate cases from your corpus (optional):**
137
+ ```bash
138
+ RAG_SOURCE_ROOTS="/path/to/your/corpus" python -m hitgate.generate \
139
+ --output hitgate/candidates.jsonl \
140
+ --min-confidence medium
141
+
142
+ # LLM-enhanced (identifier + paraphrase per chunk, no extra package needed):
143
+ OPENAI_API_KEY=sk-... RAG_SOURCE_ROOTS="/path/to/your/corpus" \
144
+ python -m hitgate.generate --llm --output hitgate/candidates.jsonl
145
+ ```
146
+ Review and curate `hitgate/candidates.jsonl` — delete cases where the query is too vague
147
+ or the expected file is wrong — then use it as your golden set below.
148
+
149
+ **1. Write golden cases** — each is a JSON object with three fields:
150
+ ```jsonl
151
+ {"query": "what handles pagination in the API", "expect_path_contains": "api/pagination.py", "expect_scope": "code"}
152
+ {"query": "where are rate limits configured", "expect_path_contains": "config/limits.yaml", "expect_scope": "code"}
153
+ ```
154
+ `expect_path_contains` is a substring of the expected result's path (file name is usually enough).
155
+ Aim for 20–50 cases across a mix of identifier lookups and paraphrase queries. Save as any `.jsonl`.
156
+
157
+ **2. Run your retriever against the cases:**
158
+ ```bash
159
+ python -m hitgate.run \
160
+ --retriever mypkg.myretriever:retrieve \
161
+ --dataset my_golden.jsonl \
162
+ --label baseline-v1
163
+ # writes hitgate/baseline-v1.json with hit@1/hit@3/hit@5/mrr + per_case breakdown
164
+ ```
165
+
166
+ **3. Freeze the baseline:**
167
+ ```bash
168
+ cp hitgate/baseline-v1.json hitgate/baseline.my-project.json
169
+ # edit _note to record conditions: corpus, model, date
170
+ ```
171
+
172
+ **4. Gate future runs with check.sh:**
173
+ ```bash
174
+ # hitgate/check.sh already reads BASELINE_FILE env var
175
+ BASELINE_FILE=hitgate/baseline.my-project.json \
176
+ RAG_SOURCE_ROOTS="/path/to/your/corpus" \
177
+ python -m hitgate.run --retriever mypkg.myretriever:retrieve --dataset my_golden.jsonl --label ci
178
+ bash hitgate/check.sh hitgate/ci.json hitgate/baseline.my-project.json
179
+ # exits 1 if any metric regresses by more than 5pp
180
+ ```
181
+
182
+ To diff two runs case-by-case: `python -m hitgate.diff hitgate/baseline-v1.json hitgate/ci.json`.
183
+
184
+ ## What to adopt (and what to skip)
185
+
186
+ **Adopt the harness.** The reusable thing here is `hitgate/` — the label-free, regression-gated
187
+ quality check and the `--retriever` interface. The bundled hybrid engine is a reference
188
+ implementation, not the product. What this is **not**:
189
+
190
+ - **Not a framework or a hosted service** — no plugin marketplace, no SaaS. Fork the harness;
191
+ the retriever is swappable by design.
192
+ - **Not state-of-the-art retrieval research** — a pragmatic single-user system that
193
+ knows its own ceiling and stops there.
194
+ - **Not a maintained project** — a solo operator's personal tool, shared for the
195
+ methodology. Issues and PRs are welcome but may not be triaged; expect best-effort,
196
+ no SLA. The eval workflow is an *advisory* gate (it proves the numbers reproduce), not
197
+ a support promise.
198
+
199
+ Other conventional repo furniture — `CONTRIBUTING`, issue templates, a badge wall — is
200
+ **deliberately** omitted, not unfinished. [DECISIONS.md](./DECISIONS.md) records what's
201
+ left out on purpose and the trigger that would reopen each.
202
+
203
+ ## Extending
204
+
205
+ The core indexes code + docs + commits and nothing else, on purpose. Tool-specific
206
+ sources (assistant transcripts, code-graphs, other memory stores) plug in as opt-in
207
+ adapters — see [`adapters/README.md`](./adapters/README.md).
208
+
209
+ ## Where this could go
210
+
211
+ Candidate experiments — each gated on a measured win, none promised — are written up in
212
+ [ROADMAP.md](./ROADMAP.md). They're directions, not commitments.
213
+
214
+ ## License
215
+
216
+ MIT — see [LICENSE](./LICENSE). Use the methodology freely.
@@ -0,0 +1,5 @@
1
+ """hitgate — a label-free, regression-gated retrieval-evaluation harness.
2
+
3
+ The harness core is dependency-free; the bundled hybrid retriever it can measure
4
+ lives in `ragcore` and installs with the optional `[hybrid]` extra.
5
+ """
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env python3
2
+ """hitgate/audit_contamination.py — find un-winnable cases in an eval set.
3
+
4
+ The most insidious way a retrieval benchmark lies is *contamination*: a "golden"
5
+ case whose expected answer isn't in the indexed corpus at all. Such a case can
6
+ only ever miss, so it caps the score with a constant penalty that looks like a
7
+ quality floor — and every decision made on that number inherits the lie. (This is
8
+ the audit that moved this project's own baseline ~8pp; see DECISIONS.md.)
9
+
10
+ This script makes that audit reusable. Point it at any eval set (same schema as
11
+ hitgate/golden.demo.jsonl) and an index, and it classifies every case:
12
+
13
+ ok — the expected path is indexed within the case's declared scope
14
+ scope-mismatch — the path is indexed, but only OUTSIDE the declared scope
15
+ CONTAMINATED — the expected path is not in the corpus at all → un-winnable
16
+
17
+ Exit 0 if no contamination, 1 if any case is un-winnable (so it can gate a build).
18
+ scope-mismatch is reported as a warning, not a failure.
19
+
20
+ Usage:
21
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build # build the index first
22
+ RAG_SOURCE_ROOTS="$PWD" python -m hitgate.audit_contamination
23
+ python -m hitgate.audit_contamination --dataset path/to/your.jsonl
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import sqlite3
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ ROOT = Path(__file__).resolve().parent.parent
34
+ from ragcore.config import DB # honors RAG_INDEX_DIR
35
+
36
+ DEFAULT_DATASET = ROOT / "hitgate" / "golden.demo.jsonl"
37
+
38
+
39
def _expected_substrings(case: dict) -> list[str]:
    """Return the usable path substrings that a case expects to find.

    ``expect_path_contains`` may hold a single value or a list; both shapes are
    normalised to a list. Entries that are not strings, or that are empty or
    whitespace-only, are discarded — an empty substring is contained in every path.
    """
    value = case["expect_path_contains"]
    candidates = value if isinstance(value, list) else [value]
    kept = []
    for item in candidates:
        if isinstance(item, str) and item.strip():
            kept.append(item)
    return kept
44
+
45
+
46
def load_cases(path: Path) -> list[dict]:
    """Load the auditable eval cases from a JSONL file.

    Blank lines are ignored. Lines whose JSON value is not an object, or is an
    object without ``expect_path_contains``, are skipped so that files using
    other schemas are tolerated.

    Args:
        path: Location of the JSONL eval set.

    Returns:
        The parsed case dicts, in file order.

    Raises:
        SystemExit: if the file does not exist, a non-blank line is not valid
            JSON, or a case has no non-empty expected substring.
    """
    if not path.exists():
        sys.exit(f"dataset not found: {path}")
    cases = []
    # Decode explicitly as UTF-8 so non-ASCII queries do not depend on the locale,
    # and iterate the file instead of using str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029/NEL, which may appear unescaped inside a JSON string.
    with path.open(encoding="utf-8") as fh:
        for i, line in enumerate(fh, 1):
            if not line.strip():
                continue
            try:
                case = json.loads(line)
            except json.JSONDecodeError as e:
                sys.exit(f"{path}:{i}: invalid JSON — {e}")
            # tolerate other schemas by skipping (including non-object JSON values,
            # which cannot carry an expectation at all)
            if not isinstance(case, dict) or "expect_path_contains" not in case:
                continue
            if not _expected_substrings(case):
                sys.exit(
                    f"{path}:{i}: case {case.get('query', '?')!r} has empty expect_path_contains "
                    f"— a malformed eval case (an empty substring matches every path). Fix the eval set."
                )
            cases.append(case)
    return cases
66
+
67
+
68
def load_corpus(db: Path) -> list[tuple[str, str]]:
    """Return ``(source_type, path)`` for every row of the index's ``chunks`` table.

    Args:
        db: Path to the SQLite index file.

    Returns:
        One ``(source_type, path)`` tuple per indexed chunk.

    Raises:
        SystemExit: if the index file does not exist, or exists but cannot be
            queried (for example it has no ``chunks`` table).
    """
    if not db.exists():
        sys.exit(f"no index at {db} — run ragcore/build.py first")
    conn = sqlite3.connect(db)
    try:
        return conn.execute("SELECT source_type, path FROM chunks").fetchall()
    except sqlite3.Error as e:
        # An unreadable or schema-less index is as unusable as a missing one; exit
        # with a readable message like the other input checks instead of a traceback.
        sys.exit(f"cannot read index at {db} — {e}; rebuild it with ragcore/build.py")
    finally:
        conn.close()
77
+
78
+
79
def classify(case: dict, corpus: list[tuple[str, str]]) -> str:
    """Classify one eval case against the indexed corpus.

    Returns:
        ``"CONTAMINATED"`` when no indexed path contains any expected substring,
        ``"scope-mismatch"`` when matching paths exist but none has a source type
        among the case's declared ``expect_scope`` value(s), and ``"ok"`` otherwise
        (including when the case declares no scope).
    """
    wanted = _expected_substrings(case)  # load_cases has already rejected empty expectations

    # expect_scope may be absent, a single value, or a list of values.
    declared = case.get("expect_scope")
    if isinstance(declared, list):
        allowed = declared
    elif declared:
        allowed = [declared]
    else:
        allowed = []

    # Source types of every indexed chunk whose path contains an expected substring.
    matched_types = [
        source_type
        for source_type, path in corpus
        if any(fragment in path for fragment in wanted)
    ]

    if not matched_types:
        return "CONTAMINATED"
    if allowed and all(source_type not in allowed for source_type in matched_types):
        return "scope-mismatch"
    return "ok"
93
+
94
+
95
def resolve_dataset(arg: str) -> Path:
    """Turn the ``--dataset`` argument into a concrete path.

    An absolute path is returned unchanged, without checking that it exists. A
    relative path is looked up first against the current working directory and
    then against the repo root; the first location that exists is returned. If
    neither exists the process exits with a message naming both locations tried.
    """
    given = Path(arg)
    if given.is_absolute():
        return given

    tried = [Path.cwd() / arg, ROOT / arg]
    found = next((option for option in tried if option.exists()), None)
    if found is not None:
        return found
    sys.exit("dataset not found — tried " + " and ".join(map(str, tried)))
107
+
108
+
109
def main() -> int:
    """Run the contamination audit from the command line and print the report.

    Parses ``--dataset``, loads the eval cases and the indexed corpus, classifies
    every case, then prints the per-verdict counts followed by details for each
    scope-mismatch and CONTAMINATED case.

    Returns:
        1 if at least one case is CONTAMINATED, otherwise 0. scope-mismatch cases
        are reported as warnings and do not affect the return value.

    Raises:
        SystemExit: if the dataset yields no usable cases (the loaders it calls
            may also exit on unusable input).
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--dataset", default=str(DEFAULT_DATASET),
                    help="eval jsonl to audit (absolute, or relative to cwd or repo root)")
    args = ap.parse_args()

    dataset = resolve_dataset(args.dataset)
    cases = load_cases(dataset)
    if not cases:
        sys.exit(f"no usable cases in {dataset}")
    corpus = load_corpus(DB)

    verdicts = {"ok": [], "scope-mismatch": [], "CONTAMINATED": []}
    for case in cases:
        verdicts[classify(case, corpus)].append(case)

    def query_preview(case: dict) -> str:
        # load_cases accepts cases without a "query" field, so the report must not
        # assume one exists; fall back to the same '?' placeholder load_cases uses.
        return str(case.get("query", "?"))[:70]

    n = len(cases)
    print(f"contamination audit: {n} cases vs {len(corpus)} indexed chunks ({DB})")
    print(f" ok: {len(verdicts['ok'])}")
    print(f" scope-mismatch: {len(verdicts['scope-mismatch'])}")
    print(f" CONTAMINATED: {len(verdicts['CONTAMINATED'])}")

    for case in verdicts["scope-mismatch"]:
        print(f"\n ⚠ scope-mismatch: {query_preview(case)!r}")
        print(f" expects {case['expect_path_contains']} in scope={case.get('expect_scope')}, found only out of scope")
    for case in verdicts["CONTAMINATED"]:
        print(f"\n ✗ CONTAMINATED: {query_preview(case)!r}")
        print(f" expects {case['expect_path_contains']} — not in the corpus; this case is un-winnable")

    if verdicts["CONTAMINATED"]:
        print(f"\n{len(verdicts['CONTAMINATED'])} un-winnable case(s) — remove them or fix the corpus before trusting the score.")
        return 1
    print("\n✓ no contamination — every case's answer is in the corpus.")
    return 0
143
+
144
+
145
+ if __name__ == "__main__":
146
+ sys.exit(main())