hitgate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hitgate/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """hitgate — a label-free, regression-gated retrieval-evaluation harness.
2
+
3
+ The harness core is dependency-free; the bundled hybrid retriever it can measure
4
+ lives in `ragcore` and installs with the optional `[hybrid]` extra.
5
+ """
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env python3
2
+ """hitgate/audit_contamination.py — find un-winnable cases in an eval set.
3
+
4
+ The most insidious way a retrieval benchmark lies is *contamination*: a "golden"
5
+ case whose expected answer isn't in the indexed corpus at all. Such a case can
6
+ only ever miss, so it caps the score with a constant penalty that looks like a
7
+ quality floor — and every decision made on that number inherits the lie. (This is
8
+ the audit that moved this project's own baseline ~8pp; see DECISIONS.md.)
9
+
10
+ This script makes that audit reusable. Point it at any eval set (same schema as
11
+ hitgate/golden.demo.jsonl) and an index, and it classifies every case:
12
+
13
+ ok — the expected path is indexed within the case's declared scope
14
+ scope-mismatch — the path is indexed, but only OUTSIDE the declared scope
15
+ CONTAMINATED — the expected path is not in the corpus at all → un-winnable
16
+
17
+ Exit 0 if no contamination, 1 if any case is un-winnable (so it can gate a build).
18
+ scope-mismatch is reported as a warning, not a failure.
19
+
20
+ Usage:
21
+ RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build # build the index first
22
+ RAG_SOURCE_ROOTS="$PWD" python -m hitgate.audit_contamination
23
+ python -m hitgate.audit_contamination --dataset path/to/your.jsonl
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import sqlite3
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ ROOT = Path(__file__).resolve().parent.parent
34
+ from ragcore.config import DB # honors RAG_INDEX_DIR
35
+
36
+ DEFAULT_DATASET = ROOT / "hitgate" / "golden.demo.jsonl"
37
+
38
+
39
def _expected_substrings(case: dict) -> list[str]:
    """Return the usable expected-path substrings declared by an eval case.

    ``expect_path_contains`` may hold a single value or a list of values.
    Entries that are not strings, and strings that are empty or whitespace-only,
    are dropped: an empty substring is contained in every path, so keeping it
    would make the case match anything.

    Raises:
        KeyError: if the case has no ``expect_path_contains`` key.
    """
    declared = case["expect_path_contains"]
    if not isinstance(declared, list):
        declared = [declared]
    kept: list[str] = []
    for entry in declared:
        if isinstance(entry, str) and entry.strip():
            kept.append(entry)
    return kept
44
+
45
+
46
def load_cases(path: Path) -> list[dict]:
    """Load the auditable eval cases from a JSONL dataset.

    Each non-blank line is parsed as one JSON document. Rows that are not JSON
    objects, or that lack an ``expect_path_contains`` key, belong to some other
    schema and are skipped. A row that has the key but yields no usable
    substring is a malformed case and aborts the run.

    Args:
        path: location of the ``.jsonl`` eval set.

    Returns:
        The case dicts, in file order.

    Raises:
        SystemExit: if the file is missing, a line is not valid JSON, or a case
            has an empty ``expect_path_contains``.
    """
    if not path.exists():
        sys.exit(f"dataset not found: {path}")
    cases: list[dict] = []
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    text = path.read_text(encoding="utf-8")
    for i, line in enumerate(text.splitlines(), 1):
        if not line.strip():
            continue
        try:
            case = json.loads(line)
        except json.JSONDecodeError as e:
            sys.exit(f"{path}:{i}: invalid JSON — {e}")
        # A scalar/null row cannot carry the schema (and `in` on it would raise
        # TypeError), so treat it like any other foreign row and skip it.
        if not isinstance(case, dict) or "expect_path_contains" not in case:
            continue
        if not _expected_substrings(case):
            sys.exit(
                f"{path}:{i}: case {case.get('query', '?')!r} has empty expect_path_contains "
                f"— a malformed eval case (an empty substring matches every path). Fix the eval set."
            )
        cases.append(case)
    return cases
66
+
67
+
68
def load_corpus(db: Path) -> list[tuple[str, str]]:
    """Read one ``(source_type, path)`` row per indexed chunk from the index.

    Args:
        db: path of the SQLite index file; it must contain a ``chunks`` table.

    Returns:
        Every row of ``chunks`` as a ``(source_type, path)`` tuple.

    Raises:
        SystemExit: if no file exists at ``db``.
    """
    if db.exists():
        connection = sqlite3.connect(db)
        try:
            cursor = connection.execute("SELECT source_type, path FROM chunks")
            rows = cursor.fetchall()
        finally:
            # Release the handle whether or not the query succeeded.
            connection.close()
        return rows
    sys.exit(f"no index at {db} — run ragcore/build.py first")
77
+
78
+
79
def classify(case: dict, corpus: list[tuple[str, str]]) -> str:
    """Classify one eval case against the indexed corpus.

    Args:
        case: eval case; ``expect_path_contains`` names the expected path
            substring(s) and the optional ``expect_scope`` names one scope or
            a list of scopes.
        corpus: ``(source_type, path)`` pairs for the indexed chunks.

    Returns:
        ``"CONTAMINATED"`` when no indexed path contains any expected
        substring, ``"scope-mismatch"`` when matches exist but none has a
        source type among the declared scopes, and ``"ok"`` otherwise
        (including when no scope is declared).
    """
    wanted = _expected_substrings(case)  # non-empty: load_cases rejects cases where it is not

    declared = case.get("expect_scope")
    if isinstance(declared, list):
        allowed = declared
    elif declared:
        allowed = [declared]
    else:
        allowed = []

    # Source types of every chunk whose path contains at least one expected substring.
    matched_types = [
        source_type
        for source_type, indexed_path in corpus
        if any(fragment in indexed_path for fragment in wanted)
    ]

    if not matched_types:
        return "CONTAMINATED"
    if allowed and all(source_type not in allowed for source_type in matched_types):
        return "scope-mismatch"
    return "ok"
93
+
94
+
95
def resolve_dataset(arg: str) -> Path:
    """Turn the ``--dataset`` argument into a concrete path.

    An absolute path is returned unchanged, without checking that it exists.
    A relative path is tried against the current working directory first and
    the repository root second; the first candidate that exists is returned.

    Raises:
        SystemExit: if neither candidate exists; the message names both.
    """
    requested = Path(arg)
    if requested.is_absolute():
        return requested

    tried = [Path.cwd() / arg, ROOT / arg]
    found = next((option for option in tried if option.exists()), None)
    if found is not None:
        return found
    sys.exit("dataset not found — tried " + " and ".join(str(c) for c in tried))
107
+
108
+
109
def main() -> int:
    """Run the contamination audit from the command line.

    Parses ``--dataset``, loads the eval cases and the indexed corpus,
    classifies every case, then prints a summary followed by one entry per
    scope-mismatched or contaminated case.

    Returns:
        1 if at least one case is CONTAMINATED, otherwise 0. Scope mismatches
        are printed as warnings and do not change the status.

    Raises:
        SystemExit: if the dataset yields no usable cases (and from the
            loaders, for a missing dataset or index).
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--dataset", default=str(DEFAULT_DATASET),
                    help="eval jsonl to audit (absolute, or relative to cwd or repo root)")
    args = ap.parse_args()

    dataset = resolve_dataset(args.dataset)
    cases = load_cases(dataset)
    if not cases:
        sys.exit(f"no usable cases in {dataset}")
    corpus = load_corpus(DB)

    verdicts = {"ok": [], "scope-mismatch": [], "CONTAMINATED": []}
    for case in cases:
        verdicts[classify(case, corpus)].append(case)

    def query_label(case: dict) -> str:
        # load_cases only requires expect_path_contains, so a case may have no
        # "query" key; fall back to '?' (as load_cases does) instead of raising.
        return repr(str(case.get("query", "?"))[:70])

    n = len(cases)
    print(f"contamination audit: {n} cases vs {len(corpus)} indexed chunks ({DB})")
    print(f" ok: {len(verdicts['ok'])}")
    print(f" scope-mismatch: {len(verdicts['scope-mismatch'])}")
    print(f" CONTAMINATED: {len(verdicts['CONTAMINATED'])}")

    for case in verdicts["scope-mismatch"]:
        print(f"\n ⚠ scope-mismatch: {query_label(case)}")
        print(f" expects {case['expect_path_contains']} in scope={case.get('expect_scope')}, found only out of scope")
    for case in verdicts["CONTAMINATED"]:
        print(f"\n ✗ CONTAMINATED: {query_label(case)}")
        print(f" expects {case['expect_path_contains']} — not in the corpus; this case is un-winnable")

    if verdicts["CONTAMINATED"]:
        print(f"\n{len(verdicts['CONTAMINATED'])} un-winnable case(s) — remove them or fix the corpus before trusting the score.")
        return 1
    print("\n✓ no contamination — every case's answer is in the corpus.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
hitgate/compare.py ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env python3
2
+ """compare.py — compare two eval result JSON files and emit a structured verdict.
3
+
4
+ Usage:
5
+ python hitgate/compare.py <current.json> <baseline.json> [tol_pp=5]
6
+
7
+ Prints the human-readable delta table to stdout (same format as before).
8
+ Writes <current>.verdict.json alongside the current result.
9
+ Exits 0 if verdict is pass or improvement, 1 if regression.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+
18
def compare(cur_j: dict, base_j: dict, tol_pp: float = 5.0) -> dict:
    """Compare a current eval result against a baseline and build a verdict.

    Pure function: it reads the two result dicts and does not touch the
    filesystem.

    Args:
        cur_j: current result; must contain ``mrr``, ``hit@1``, ``hit@3`` and
            ``hit@5``, and may contain ``by_intent`` (intent -> metrics dict).
        base_j: baseline result with the same layout.
        tol_pp: tolerance in percentage points. A delta must exceed it in
            magnitude to count as a regression or an improvement.

    Returns:
        A dict with ``verdict`` (``"regression"``, ``"improvement"`` or
        ``"pass"``), ``gated_metric``, ``tolerance_pp``, ``regressions``,
        ``improvements``, ``deltas`` (aggregate deltas rounded to 4 places)
        and ``refreeze_recommended``.

    Raises:
        KeyError: if an aggregate metric is missing from either result.
    """
    tol = tol_pp / 100.0
    metrics = ("mrr", "hit@1", "hit@3", "hit@5")

    regressions: list[dict] = []
    improvements: list[dict] = []
    deltas: dict[str, float] = {}

    def record(metric: str, scope: str, delta: float) -> None:
        # Only movements strictly beyond the tolerance band are reported.
        if delta < -tol:
            regressions.append({"metric": metric, "scope": scope, "delta": delta})
        elif delta > tol:
            improvements.append({"metric": metric, "scope": scope, "delta": delta})

    for m in metrics:
        delta = round(cur_j[m] - base_j[m], 4)
        deltas[m] = delta
        record(m, "aggregate", delta)

    base_intent = base_j.get("by_intent", {})
    cur_intent = cur_j.get("by_intent", {})
    for intent in sorted(base_intent):
        if intent not in cur_intent:
            continue
        b5 = base_intent[intent].get("hit@5")
        c5 = cur_intent[intent].get("hit@5")
        if b5 is None or c5 is None:
            continue
        record("hit@5", f"intent:{intent}", round(c5 - b5, 4))

    hit5_improved = any(
        r["metric"] == "hit@5" and r["scope"] == "aggregate" for r in improvements
    )
    # Never recommend re-freezing while something regressed: that would bake the
    # regression into the new baseline.
    refreeze = hit5_improved and not regressions

    if regressions:
        verdict = "regression"
    elif refreeze:
        verdict = "improvement"
    else:
        verdict = "pass"

    return {
        "verdict": verdict,
        "gated_metric": "hit@5",
        "tolerance_pp": tol_pp,
        "regressions": regressions,
        "improvements": improvements,
        "deltas": deltas,
        "refreeze_recommended": refreeze,
    }
71
+
72
+
73
def print_table(cur_j: dict, base_j: dict, v: dict, tol_pp: float) -> None:
    """Print the human-readable delta table for a comparison to stdout.

    Args:
        cur_j: current result dict (aggregate metrics, optional ``by_intent``).
        base_j: baseline result dict with the same layout.
        v: verdict dict; ``regressions`` and ``refreeze_recommended`` are read.
        tol_pp: tolerance in percentage points, echoed in the section headers.

    Output is one row per aggregate metric, then (when both results carry
    ``by_intent``) one Hit@5 row per intent present in both, then a closing
    line: the list of regressed items, or a "within tolerance" confirmation.
    """

    def direction(change: float) -> str:
        if change > 0:
            return "↑"
        if change < 0:
            return "↓"
        return "·"

    failed = v["regressions"]
    failed_aggregate = [r["metric"] for r in failed if r["scope"] == "aggregate"]
    failed_scopes = [r["scope"] for r in failed]

    print(f"\nDelta vs baseline (tolerance ±{tol_pp}pp):")
    for metric in ("mrr", "hit@1", "hit@3", "hit@5"):
        before, after = base_j[metric], cur_j[metric]
        change = after - before
        arrow = direction(change)
        flag = " ⚠ REGRESSION" if metric in failed_aggregate else ""
        # NOTE(review): the magnitude is always printed with a leading '+', even
        # next to a down arrow; kept as-is to preserve the existing output format.
        print(f" {metric:<8} {before:.3f} → {after:.3f} {arrow}{abs(change):+.3f}{flag}")

    base_intent = base_j.get("by_intent", {})
    cur_intent = cur_j.get("by_intent", {})
    if base_intent and cur_intent:
        print(f"\nPer-intent Hit@5 (tolerance ±{tol_pp}pp):")
        for intent in sorted(base_intent):
            if intent not in cur_intent:
                continue
            b5 = base_intent[intent].get("hit@5")
            c5 = cur_intent[intent].get("hit@5")
            if b5 is None or c5 is None:
                continue
            change = c5 - b5
            n = cur_intent[intent].get("n", "?")
            arrow = direction(change)
            flag = " ⚠ REGRESSION" if f"intent:{intent}" in failed_scopes else ""
            print(f" {intent:<16} n={n} {b5:.3f} → {c5:.3f} {arrow}{abs(change):+.3f}{flag}")

    if failed:
        # Aggregate regressions are named by metric, per-intent ones by scope.
        names = []
        for r in failed:
            names.append(r["metric"] if r["scope"] == "aggregate" else r["scope"])
        print(f"\nRegressed: {', '.join(names)} — investigate before shipping retrieval changes.")
    elif v["refreeze_recommended"]:
        print("\n✓ within tolerance. Hit@5 improved — consider re-freezing the baseline.")
    else:
        print("\n✓ within tolerance.")
121
+
122
+
123
def main() -> int:
    """Command-line entry point: compare two result files and write the verdict.

    Reads ``<current.json>`` and ``<baseline.json>`` (plus an optional
    tolerance in percentage points, default 5) from ``sys.argv``, prints the
    delta table, and writes ``<current stem>.verdict.json`` next to the
    current result file.

    Returns:
        1 if the verdict is ``"regression"``, otherwise 0.

    Raises:
        SystemExit: on missing arguments or an unusable tolerance value.
    """
    usage = f"Usage: {sys.argv[0]} <current.json> <baseline.json> [tol_pp=5]"
    if len(sys.argv) < 3:
        sys.exit(usage)

    cur_path = Path(sys.argv[1])
    base_path = Path(sys.argv[2])
    try:
        tol_pp = float(sys.argv[3]) if len(sys.argv) > 3 else 5.0
    except ValueError:
        sys.exit(f"tol_pp must be a number, got {sys.argv[3]!r}\n{usage}")
    # `not >=` also rejects NaN, which would silently disable every comparison.
    if not tol_pp >= 0:
        sys.exit(f"tol_pp must be a non-negative number, got {sys.argv[3]!r}\n{usage}")

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    cur_j = json.loads(cur_path.read_text(encoding="utf-8"))
    base_j = json.loads(base_path.read_text(encoding="utf-8"))

    v = compare(cur_j, base_j, tol_pp)
    print_table(cur_j, base_j, v, tol_pp)

    # e.g. results/head.json -> results/head.verdict.json
    verdict_path = cur_path.parent / (cur_path.stem + ".verdict.json")
    verdict_path.write_text(json.dumps(v, indent=2), encoding="utf-8")

    return 1 if v["verdict"] == "regression" else 0


if __name__ == "__main__":
    sys.exit(main())
hitgate/diff.py ADDED
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ """Compare two eval result JSON files and report per-case rank changes.
3
+
4
+ Usage:
5
+ python -m hitgate.diff hitgate/baseline.example.json hitgate/head.json
6
+ python -m hitgate.diff hitgate/A.json hitgate/B.json --quiet # summary only
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def load(path: Path) -> dict:
    """Parse the eval result JSON file at ``path`` and return its content.

    The file is decoded as UTF-8 (the JSON interchange encoding) rather than
    the platform's locale encoding, so non-ASCII queries load identically on
    every system.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
18
+
19
+
20
def rank_label(r: int | None) -> str:
    """Format a hit rank for display: ``"MISS"`` for ``None``, else ``"#<rank>"``."""
    if r is None:
        return "MISS"
    return f"#{r}"
22
+
23
+
24
def main() -> int:
    """Diff two eval result files case by case and print a summary.

    Cases are matched by their ``query`` string. Each matched case is put in
    one of four buckets by comparing ``hit_rank`` (lower is better, ``None``
    means a miss): regressed, improved, stable at rank 1, or stable elsewhere.
    Cases present in only one file are listed separately and are not part of
    the verdict.

    Returns:
        1 if any matched case regressed, otherwise 0.
    """
    ap = argparse.ArgumentParser(description="Diff two eval result JSON files")
    ap.add_argument("baseline", help="baseline result JSON (before)")
    ap.add_argument("head", help="head result JSON (after)")
    ap.add_argument("--quiet", action="store_true", help="summary line only, no per-case detail")
    args = ap.parse_args()

    base = load(Path(args.baseline))
    head = load(Path(args.head))

    base_cases = {c["query"]: c for c in base.get("per_case", [])}
    head_cases = {c["query"]: c for c in head.get("per_case", [])}

    unmatched = [q for q in head_cases if q not in base_cases]
    # Baseline cases that vanished from head. Reported explicitly so that
    # deleting a failing case cannot pass silently as "no change".
    dropped = [q for q in base_cases if q not in head_cases]

    def rank_val(r):
        # Lower rank number = better. None (MISS) sorts after every real rank.
        return float("inf") if r is None else r

    regressed, improved, stable_1, stable_other = [], [], [], []

    for q, bc in base_cases.items():
        hc = head_cases.get(q)
        if hc is None:
            continue
        br, hr = bc["hit_rank"], hc["hit_rank"]
        if rank_val(hr) > rank_val(br):
            regressed.append((q, bc, hc))
        elif rank_val(hr) < rank_val(br):
            improved.append((q, bc, hc))
        elif hr == 1:
            stable_1.append((q, bc, hc))
        else:
            stable_other.append((q, bc, hc))

    total_stable = len(stable_1) + len(stable_other)

    def fmt_case(q, bc, hc):
        intent = bc.get("intent", "?")
        move = f"{rank_label(bc['hit_rank'])}→{rank_label(hc['hit_rank'])}"
        # Mention the new top hit only when it differs from the baseline's.
        top_changed = bc.get("top_hit") != hc.get("top_hit")
        top_note = f" top: {hc.get('top_hit', '?')}" if top_changed else ""
        return f" [{intent:14}] {move:10} {q[:70]}{top_note}"

    if not args.quiet:
        if regressed:
            print(f"\nREGRESSED ({len(regressed)}):")
            for item in regressed:
                print(fmt_case(*item))

        if improved:
            print(f"\nIMPROVED ({len(improved)}):")
            for item in improved:
                print(fmt_case(*item))

        # stable_other also holds MISS→MISS cases, so it is not just "rank 2-5".
        print(f"\nSTABLE ({total_stable} cases: {len(stable_1)} at rank 1, {len(stable_other)} at a lower rank or MISS)")

        if unmatched:
            print(f"\nNEW in head ({len(unmatched)} cases not in baseline):")
            for q in unmatched:
                hc = head_cases[q]
                print(f" [{hc.get('intent','?'):14}] {rank_label(hc['hit_rank']):10} {q[:70]}")

        if dropped:
            print(f"\nMISSING from head ({len(dropped)} baseline cases not in head):")
            for q in dropped:
                bc = base_cases[q]
                print(f" [{bc.get('intent','?'):14}] {rank_label(bc['hit_rank']):10} {q[:70]}")

    def metric_delta(key: str) -> str:
        b, h = base.get(key, 0.0), head.get(key, 0.0)
        d = h - b
        sign = f"+{d:.3f}" if d >= 0 else f"{d:.3f}"
        return f"{sign} ({b} → {h})"

    if regressed and improved:
        verdict = "MIXED"
    elif regressed:
        verdict = "REGRESSION"
    elif improved:
        verdict = "IMPROVEMENT"
    else:
        verdict = "IDENTICAL"

    print(f"\n{'─'*60}")
    print(f" Δ hit@1 : {metric_delta('hit@1')}")
    print(f" Δ hit@3 : {metric_delta('hit@3')}")
    print(f" Δ hit@5 : {metric_delta('hit@5')}")
    print(f" Δ mrr : {metric_delta('mrr')}")
    print(f" verdict : {verdict} ({len(improved)} improved, {len(regressed)} regressed, {total_stable} stable)")
    if dropped:
        print(f" dropped : {len(dropped)} baseline case(s) missing from head")
    print(f"{'─'*60}")

    return 1 if regressed else 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python3
2
+ """A minimal, dependency-free example retriever — the template for "bring your own".
3
+
4
+ It implements the harness protocol: retrieve(query, top, scope) -> list[dict].
5
+ This one is deliberately dumb (ranks files by how many distinct query words they contain —
6
+ no embeddings, no index) to prove the harness measures ANY retriever, not just the bundled
7
+ hybrid one. Score it through the exact same eval:
8
+
9
+ RAG_SOURCE_ROOTS="$PWD" python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve
10
+
11
+ To wire your own retriever, copy this signature and return results ranked best-first, each a
12
+ mapping with a "path" (and optionally "start_line"). See adapters/ for ecosystem wrappers.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import re
18
+ from pathlib import Path
19
+
20
+ _WORD = re.compile(r"[A-Za-z][A-Za-z0-9_]+")
21
+ _CODE_EXTS = {".py", ".ts", ".tsx", ".js", ".jsx", ".mjs", ".sh"}
22
+ _SKIP_PARTS = {".git", "node_modules", "venv", ".venv", ".rag-index", "__pycache__", "dist", "build"}
23
+
24
+
25
def _roots() -> list[Path]:
    """Return the directories to search.

    Reads ``RAG_SOURCE_ROOTS`` as an ``os.pathsep``-separated list, ignoring
    empty entries. When the variable is unset or names no directory, the
    current working directory is used instead.
    """
    configured = os.environ.get("RAG_SOURCE_ROOTS", "")
    roots = [Path(entry) for entry in configured.split(os.pathsep) if entry]
    if roots:
        return roots
    return [Path.cwd()]
28
+
29
+
30
def retrieve(query: str, top: int, scope: str | None = None) -> list[dict]:
    """Rank code files by how many distinct query terms they contain.

    Args:
        query: free-text query; words matching ``_WORD`` are lower-cased and
            used as search terms.
        top: maximum number of results to return.
        scope: accepted for protocol compatibility; this retriever ignores it.

    Returns:
        At most ``top`` dicts, best match first, each with ``path`` and
        ``start_line`` (always 1). Files tied on score are ordered by path so
        the ranking is reproducible. Empty when the query yields no terms or
        ``top`` is not positive.
    """
    terms = {w.lower() for w in _WORD.findall(query)}
    if not terms or top <= 0:
        # Nothing can score, so don't walk and read the whole tree for nothing.
        return []

    scored: list[tuple[int, str]] = []
    for root in _roots():
        for path in root.rglob("*"):
            if path.suffix not in _CODE_EXTS:
                continue
            # Judge only the part of the path below the root: a checkout that
            # lives under e.g. /home/me/build/ must not have every file skipped.
            if _SKIP_PARTS.intersection(path.relative_to(root).parts):
                continue
            if not path.is_file():
                continue
            try:
                text = path.read_text(encoding="utf-8", errors="ignore").lower()
            except OSError:
                # Best effort: an unreadable file just doesn't take part.
                continue
            score = sum(1 for t in terms if t in text)
            if score:
                scored.append((score, str(path)))

    # Highest score first; break ties by path instead of by filesystem walk
    # order, which differs between machines.
    scored.sort(key=lambda s: (-s[0], s[1]))
    return [{"path": p, "start_line": 1} for _, p in scored[:top]]