PyPI - hitgate - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hitgate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

hitgate/__init__.py +5 -0
hitgate/audit_contamination.py +146 -0
hitgate/compare.py +144 -0
hitgate/diff.py +109 -0
hitgate/example_external_retriever.py +48 -0
hitgate/generate.py +438 -0
hitgate/plot_history.py +147 -0
hitgate/run.py +166 -0
hitgate/test_determinism.py +68 -0
hitgate-0.1.0.dist-info/METADATA +256 -0
hitgate-0.1.0.dist-info/RECORD +23 -0
hitgate-0.1.0.dist-info/WHEEL +5 -0
hitgate-0.1.0.dist-info/entry_points.txt +7 -0
hitgate-0.1.0.dist-info/licenses/LICENSE +21 -0
hitgate-0.1.0.dist-info/top_level.txt +2 -0
ragcore/__init__.py +1 -0
ragcore/build.py +353 -0
ragcore/chunkers.py +180 -0
ragcore/config.py +92 -0
ragcore/mcp_server.py +142 -0
ragcore/pack.py +121 -0
ragcore/query.py +92 -0
ragcore/retrieval.py +359 -0

hitgate/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""hitgate — a label-free, regression-gated retrieval-evaluation harness.
+The harness core is dependency-free; the bundled hybrid retriever it can measure
+lives in `ragcore` and installs with the optional `[hybrid]` extra.
+"""

hitgate/audit_contamination.py ADDED Viewed

@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""hitgate/audit_contamination.py — find un-winnable cases in an eval set.
+The most insidious way a retrieval benchmark lies is *contamination*: a "golden"
+case whose expected answer isn't in the indexed corpus at all. Such a case can
+only ever miss, so it caps the score with a constant penalty that looks like a
+quality floor — and every decision made on that number inherits the lie. (This is
+the audit that moved this project's own baseline ~8pp; see DECISIONS.md.)
+This script makes that audit reusable. Point it at any eval set (same schema as
+hitgate/golden.demo.jsonl) and an index, and it classifies every case:
+  ok             — the expected path is indexed within the case's declared scope
+  scope-mismatch — the path is indexed, but only OUTSIDE the declared scope
+  CONTAMINATED   — the expected path is not in the corpus at all → un-winnable
+Exit 0 if no contamination, 1 if any case is un-winnable (so it can gate a build).
+scope-mismatch is reported as a warning, not a failure.
+Usage:
+  RAG_SOURCE_ROOTS="$PWD" python -m ragcore.build          # build the index first
+  RAG_SOURCE_ROOTS="$PWD" python -m hitgate.audit_contamination
+  python -m hitgate.audit_contamination --dataset path/to/your.jsonl
+"""
+from __future__ import annotations
+import argparse
+import json
+import sqlite3
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parent.parent
+from ragcore.config import DB  # honors RAG_INDEX_DIR
+DEFAULT_DATASET = ROOT / "hitgate" / "golden.demo.jsonl"
+def _expected_substrings(case: dict) -> list[str]:
+    """Non-empty expected path substrings for a case (empties dropped — '' matches every path)."""
+    raw = case["expect_path_contains"]
+    raw = raw if isinstance(raw, list) else [raw]
+    return [e for e in raw if isinstance(e, str) and e.strip()]
def load_cases(path: Path) -> list[dict]:
    """Load the auditable cases from a JSONL eval set.

    Blank lines are ignored. Lines whose JSON value is not an object, or that have
    no ``expect_path_contains`` key, are skipped so files in other schemas are
    tolerated.

    Exits the process (``sys.exit`` with a message) when the file does not exist,
    when a line is not valid JSON, or when a case declares no usable expected
    substring.
    """
    if not path.exists():
        sys.exit(f"dataset not found: {path}")
    cases = []
    # Read as UTF-8 explicitly rather than with the platform locale encoding, and
    # iterate the file handle instead of str.splitlines(): splitlines() also breaks
    # on U+2028/U+2029/U+0085, which may appear unescaped inside a JSON string and
    # would cut a valid record in two.
    with path.open(encoding="utf-8") as handle:
        for i, line in enumerate(handle, 1):
            if not line.strip():
                continue
            try:
                case = json.loads(line)
            except json.JSONDecodeError as e:
                sys.exit(f"{path}:{i}: invalid JSON — {e}")
            # A bare number or null is valid JSON but not a case, and the `in` test
            # below would raise TypeError on it — treat it like any other schema.
            if not isinstance(case, dict) or "expect_path_contains" not in case:
                continue
            if not _expected_substrings(case):
                sys.exit(
                    f"{path}:{i}: case {case.get('query', '?')!r} has empty expect_path_contains "
                    f"— a malformed eval case (an empty substring matches every path). Fix the eval set."
                )
            cases.append(case)
    return cases
def load_corpus(db: Path) -> list[tuple[str, str]]:
    """Return ``(source_type, path)`` for every indexed chunk.

    Exits the process with a message (``sys.exit``) when the index file is missing
    or cannot be queried — for example when it is not an SQLite database or has no
    ``chunks`` table — so the audit reports the problem instead of a traceback.
    """
    if not db.exists():
        sys.exit(f"no index at {db} — run ragcore/build.py first")
    try:
        conn = sqlite3.connect(db)
        try:
            return conn.execute("SELECT source_type, path FROM chunks").fetchall()
        finally:
            # Always release the handle, whether or not the query succeeded.
            conn.close()
    except sqlite3.Error as e:
        sys.exit(f"cannot read index at {db} — {e}")
def classify(case: dict, corpus: list[tuple[str, str]]) -> str:
    """Label one eval case as "ok", "scope-mismatch" or "CONTAMINATED".

    A corpus entry matches when its path contains any of the case's expected
    substrings. No matching entry at all means "CONTAMINATED". If the case
    declares ``expect_scope`` (a single value or a list) and none of the matching
    entries has a source type in that scope, the verdict is "scope-mismatch".
    Everything else is "ok".
    """
    wanted = _expected_substrings(case)  # load_cases already rejected empty lists
    declared = case.get("expect_scope")
    if isinstance(declared, list):
        allowed = declared
    elif declared:
        allowed = [declared]
    else:
        allowed = []

    # Source types of every indexed chunk whose path contains an expected substring.
    matching_types = [
        source_type
        for source_type, chunk_path in corpus
        if any(fragment in chunk_path for fragment in wanted)
    ]
    if not matching_types:
        return "CONTAMINATED"
    if allowed and not any(source_type in allowed for source_type in matching_types):
        return "scope-mismatch"
    return "ok"
def resolve_dataset(arg: str) -> Path:
    """Turn the ``--dataset`` argument into a path.

    An absolute path is returned unchanged, without checking that it exists. A
    relative one is looked up first under the current working directory and then
    under the repository root; the first existing candidate wins. When neither
    exists the process exits with a message naming both locations tried.
    """
    given = Path(arg)
    if given.is_absolute():
        return given
    tried = [Path.cwd() / arg, ROOT / arg]
    found = next((option for option in tried if option.exists()), None)
    if found is None:
        sys.exit("dataset not found — tried " + " and ".join(str(c) for c in tried))
    return found
def main() -> int:
    """Run the contamination audit and print a per-verdict report.

    Returns the process exit status: 1 when at least one case is CONTAMINATED,
    0 otherwise. scope-mismatch cases are printed as warnings and do not affect
    the status.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--dataset", default=str(DEFAULT_DATASET),
                    help="eval jsonl to audit (absolute, or relative to cwd or repo root)")
    args = ap.parse_args()
    dataset = resolve_dataset(args.dataset)
    cases = load_cases(dataset)
    if not cases:
        sys.exit(f"no usable cases in {dataset}")
    corpus = load_corpus(DB)

    def preview(case: dict) -> str:
        # load_cases() does not require a "query" key (it reports such cases as
        # '?'), so the report must not assume the key exists or holds a string.
        return str(case.get("query", "?"))[:70]

    verdicts = {"ok": [], "scope-mismatch": [], "CONTAMINATED": []}
    for case in cases:
        verdicts[classify(case, corpus)].append(case)

    n = len(cases)
    print(f"contamination audit: {n} cases vs {len(corpus)} indexed chunks ({DB})")
    print(f"  ok:             {len(verdicts['ok'])}")
    print(f"  scope-mismatch: {len(verdicts['scope-mismatch'])}")
    print(f"  CONTAMINATED:   {len(verdicts['CONTAMINATED'])}")

    for case in verdicts["scope-mismatch"]:
        print(f"\n  ⚠ scope-mismatch: {preview(case)!r}")
        print(f"      expects {case['expect_path_contains']} in scope={case.get('expect_scope')}, found only out of scope")
    for case in verdicts["CONTAMINATED"]:
        print(f"\n  ✗ CONTAMINATED: {preview(case)!r}")
        print(f"      expects {case['expect_path_contains']} — not in the corpus; this case is un-winnable")

    if verdicts["CONTAMINATED"]:
        print(f"\n{len(verdicts['CONTAMINATED'])} un-winnable case(s) — remove them or fix the corpus before trusting the score.")
        return 1
    print("\n✓ no contamination — every case's answer is in the corpus.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

hitgate/compare.py ADDED Viewed

@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""compare.py — compare two eval result JSON files and emit a structured verdict.
+Usage:
+    python hitgate/compare.py <current.json> <baseline.json> [tol_pp=5]
+Prints the human-readable delta table to stdout (same format as before).
+Writes <current>.verdict.json alongside the current result.
+Exits 0 if verdict is pass or improvement, 1 if regression.
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
def compare(cur_j: dict, base_j: dict, tol_pp: float = 5.0) -> dict:
    """Compare a current eval result against a baseline and build the verdict.

    Pure function: no filesystem access. ``tol_pp`` is the tolerance in percentage
    points. Each aggregate metric (mrr, hit@1, hit@3, hit@5) and each per-intent
    hit@5 present in both results is turned into a delta rounded to 4 decimals.
    A delta below ``-tol`` is a regression, a delta above ``+tol`` an improvement.

    The verdict is "regression" if anything regressed, "improvement" if the
    aggregate hit@5 improved beyond tolerance, and "pass" otherwise.
    """
    tol = tol_pp / 100.0
    regressions: list[dict] = []
    improvements: list[dict] = []

    def file_under(metric: str, scope: str, delta: float) -> None:
        # Deltas inside the tolerance band are recorded in neither list.
        entry = {"metric": metric, "scope": scope, "delta": delta}
        if delta < -tol:
            regressions.append(entry)
        elif delta > tol:
            improvements.append(entry)

    deltas: dict[str, float] = {}
    for metric in ("mrr", "hit@1", "hit@3", "hit@5"):
        deltas[metric] = round(cur_j[metric] - base_j[metric], 4)
        file_under(metric, "aggregate", deltas[metric])

    base_intent = base_j.get("by_intent", {})
    cur_intent = cur_j.get("by_intent", {})
    for intent in sorted(base_intent):
        if intent not in cur_intent:
            continue
        before = base_intent[intent].get("hit@5")
        after = cur_intent[intent].get("hit@5")
        if before is None or after is None:
            continue
        file_under("hit@5", f"intent:{intent}", round(after - before, 4))

    # Re-freezing the baseline is suggested only for an aggregate hit@5 gain.
    refreeze = False
    for entry in improvements:
        if entry["metric"] == "hit@5" and entry["scope"] == "aggregate":
            refreeze = True
            break

    if regressions:
        verdict = "regression"
    else:
        verdict = "improvement" if refreeze else "pass"

    return {
        "verdict": verdict,
        "gated_metric": "hit@5",
        "tolerance_pp": tol_pp,
        "regressions": regressions,
        "improvements": improvements,
        "deltas": deltas,
        "refreeze_recommended": refreeze,
    }
def print_table(cur_j: dict, base_j: dict, v: dict, tol_pp: float) -> None:
    """Print the human-readable delta report for a verdict built by compare().

    Prints one row per aggregate metric, then (when both results carry
    ``by_intent``) one row per intent present in both with a hit@5 value, and
    finally a closing line: the regressed names, or a within-tolerance note.

    The change column shows the direction as an arrow followed by the magnitude;
    the magnitude is formatted with a forced sign, so it always reads ``+x.xxx``.
    """
    def direction(change: float) -> str:
        if change > 0:
            return "↑"
        return "↓" if change < 0 else "·"

    def warning(matches) -> str:
        # v["regressions"] is read at the point of use, once per printed row.
        return "  ⚠ REGRESSION" if any(matches(r) for r in v["regressions"]) else ""

    print(f"\nDelta vs baseline (tolerance ±{tol_pp}pp):")
    for metric in ("mrr", "hit@1", "hit@3", "hit@5"):
        after, before = cur_j[metric], base_j[metric]
        change = after - before
        note = warning(lambda r: r["metric"] == metric and r["scope"] == "aggregate")
        print(f"  {metric:<8} {before:.3f} → {after:.3f}  {direction(change)}{abs(change):+.3f}{note}")

    base_intent = base_j.get("by_intent", {})
    cur_intent = cur_j.get("by_intent", {})
    if base_intent and cur_intent:
        print(f"\nPer-intent Hit@5 (tolerance ±{tol_pp}pp):")
        for intent in sorted(base_intent):
            if intent not in cur_intent:
                continue
            b5 = base_intent[intent].get("hit@5")
            c5 = cur_intent[intent].get("hit@5")
            if b5 is None or c5 is None:
                continue
            change = c5 - b5
            n = cur_intent[intent].get("n", "?")
            scope = f"intent:{intent}"
            note = warning(lambda r: r["scope"] == scope)
            print(f"  {intent:<16} n={n}  {b5:.3f} → {c5:.3f}  {direction(change)}{abs(change):+.3f}{note}")

    if v["regressions"]:
        # Aggregate regressions are named by metric, per-intent ones by scope.
        names = []
        for r in v["regressions"]:
            names.append(r["metric"] if r["scope"] == "aggregate" else r["scope"])
        print(f"\nRegressed: {', '.join(names)} — investigate before shipping retrieval changes.")
    elif v["refreeze_recommended"]:
        print("\n✓ within tolerance. Hit@5 improved — consider re-freezing the baseline.")
    else:
        print("\n✓ within tolerance.")
def main() -> int:
    """Command-line entry point: compare two result files and gate on the verdict.

    Reads ``<current.json>`` and ``<baseline.json>`` (plus an optional tolerance in
    percentage points, default 5) from ``sys.argv``, prints the delta table, and
    writes the verdict as ``<stem>.verdict.json`` in the directory of the current
    result. Returns 1 for a "regression" verdict and 0 otherwise.
    """
    argv = sys.argv
    if len(argv) < 3:
        sys.exit(f"Usage: {argv[0]} <current.json> <baseline.json> [tol_pp=5]")

    cur_path, base_path = Path(argv[1]), Path(argv[2])
    tol_pp = 5.0 if len(argv) <= 3 else float(argv[3])

    cur_j = json.loads(cur_path.read_text())
    base_j = json.loads(base_path.read_text())

    v = compare(cur_j, base_j, tol_pp)
    print_table(cur_j, base_j, v, tol_pp)

    # The verdict file is a sibling of the current result, named after its stem.
    verdict_path = cur_path.parent / f"{cur_path.stem}.verdict.json"
    verdict_path.write_text(json.dumps(v, indent=2))

    return int(v["verdict"] == "regression")


if __name__ == "__main__":
    sys.exit(main())

hitgate/diff.py ADDED Viewed

@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Compare two eval result JSON files and report per-case rank changes.
+Usage:
+  python -m hitgate.diff hitgate/baseline.example.json hitgate/head.json
+  python -m hitgate.diff hitgate/A.json hitgate/B.json --quiet   # summary only
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
def load(path: Path) -> dict:
    """Parse the JSON document stored at *path* and return the decoded value."""
    with path.open() as handle:
        return json.load(handle)
def rank_label(r: int | None) -> str:
    """Render a hit rank for display: ``#<rank>``, or ``MISS`` when there is none."""
    if r is None:
        return "MISS"
    return "#" + str(r)
def main() -> int:
    """Diff two eval result files case by case and print a summary.

    Cases are matched by their ``query`` string. Each case present in both files
    is classified as regressed, improved or stable by comparing ``hit_rank``
    (lower is better, ``None`` means MISS and is worse than any rank). Cases that
    exist in only one file are listed separately and do not affect the verdict.

    Returns 1 if at least one case regressed, otherwise 0.
    """
    ap = argparse.ArgumentParser(description="Diff two eval result JSON files")
    ap.add_argument("baseline", help="baseline result JSON (before)")
    ap.add_argument("head", help="head result JSON (after)")
    ap.add_argument("--quiet", action="store_true", help="summary line only, no per-case detail")
    args = ap.parse_args()

    base = load(Path(args.baseline))
    head = load(Path(args.head))
    base_cases = {c["query"]: c for c in base.get("per_case", [])}
    head_cases = {c["query"]: c for c in head.get("per_case", [])}
    unmatched = [q for q in head_cases if q not in base_cases]
    # Baseline cases that vanished from head: surfaced in the report so that a
    # removed case cannot silently disappear from the comparison.
    dropped = [q for q in base_cases if q not in head_cases]

    def rank_val(r):
        # Lower rank number = better. A MISS (None) sorts after every real rank;
        # infinity avoids a finite sentinel that a deep rank could exceed.
        return float("inf") if r is None else r

    regressed, improved, stable_1, stable_other, stable_miss = [], [], [], [], []
    for q, bc in base_cases.items():
        hc = head_cases.get(q)
        if hc is None:
            continue  # listed under `dropped`
        br, hr = bc["hit_rank"], hc["hit_rank"]
        if rank_val(hr) > rank_val(br):
            regressed.append((q, bc, hc))
        elif rank_val(hr) < rank_val(br):
            improved.append((q, bc, hc))
        elif hr == 1:
            stable_1.append((q, bc, hc))
        elif hr is None:
            # MISS in both runs: stable, but not "at a rank".
            stable_miss.append((q, bc, hc))
        else:
            stable_other.append((q, bc, hc))
    total_stable = len(stable_1) + len(stable_other) + len(stable_miss)

    def fmt_case(q, bc, hc):
        intent = bc.get("intent", "?")
        br, hr = bc["hit_rank"], hc["hit_rank"]
        delta = f"{rank_label(br)}→{rank_label(hr)}"
        top_changed = bc.get("top_hit") != hc.get("top_hit")
        top_note = f"  top: {hc.get('top_hit', '?')}" if top_changed else ""
        return f"  [{intent:14}]  {delta:10}  {q[:70]}{top_note}"

    if not args.quiet:
        if regressed:
            print(f"\nREGRESSED ({len(regressed)}):")
            for item in regressed:
                print(fmt_case(*item))
        if improved:
            print(f"\nIMPROVED ({len(improved)}):")
            for item in improved:
                print(fmt_case(*item))
        print(
            f"\nSTABLE ({total_stable} cases: {len(stable_1)} at rank 1, "
            f"{len(stable_other)} at rank 2+, {len(stable_miss)} still MISS)"
        )
        if unmatched:
            print(f"\nNEW in head ({len(unmatched)} cases not in baseline):")
            for q in unmatched:
                hc = head_cases[q]
                print(f"  [{hc.get('intent','?'):14}]  {rank_label(hc['hit_rank']):10}  {q[:70]}")
        if dropped:
            print(f"\nDROPPED from head ({len(dropped)} baseline cases not in head):")
            for q in dropped:
                bc = base_cases[q]
                print(f"  [{bc.get('intent','?'):14}]  {rank_label(bc['hit_rank']):10}  {q[:70]}")

    def delta(key: str) -> str:
        b, h = base.get(key, 0.0), head.get(key, 0.0)
        d = h - b
        sign = f"+{d:.3f}" if d >= 0 else f"{d:.3f}"
        return f"{sign} ({b} → {h})"

    if regressed and improved:
        verdict = "MIXED"
    elif regressed:
        verdict = "REGRESSION"
    elif improved:
        verdict = "IMPROVEMENT"
    else:
        verdict = "IDENTICAL"

    print(f"\n{'─'*60}")
    print(f"  Δ hit@1 : {delta('hit@1')}")
    print(f"  Δ hit@3 : {delta('hit@3')}")
    print(f"  Δ hit@5 : {delta('hit@5')}")
    print(f"  Δ mrr   : {delta('mrr')}")
    print(f"  verdict : {verdict}  ({len(improved)} improved, {len(regressed)} regressed, {total_stable} stable)")
    print(f"{'─'*60}")
    return 1 if regressed else 0


if __name__ == "__main__":
    sys.exit(main())

hitgate/example_external_retriever.py ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""A minimal, dependency-free example retriever — the template for "bring your own".
+It implements the harness protocol:  retrieve(query, top, scope) -> list[dict].
+This one is deliberately dumb (ranks files by how many distinct query words they contain —
+no embeddings, no index) to prove the harness measures ANY retriever, not just the bundled
+hybrid one. Score it through the exact same eval:
+    RAG_SOURCE_ROOTS="$PWD" python -m hitgate.run --retriever hitgate.example_external_retriever:retrieve
+To wire your own retriever, copy this signature and return results ranked best-first, each a
+mapping with a "path" (and optionally "start_line"). See adapters/ for ecosystem wrappers.
+"""
+from __future__ import annotations
+import os
+import re
+from pathlib import Path
+_WORD = re.compile(r"[A-Za-z][A-Za-z0-9_]+")
+_CODE_EXTS = {".py", ".ts", ".tsx", ".js", ".jsx", ".mjs", ".sh"}
+_SKIP_PARTS = {".git", "node_modules", "venv", ".venv", ".rag-index", "__pycache__", "dist", "build"}
+def _roots() -> list[Path]:
+    raw = os.environ.get("RAG_SOURCE_ROOTS", "")
+    return [Path(p) for p in raw.split(os.pathsep) if p] or [Path.cwd()]
def retrieve(query: str, top: int, scope: str | None = None) -> list[dict]:
    """Rank files by how many distinct query terms appear in them. Best-first, top-`top`.

    Args:
        query: free text; its word-like tokens (lower-cased) are the search terms.
        top: maximum number of results; a value of 0 or less yields no results.
        scope: accepted for compatibility with the harness protocol; this example
            retriever does not use it.

    Returns:
        A list of ``{"path": <file path>, "start_line": 1}`` mappings, highest
        term count first. Files with equal counts are ordered by path so the
        ranking does not depend on filesystem traversal order.
    """
    if top <= 0:
        return []
    terms = {w.lower() for w in _WORD.findall(query)}
    if not terms:
        # No term can match, so no file can score: skip walking the tree.
        return []
    scored: list[tuple[int, str]] = []
    for root in _roots():
        for path in root.rglob("*"):
            if path.suffix not in _CODE_EXTS:
                continue
            # Test skip-directories on the part of the path BELOW the root. Using
            # the full path would discard every file when the root itself lives
            # under a directory named e.g. "build" or "dist".
            if _SKIP_PARTS & set(path.relative_to(root).parts):
                continue
            if not path.is_file():
                continue
            try:
                # Decode as UTF-8 regardless of the platform locale; undecodable
                # bytes are dropped rather than aborting the scan.
                text = path.read_text(encoding="utf-8", errors="ignore").lower()
            except OSError:
                continue
            score = sum(1 for t in terms if t in text)
            if score:
                scored.append((score, str(path)))
    # Highest score first; the path breaks ties deterministically.
    scored.sort(key=lambda s: (-s[0], s[1]))
    return [{"path": p, "start_line": 1} for _, p in scored[:top]]