PyPI - splitsmith - Versions diffs - 0.2.0__py3-none-any.whl - Mend

splitsmith 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

splitsmith/__init__.py +3 -0
splitsmith/audit.py +87 -0
splitsmith/automation.py +238 -0
splitsmith/backup.py +298 -0
splitsmith/beep_calibration.py +324 -0
splitsmith/beep_detect.py +371 -0
splitsmith/cleanup.py +327 -0
splitsmith/cli.py +1281 -0
splitsmith/coach.py +253 -0
splitsmith/coach_distributions.py +348 -0
splitsmith/compare/__init__.py +7 -0
splitsmith/compare/cli.py +153 -0
splitsmith/compare/emitter.py +456 -0
splitsmith/compare/filler.py +98 -0
splitsmith/compare/layout.py +164 -0
splitsmith/compare/manifest.py +91 -0
splitsmith/compare/project_loader.py +195 -0
splitsmith/composition.py +606 -0
splitsmith/config.py +442 -0
splitsmith/cross_align.py +210 -0
splitsmith/csv_gen.py +66 -0
splitsmith/data/ensemble_calibration.json +248 -0
splitsmith/data/fonts/Antonio-OFL.txt +93 -0
splitsmith/data/fonts/Antonio-VariableFont.ttf +0 -0
splitsmith/data/fonts/JetBrainsMono-Bold.ttf +0 -0
splitsmith/data/fonts/JetBrainsMono-OFL.txt +93 -0
splitsmith/data/overlay_theme.json +40 -0
splitsmith/data/templates/action-cut.yaml +19 -0
splitsmith/data/templates/match-recap.yaml +20 -0
splitsmith/data/voter_c_gbdt.joblib +0 -0
splitsmith/data/voter_e_visual_probe.joblib +0 -0
splitsmith/ensemble/__init__.py +67 -0
splitsmith/ensemble/agc_state.py +165 -0
splitsmith/ensemble/api.py +419 -0
splitsmith/ensemble/backend.py +89 -0
splitsmith/ensemble/calibration.py +367 -0
splitsmith/ensemble/clap_mel.py +138 -0
splitsmith/ensemble/features.py +680 -0
splitsmith/ensemble/fixtures.py +222 -0
splitsmith/ensemble/tta.py +115 -0
splitsmith/ensemble/visual.py +294 -0
splitsmith/ensemble/voters.py +202 -0
splitsmith/fcp7xml_render.py +558 -0
splitsmith/fcpxml_gen.py +1721 -0
splitsmith/fixture_schema.py +482 -0
splitsmith/lab/__init__.py +79 -0
splitsmith/lab/core.py +1118 -0
splitsmith/lab/promote.py +555 -0
splitsmith/lab/snap_window.py +331 -0
splitsmith/lab/sweeps.py +231 -0
splitsmith/lab_cli.py +750 -0
splitsmith/match_cli.py +315 -0
splitsmith/match_model.py +793 -0
splitsmith/match_registry.py +131 -0
splitsmith/mcp/__init__.py +23 -0
splitsmith/mcp/__main__.py +20 -0
splitsmith/mcp/detect_tools.py +476 -0
splitsmith/mcp/export_tools.py +356 -0
splitsmith/mcp/sandbox.py +77 -0
splitsmith/mcp/server.py +393 -0
splitsmith/mcp/tools.py +207 -0
splitsmith/mcp/write_tools.py +268 -0
splitsmith/model_cli.py +153 -0
splitsmith/models/__init__.py +40 -0
splitsmith/models/cache.py +139 -0
splitsmith/models/download.py +95 -0
splitsmith/models/errors.py +50 -0
splitsmith/models/manifest.py +68 -0
splitsmith/models/registry.py +256 -0
splitsmith/mp4_render.py +513 -0
splitsmith/overlay_render.py +817 -0
splitsmith/overlay_theme.py +146 -0
splitsmith/relink.py +245 -0
splitsmith/report.py +258 -0
splitsmith/runtime.py +268 -0
splitsmith/shot_detect.py +506 -0
splitsmith/shot_refine.py +252 -0
splitsmith/system_check.py +162 -0
splitsmith/templates.py +188 -0
splitsmith/thumbnail.py +230 -0
splitsmith/trim.py +211 -0
splitsmith/ui/__init__.py +10 -0
splitsmith/ui/audio.py +536 -0
splitsmith/ui/embedded.py +312 -0
splitsmith/ui/exports.py +533 -0
splitsmith/ui/jobs.py +652 -0
splitsmith/ui/logging_setup.py +108 -0
splitsmith/ui/match_exports.py +500 -0
splitsmith/ui/project.py +1734 -0
splitsmith/ui/scoreboard/__init__.py +77 -0
splitsmith/ui/scoreboard/cache.py +237 -0
splitsmith/ui/scoreboard/http.py +206 -0
splitsmith/ui/scoreboard/local.py +377 -0
splitsmith/ui/scoreboard/models.py +301 -0
splitsmith/ui/scoreboard/protocol.py +51 -0
splitsmith/ui/server.py +9178 -0
splitsmith/ui_static/package-lock.json +3062 -0
splitsmith/ui_static/tsconfig.app.tsbuildinfo +1 -0
splitsmith/ui_static/tsconfig.node.tsbuildinfo +1 -0
splitsmith/user_config.py +380 -0
splitsmith/video_match.py +159 -0
splitsmith/video_probe.py +143 -0
splitsmith/waveform.py +121 -0
splitsmith/youtube_sidecar.py +293 -0
splitsmith-0.2.0.dist-info/METADATA +301 -0
splitsmith-0.2.0.dist-info/RECORD +109 -0
splitsmith-0.2.0.dist-info/WHEEL +4 -0
splitsmith-0.2.0.dist-info/entry_points.txt +3 -0
splitsmith-0.2.0.dist-info/licenses/LICENSE +21 -0

splitsmith/beep_calibration.py ADDED Viewed

@@ -0,0 +1,324 @@
+"""Beep-detector calibration suite -- manifest, ground truth, eval aggregation.
+This module is the pure-data backbone of the layer-1 work for issue #220
+(``beep: improve detection accuracy``). The detector itself lives in
+``splitsmith.beep_detect``; this module only describes WHAT to detect and
+HOW to score the result.
+Two tracks share the suite:
+* **Clip track** -- the ~10-50 s WAV files already checked into
+  ``tests/fixtures/`` (post-trim, with 0.5 s or 5 s pre-beep padding).
+  Always available; covers the trivial-baseline case + handheld iPhone
+  clips with 5 s of pre-beep noise.
+* **Full track** -- the wide-window WAVs produced by
+  ``scripts/extract_full_fixture_audio.py`` under ``tests/fixtures/full/``.
+  Covers late-beep / cross-bay scenarios that don't appear in the trimmed
+  clips. Optional: only present when the source MP4s have been extracted
+  on this machine.
+The ``ground_truth_in_clip`` and ``ground_truth_in_full`` fields express
+the beep position in seconds within each respective WAV's coordinate
+frame. ``detect_beep`` returns clip-relative or full-relative depending
+on which audio buffer it's fed -- pick the matching ground truth.
+"""
+from __future__ import annotations
+import json
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from pathlib import Path
+from pydantic import BaseModel, Field
+# Tolerance the audit JSONs themselves use (``tolerance_ms``). A detected
+# beep is "correct" if it lands within this many ms of ground truth.
+DEFAULT_TOLERANCE_MS = 100.0
+# Heuristic thresholds applied during manifest build to seed failure-mode
+# tags. These are SUGGESTIONS the user can override by hand-editing the
+# ``tags`` list in ``manifest.yaml``.
+# Both beep thresholds are seconds into the full-track WAV and are
+# compared inclusively (``>=``) by ``auto_tags``.
+LATE_BEEP_THRESHOLD_S = 10.0
+VERY_LATE_BEEP_THRESHOLD_S = 30.0
+# Minimum combined ``plates`` + ``poppers`` count that earns ``steel-prone``.
+STEEL_PRONE_PLATES_THRESHOLD = 1  # any popper / plate count
+class BeepFixtureEntry(BaseModel):
+    """One fixture's calibration metadata.
+    The manifest is a list of these. Field semantics:
+    * ``stem`` -- audit JSON / WAV basename (no extension).
+    * ``camera_kind`` -- ``head`` for body-mounted cameras (Insta360 GO),
+      ``hand`` for handheld phones. The detector should be robust to both
+      but the failure-modes differ.
+    * ``camera_id`` -- the audit JSON's camera.id, kept for filtering.
+    * ``clip_wav`` -- path to the post-trim WAV. Relative to
+      ``tests/fixtures/``.
+    * ``ground_truth_in_clip`` -- beep time in seconds, relative to start
+      of ``clip_wav``. Pulled directly from the audit JSON's ``beep_time``.
+    * ``tolerance_ms`` -- scoring tolerance in milliseconds; defaults to
+      ``DEFAULT_TOLERANCE_MS``.
+    * ``full_wav`` / ``ground_truth_in_full`` / ``full_duration_s`` --
+      populated when ``scripts/extract_full_fixture_audio.py`` has been
+      run; ``full_wav`` is relative to ``tests/fixtures/``.
+    * ``tags`` -- failure-mode buckets. See module-level constants for
+      the heuristic auto-tags; humans can add finer tags (``cross-bay``,
+      ``steel-fp``, ``ro-chatter``, ...) by editing manifest.yaml.
+    """
+    # Required identity + clip-track fields.
+    stem: str
+    camera_kind: str
+    camera_id: str | None = None
+    clip_wav: str
+    ground_truth_in_clip: float
+    tolerance_ms: float = DEFAULT_TOLERANCE_MS
+    # Full-track fields: all optional, ``None`` until the full WAV exists.
+    full_wav: str | None = None
+    ground_truth_in_full: float | None = None
+    full_duration_s: float | None = None
+    # ``default_factory`` gives every entry its own list.
+    tags: list[str] = Field(default_factory=list)
+class BeepCalibrationManifest(BaseModel):
+    """Top-level manifest persisted to ``manifest.yaml``.
+    Holds a single key, ``fixtures``: the list of ``BeepFixtureEntry``
+    rows. A manifest built with no arguments has an empty list.
+    """
+    fixtures: list[BeepFixtureEntry] = Field(default_factory=list)
+@dataclass(frozen=True)
+class FixtureEvalResult:
+    """Outcome of running the detector against one fixture's audio buffer.
+    ``track`` distinguishes the clip vs full evaluation since the same
+    ``stem`` produces two rows when both wavs are present.
+    Instances are immutable (``frozen=True``); ``evaluate_detection`` is
+    the function in this module that builds them.
+    """
+    stem: str
+    track: str  # "clip" or "full"
+    tags: tuple[str, ...]
+    ground_truth_s: float
+    tolerance_s: float  # seconds (the manifest stores milliseconds)
+    detected_time_s: float | None
+    detected_score: float | None
+    error_s: float | None  # detected - ground_truth, None if missed
+    correct_top1: bool
+    correct_in_topn: bool  # any candidate within tolerance
+    # Winner plus runner-ups as counted by ``evaluate_detection``; 0 when
+    # nothing was detected.
+    candidate_count: int
+    detected_confidence: float | None = None
+    error_kind: str | None = None  # "not_found", "exception", or None
+@dataclass
+class EvalSummary:
+    """Aggregated eval result. Used to print the report and gate CI."""
+    total: int = 0
+    top1_hits: int = 0
+    topn_hits: int = 0
+    not_found: int = 0
+    exceptions: int = 0
+    by_tag: dict[str, EvalSummary] = field(default_factory=dict)
+    @property
+    def recall_top1(self) -> float:
+        return (self.top1_hits / self.total) if self.total else 0.0
+    @property
+    def recall_topn(self) -> float:
+        return (self.topn_hits / self.total) if self.total else 0.0
+def derive_camera_kind(camera_block: dict | None) -> str:
+    """Map an audit-JSON ``camera`` dict to ``head`` / ``hand`` / ``unknown``.
+    The audit schema uses ``mount`` = ``head`` | ``hand`` directly, but a
+    few legacy fixtures don't have the camera block at all -- treat those
+    as ``unknown`` so the manifest stays explicit instead of guessing.
+    """
+    if not isinstance(camera_block, dict):
+        return "unknown"
+    mount = camera_block.get("mount")
+    if mount in ("head", "hand"):
+        return mount
+    return "unknown"
+def auto_tags(
+    *,
+    camera_kind: str,
+    ground_truth_in_full: float | None,
+    stage_rounds: dict | None,
+) -> list[str]:
+    """Seed failure-mode tags from audit-JSON facts.
+    Always-applicable:
+    * ``handheld`` / ``headcam`` -- from camera mount.
+    Conditional (full-audio-only):
+    * ``late-beep`` -- beep > 10 s into source. Today's 30 s search cap
+      still catches it but silence-preference scoring can drift.
+    * ``very-late-beep`` -- beep > 30 s into source. Current detector
+      hard-fails on these (search window cap).
+    * ``steel-prone`` -- stage has poppers or plates; raises the chance
+      of a steel-ring false positive being scored above the beep.
+    """
+    tags: list[str] = []
+    if camera_kind == "head":
+        tags.append("headcam")
+    elif camera_kind == "hand":
+        tags.append("handheld")
+    if ground_truth_in_full is not None:
+        if ground_truth_in_full >= VERY_LATE_BEEP_THRESHOLD_S:
+            tags.append("very-late-beep")
+        elif ground_truth_in_full >= LATE_BEEP_THRESHOLD_S:
+            tags.append("late-beep")
+    if isinstance(stage_rounds, dict):
+        plates = int(stage_rounds.get("plates") or 0)
+        poppers = int(stage_rounds.get("poppers") or 0)
+        if plates + poppers >= STEEL_PRONE_PLATES_THRESHOLD:
+            tags.append("steel-prone")
+    return tags
+def compute_full_beep_time(
+    *,
+    fixture_window_in_source: tuple[float, float],
+    full_window_in_source: tuple[float, float],
+    clip_beep_time: float,
+) -> float:
+    """Translate the audit's clip-relative beep into the full-WAV's frame.
+    The audit JSON pins the beep within a TRIMMED clip whose start sits
+    at ``fixture_window_in_source[0]`` in source-time. The full WAV
+    starts at ``full_window_in_source[0]``. Both are seconds-into-source.
+    The beep position in the full WAV is therefore::
+        full_beep = (fws[0] - full[0]) + clip_beep_time
+    """
+    fws_start = fixture_window_in_source[0]
+    full_start = full_window_in_source[0]
+    return (fws_start - full_start) + clip_beep_time
+def evaluate_detection(
+    *,
+    stem: str,
+    track: str,
+    tags: Iterable[str],
+    ground_truth_s: float,
+    tolerance_ms: float,
+    detected_time_s: float | None,
+    detected_score: float | None,
+    detected_confidence: float | None = None,
+    candidate_times_s: Iterable[float] = (),
+    error_kind: str | None = None,
+) -> FixtureEvalResult:
+    """Score one detector call against the ground truth.
+    ``candidate_times_s`` are the runner-up candidate timestamps the
+    detector surfaced (``BeepDetection.candidates[1:].time``). They count
+    toward ``correct_in_topn`` even when the top-1 winner was wrong --
+    this is the signal that matters for the HITL flow (#219): if the
+    real beep is in the top-N list, the human can pick it without
+    typing a timestamp.
+    """
+    tol_s = tolerance_ms / 1000.0
+    if detected_time_s is None:
+        return FixtureEvalResult(
+            stem=stem,
+            track=track,
+            tags=tuple(tags),
+            ground_truth_s=ground_truth_s,
+            tolerance_s=tol_s,
+            detected_time_s=None,
+            detected_score=detected_score,
+            detected_confidence=detected_confidence,
+            error_s=None,
+            correct_top1=False,
+            correct_in_topn=False,
+            candidate_count=0,
+            error_kind=error_kind or "not_found",
+        )
+    error = detected_time_s - ground_truth_s
+    correct_top1 = abs(error) <= tol_s
+    candidates = list(candidate_times_s)
+    correct_in_topn = correct_top1 or any(abs(c - ground_truth_s) <= tol_s for c in candidates)
+    return FixtureEvalResult(
+        stem=stem,
+        track=track,
+        tags=tuple(tags),
+        ground_truth_s=ground_truth_s,
+        tolerance_s=tol_s,
+        detected_time_s=detected_time_s,
+        detected_score=detected_score,
+        detected_confidence=detected_confidence,
+        error_s=error,
+        correct_top1=correct_top1,
+        correct_in_topn=correct_in_topn,
+        candidate_count=len(candidates) + 1,
+        error_kind=error_kind,
+    )
+def summarize(results: Iterable[FixtureEvalResult]) -> EvalSummary:
+    """Aggregate per-fixture results into an overall + per-tag summary."""
+    overall = EvalSummary()
+    for r in results:
+        overall.total += 1
+        if r.correct_top1:
+            overall.top1_hits += 1
+        if r.correct_in_topn:
+            overall.topn_hits += 1
+        if r.error_kind == "not_found":
+            overall.not_found += 1
+        elif r.error_kind == "exception":
+            overall.exceptions += 1
+        for tag in r.tags:
+            bucket = overall.by_tag.setdefault(tag, EvalSummary())
+            bucket.total += 1
+            if r.correct_top1:
+                bucket.top1_hits += 1
+            if r.correct_in_topn:
+                bucket.topn_hits += 1
+            if r.error_kind == "not_found":
+                bucket.not_found += 1
+            elif r.error_kind == "exception":
+                bucket.exceptions += 1
+    return overall
+def load_manifest(path: Path) -> BeepCalibrationManifest:
+    """Read a manifest YAML. Missing file returns an empty manifest."""
+    import yaml
+    if not path.exists():
+        return BeepCalibrationManifest()
+    raw = yaml.safe_load(path.read_text()) or {}
+    return BeepCalibrationManifest.model_validate(raw)
+def save_manifest(manifest: BeepCalibrationManifest, path: Path) -> None:
+    """Write a manifest YAML in a stable, hand-editable format."""
+    import yaml
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = manifest.model_dump(exclude_none=True)
+    path.write_text(yaml.safe_dump(payload, sort_keys=False, indent=2))
+def fixtures_with_full_audio(
+    manifest: BeepCalibrationManifest,
+    fixtures_dir: Path,
+) -> list[BeepFixtureEntry]:
+    """Subset of the manifest where the full-track WAV exists on disk."""
+    rows = []
+    for entry in manifest.fixtures:
+        if not entry.full_wav:
+            continue
+        if (fixtures_dir / entry.full_wav).exists():
+            rows.append(entry)
+    return rows
+def read_audit_json(path: Path) -> dict:
+    """Thin wrapper -- only exists so tests can stub the read."""
+    return json.loads(path.read_text())

splitsmith/beep_detect.py ADDED Viewed

@@ -0,0 +1,371 @@
+"""Detect the start beep timestamp via bandpass + envelope peak detection.
+Strategy:
+1. Bandpass to ``[freq_min_hz, freq_max_hz]`` (typical shot-timer beep
+   2-5 kHz). Hilbert envelope, smoothed at ``envelope_smoothing_ms``
+   (40 ms by default -- wide enough to bridge the natural intra-beep
+   wobble, narrow enough to keep the 300-500 ms beep distinct from
+   sustained ambient noise).
+2. **Adaptive cutoff**: a candidate run must clear ``max(min_amplitude *
+   global_peak, noise_floor * noise_factor, min_abs_peak)``. The noise-
+   floor leg is what recovers handheld / phone clips where the beep is
+   faint in absolute terms but still 10x+ above the median noise floor.
+   ``global_peak`` is held in reserve for cases where a gunshot dominates
+   the band; ``min_abs_peak`` is a sub-noise sanity floor.
+3. **Composite scoring**: each candidate is ranked by
+   ``silence_score * tonal_factor * dur_factor`` where:
+   * ``silence_score = run_peak / (max envelope in pre-silence window)``.
+     IPSC beeps are preceded by ~3 s of "Are you ready / Stand by" + a
+     pause; mid-stage transients are not. Higher = quieter pre-roll.
+   * ``tonal_score = energy_in_3_kHz_band / energy_in_full_band``,
+     in [0, 1]. The IPSC timer emits a near-pure ~3.0-3.3 kHz tone;
+     gunshots, steel rings, and RO chatter spread energy across the
+     full 2-5 kHz band. ``tonal_weight`` controls how strongly this
+     component tilts the ranking (``tonal_factor``).
+   * ``dur_factor`` is a duration-match term: the run length is ramped
+     between ``dur_match_min_ms`` and ``dur_match_full_ms``, squared, and
+     blended in by ``dur_match_weight``.
+4. **Adaptive rise-foot leading edge**: walk backward from the run's peak
+   while the envelope stays above ``max(peak * RISE_FOOT_FRAC, noise_floor
+   * RISE_FOOT_NOISE_FACTOR)``. The noise-floor lower bound stops the
+   walk from sliding into pre-beep noise on faint beeps where 5 % of the
+   peak falls below the noise floor.
+This shares the "leading edge" definition with shot_detect: peak-relative
+when the beep is loud, noise-floor-relative when it isn't, insensitive
+to gain / distance / ambient noise, and lands at the visibly audible
+start of the rise.
+Pure function: takes audio + sample rate + config, returns a BeepDetection. No
+file I/O. ``load_audio`` is provided as a thin convenience for callers.
+"""
+from __future__ import annotations
+import math
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+from scipy.signal import butter, hilbert, sosfiltfilt
+from .config import BeepCandidate, BeepDetectConfig, BeepDetection
+# Rise-foot leading-edge parameters. Same definition as shot_detect (the
+# burst's own peak is the reference, so detection is insensitive to gain /
+# distance / ambient noise). Tied to the smoothed bandpass envelope -- the
+# tone's amplitude profile, not the raw oscillation. The noise-floor
+# multiplier kicks in when the burst is only marginally above the floor:
+# walking back to 5 % of a faint peak otherwise crosses into pre-beep noise.
+_RISE_FOOT_FRAC = 0.05
+_RISE_FOOT_NOISE_FACTOR = 1.5
+# Fine smoothing window applied to the rise-foot envelope only. Just enough
+# to suppress single-sample wobble; not so wide that it shifts the onset.
+_LEADING_EDGE_SMOOTHING_MS = 10.0
+# Confidence-formula weights. Empirically tuned against the labelled
+# calibration suite (issue #220 layer 3): the resulting distribution has
+# >=0.7 right ~95 % of the time and 0.5-0.7 sitting around chance, which
+# is the gap the HITL queue (#219) needs. Bump these only with paired
+# eval-set numbers in the commit; the threshold settings downstream
+# rely on the calibration holding.
+# The three weights sum to 1.0, so the blended quality stays in [0, 1].
+_CONFIDENCE_TONAL_WEIGHT = 0.45
+_CONFIDENCE_DURATION_WEIGHT = 0.30
+_CONFIDENCE_SILENCE_WEIGHT = 0.25
+# Silence-score saturation point: tanh(silence / SILENCE_SCALE) maps the
+# raw ratio to [0, 1]. 5x is "comfortably above pre-roll noise" -- below
+# it we're in steel-ring / mag-swap-quiet territory; above it the metric
+# is saturated.
+_CONFIDENCE_SILENCE_SCALE = 5.0
+# Duration normalisation: ramp from 0 at MIN_MS to 1 at FULL_MS. Slightly
+# wider than the ranking-side prior so a 250 ms beep still gets ~0.25
+# duration credit (it would land in HITL, which is the right call).
+_CONFIDENCE_DUR_MIN_MS = 200.0
+_CONFIDENCE_DUR_FULL_MS = 400.0
+# Margin tilt: the runner-up's score is folded in via
+# ``quality * (margin_floor + (1 - margin_floor) * margin)``. ``margin = 0``
+# (ties with runner-up) leaves only ``margin_floor`` of the quality
+# score; ``margin = 1`` (runner-up scores 0) preserves the full quality.
+_CONFIDENCE_MARGIN_FLOOR = 0.6
+def candidate_confidence(
+    *,
+    silence_score: float,
+    tonal_score: float,
+    duration_ms: float,
+    score: float,
+    runner_up_score: float,
+) -> float:
+    """Map per-candidate diagnostic features to a calibrated [0, 1].
+    The formula is a weighted blend of three quality components --
+    tonal purity, duration plausibility, saturating silence preference
+    -- multiplied by a margin tilt that demotes the winner when a
+    runner-up scores nearly as high. Calibration evidence lives in
+    ``tests/fixtures/beep_calibration/baseline.json``; bumping the
+    constants without re-checking the eval-set bins is asking for a
+    silent regression.
+    Pure function: no audio, no I/O. Tests cover the corner shapes
+    (peak winner, tied runner-up, sub-min duration, etc.).
+    """
+    tonal_norm = max(0.0, min(1.0, tonal_score))
+    dur_span = max(1.0, _CONFIDENCE_DUR_FULL_MS - _CONFIDENCE_DUR_MIN_MS)
+    dur_norm = max(0.0, min(1.0, (duration_ms - _CONFIDENCE_DUR_MIN_MS) / dur_span))
+    silence_norm = math.tanh(max(0.0, silence_score) / _CONFIDENCE_SILENCE_SCALE)
+    quality = (
+        _CONFIDENCE_TONAL_WEIGHT * tonal_norm
+        + _CONFIDENCE_DURATION_WEIGHT * dur_norm
+        + _CONFIDENCE_SILENCE_WEIGHT * silence_norm
+    )
+    if score > 0.0:
+        margin = max(0.0, min(1.0, 1.0 - runner_up_score / score))
+    else:
+        margin = 0.0
+    margin_factor = _CONFIDENCE_MARGIN_FLOOR + (1.0 - _CONFIDENCE_MARGIN_FLOOR) * margin
+    return max(0.0, min(1.0, quality * margin_factor))
+class BeepNotFoundError(RuntimeError):
+    """No beep candidate met the duration + amplitude criteria.
+    ``detect_beep`` also raises this when the search window is empty or
+    the beep band contains no energy at all.
+    """
+def load_audio(path: Path) -> tuple[np.ndarray, int]:
+    """Load an audio file and return (mono float32 samples, sample rate)."""
+    data, sr = sf.read(path, always_2d=False)
+    if data.ndim > 1:
+        data = data.mean(axis=1)
+    return data.astype(np.float32, copy=False), int(sr)
+def _bandpass_envelope(
+    audio: np.ndarray, sample_rate: int, lo: float, hi: float, smoothing_ms: float
+) -> np.ndarray:
+    """4th-order Butterworth bandpass + Hilbert envelope + moving-average smooth."""
+    sos = butter(4, [lo, hi], btype="band", fs=sample_rate, output="sos")
+    band = sosfiltfilt(sos, audio)
+    env = np.abs(hilbert(band)).astype(np.float32)
+    smooth_win = max(1, int(round(sample_rate * smoothing_ms / 1000.0)))
+    if smooth_win > 1:
+        kernel = np.ones(smooth_win, dtype=np.float32) / smooth_win
+        env = np.convolve(env, kernel, mode="same")
+    return env
+def detect_beep(
+    audio: np.ndarray,
+    sample_rate: int,
+    config: BeepDetectConfig,
+) -> BeepDetection:
+    """Locate the start beep in ``audio`` and return its leading-edge timestamp.
+    Raises ``BeepNotFoundError`` if no candidate satisfies the duration/amplitude
+    thresholds.
+    """
+    if audio.ndim != 1:
+        raise ValueError("audio must be 1-D (mono); mix down before calling detect_beep")
+    if audio.size == 0:
+        raise ValueError("audio is empty")
+    # Limit the search to the configured leading window. This prevents mid-
+    # stage low-activity moments from out-scoring the real beep on silence
+    # preference alone (e.g. a steel ring after a long reload, late in the
+    # stage, can have lower pre-window energy than the beep itself).
+    if config.search_window_s and config.search_window_s > 0:
+        search_hi = min(audio.size, int(round(config.search_window_s * sample_rate)))
+        audio = audio[:search_hi]
+        if audio.size == 0:
+            raise BeepNotFoundError("search window is empty")
+    # Two envelopes: coarse (40 ms) for run detection / scoring, fine (10
+    # ms) for rise-foot leading-edge timing. The coarse envelope bridges
+    # intra-beep dips; the fine envelope keeps the leading-edge sample
+    # accurate -- a wide moving-average smear shifts the apparent onset
+    # earlier by ~half the smoothing window, which would otherwise blow
+    # the ~15 ms tolerance the audit JSONs use.
+    env = _bandpass_envelope(
+        audio,
+        sample_rate,
+        config.freq_min_hz,
+        config.freq_max_hz,
+        config.envelope_smoothing_ms,
+    )
+    env_fine = _bandpass_envelope(
+        audio,
+        sample_rate,
+        config.freq_min_hz,
+        config.freq_max_hz,
+        _LEADING_EDGE_SMOOTHING_MS,
+    )
+    peak_value = float(env.max())
+    if peak_value <= 0.0:
+        raise BeepNotFoundError("flat audio: no energy in beep band")
+    # Noise floor = median of the smoothed envelope. Robust to gunshots /
+    # steel rings (a few high samples don't move the median) and to long
+    # quiet leads (most samples are near-silent so median stays small).
+    noise_floor = float(np.median(env))
+    # Effective cutoff: see ``BeepDetectConfig`` -- three legs, take the max.
+    cutoff = max(
+        config.min_amplitude * peak_value,
+        config.noise_floor_factor * noise_floor,
+        config.min_abs_peak,
+    )
+    above = env >= cutoff
+    edges = np.diff(above.astype(np.int8), prepend=0, append=0)
+    starts = np.flatnonzero(edges == 1)
+    ends = np.flatnonzero(edges == -1)  # exclusive
+    min_run_samples = int(round(sample_rate * config.min_duration_ms / 1000.0))
+    pre_window_samples = int(round(sample_rate * config.silence_window_s))
+    pre_skip_samples = int(round(sample_rate * config.silence_pre_skip_s))
+    # Tonal-quality envelope: same audio, narrower bandpass around the
+    # IPSC timer fundamental. We compare run-window energy in this band
+    # against the wider band envelope above.
+    tonal_env = _bandpass_envelope(
+        audio,
+        sample_rate,
+        config.tonal_band_lo_hz,
+        config.tonal_band_hi_hz,
+        config.envelope_smoothing_ms,
+    )
+    candidates: list[tuple[int, int, float, float, float, float]] = []
+    for s, e in zip(starts, ends, strict=True):
+        if (e - s) < min_run_samples:
+            continue
+        run_peak = float(env[s:e].max())
+        # Silence-preference uses the MAX of the pre-window envelope, not
+        # its mean. Mean-based scoring let mid-stage candidates beat the
+        # real beep when the pre-window happened to span a magazine swap
+        # or a brief lull between shots: the lull dragged the mean down
+        # even when the window also contained one or two loud transients.
+        # Max-based scoring asks "is there anything else loud in recent
+        # past?" -- a real beep has a clean pre-roll, so the answer is no.
+        #
+        # Candidates near t=0 don't have a full pre-window. The metric is
+        # undefined for them, so we substitute a neutral 1.0: tonal +
+        # duration must do the discrimination instead. Otherwise a
+        # truncated-pre-window candidate gets ``peak / noise_floor``,
+        # which beats real beeps whose pre-window contains RO chatter.
+        pre_hi = max(0, s - pre_skip_samples)
+        pre_lo = max(0, pre_hi - pre_window_samples)
+        available_pre_s = (pre_hi - pre_lo) / sample_rate
+        if available_pre_s < config.min_pre_window_s:
+            silence_score = 1.0
+        else:
+            pre_max = float(env[pre_lo:pre_hi].max())
+            pre_max = max(pre_max, noise_floor)
+            silence_score = run_peak / (pre_max + 1e-6)
+        # Tonal concentration: energy in the IPSC fundamental band over
+        # energy in the full search band, computed on the smoothed envelope.
+        # Sums over a few hundred ms of run samples are stable; a single-
+        # sample peak ratio would be noisy.
+        wide_energy = float(np.sum(env[s:e]))
+        narrow_energy = float(np.sum(tonal_env[s:e]))
+        tonal_ratio = narrow_energy / (wide_energy + 1e-6)
+        tonal_ratio = max(0.0, min(1.0, tonal_ratio))
+        # Composite score: silence-preference, modulated by tonal quality
+        # AND duration-match. tonal_weight=0 + dur_match_weight=0 falls
+        # back to legacy silence-only behaviour.
+        weight = max(0.0, min(1.0, config.tonal_weight))
+        tonal_factor = (1.0 - weight) + weight * tonal_ratio
+        # Duration-match factor: ramp from 0 at min_ms to 1 at full_ms,
+        # squared to make the penalty bite harder on short transients.
+        # A 168 ms shot (typical post-smoothing length) lands at
+        # ((168-150)/150)^2 = 0.014; a 340 ms beep at 1.0. The squaring
+        # is what actually demotes shots whose pre-window happens to be
+        # quiet (magazine-swap lulls etc.) -- silence-preference alone
+        # can't tell those from the real beep.
+        dur_ms = (e - s) * 1000.0 / sample_rate
+        span_ms = max(1.0, config.dur_match_full_ms - config.dur_match_min_ms)
+        dur_ratio = max(0.0, min(1.0, (dur_ms - config.dur_match_min_ms) / span_ms))
+        dur_weight = max(0.0, min(1.0, config.dur_match_weight))
+        dur_factor = (1.0 - dur_weight) + dur_weight * dur_ratio * dur_ratio
+        score = silence_score * tonal_factor * dur_factor
+        candidates.append((s, e, run_peak, score, silence_score, tonal_ratio))
+    if not candidates:
+        raise BeepNotFoundError(
+            f"no beep candidate of >={config.min_duration_ms} ms above "
+            f"cutoff {cutoff:.4f} (peak={peak_value:.4f}, "
+            f"noise_floor={noise_floor:.4f}) in [{config.freq_min_hz}, "
+            f"{config.freq_max_hz}] Hz"
+        )
+    # Rank by composite score (highest first). Compute the rise-foot
+    # leading edge for every candidate so the UI can show alternatives
+    # without a second pass.
+    ranked = sorted(candidates, key=lambda c: c[3], reverse=True)
+    runner_up_score = ranked[1][3] if len(ranked) > 1 else 0.0
+    ranked_models: list[BeepCandidate] = []
+    for run_start, run_end, run_peak, score, silence_score, tonal_ratio in ranked:
+        leading_idx = _rise_foot_leading_edge(env_fine, run_start, run_end, noise_floor)
+        duration_ms = (run_end - run_start) * 1000.0 / sample_rate
+        # Confidence uses the GLOBAL runner-up's score for every
+        # candidate, not the next-lower in the sorted list. The HITL
+        # protocol cares about "is the winner clearly better than the
+        # next-best alternative?"; a runner-up's own confidence is
+        # mostly informational so the UI can colour the chip.
+        confidence = candidate_confidence(
+            silence_score=silence_score,
+            tonal_score=tonal_ratio,
+            duration_ms=duration_ms,
+            score=score,
+            runner_up_score=runner_up_score,
+        )
+        ranked_models.append(
+            BeepCandidate(
+                time=leading_idx / sample_rate,
+                score=score,
+                peak_amplitude=run_peak,
+                duration_ms=duration_ms,
+                silence_score=silence_score,
+                tonal_score=tonal_ratio,
+                confidence=confidence,
+            )
+        )
+    top_n = config.top_n_candidates if config.top_n_candidates > 0 else 1
+    surfaced = ranked_models[:top_n]
+    winner = ranked_models[0]
+    return BeepDetection(
+        time=winner.time,
+        peak_amplitude=winner.peak_amplitude,
+        duration_ms=winner.duration_ms,
+        confidence=winner.confidence,
+        candidates=surfaced,
+    )
+def _rise_foot_leading_edge(env: np.ndarray, run_start: int, run_end: int, noise_floor: float) -> int:
+    """Rise-foot of the tone: walk backward from the envelope peak (within the
+    strong run) while the envelope stays at or above ``max(peak *
+    RISE_FOOT_FRAC, noise_floor * RISE_FOOT_NOISE_FACTOR)``. The earliest
+    such sample is the foot of the rise.
+    The noise-floor lower bound prevents the walk from continuing into
+    pre-beep silence on faint beeps where 5 % of the peak falls below the
+    median noise floor (e.g. iPhone handheld clips with ~10x SNR).
+    """
+    if run_end <= run_start:
+        return run_start
+    peak_offset = int(np.argmax(env[run_start:run_end]))
+    peak_idx = run_start + peak_offset
+    peak = float(env[peak_idx])
+    if peak <= 0.0:
+        return run_start
+    foot = max(peak * _RISE_FOOT_FRAC, noise_floor * _RISE_FOOT_NOISE_FACTOR)
+    i = peak_idx
+    while i > 0 and env[i - 1] >= foot:
+        i -= 1
+    return i